blob: 2a6a087bd11f8d3f95c4c7fa82b11df7cc5c2e0e [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Eric Smith8c663262007-08-25 02:26:07 +000048#include "formatter_unicode.h"
49
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000050#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000051#include <windows.h>
52#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000053
Guido van Rossumd57fd912000-03-10 22:53:23 +000054/* Limit for the Unicode object free list */
55
56#define MAX_UNICODE_FREELIST_SIZE 1024
57
58/* Limit for the Unicode object free list stay alive optimization.
59
60 The implementation will keep allocated Unicode memory intact for
61 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000062 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000063
Barry Warsaw51ac5802000-03-20 16:36:48 +000064 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000065 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000066 malloc()-overhead) bytes of unused garbage.
67
68 Setting the limit to 0 effectively turns the feature off.
69
Guido van Rossumfd4b9572000-04-10 13:51:10 +000070 Note: This is an experimental feature ! If you get core dumps when
71 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000072
73*/
74
Guido van Rossumfd4b9572000-04-10 13:51:10 +000075#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000076
77/* Endianness switches; defaults to little endian */
78
79#ifdef WORDS_BIGENDIAN
80# define BYTEORDER_IS_BIG_ENDIAN
81#else
82# define BYTEORDER_IS_LITTLE_ENDIAN
83#endif
84
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000085/* --- Globals ------------------------------------------------------------
86
87 The globals are initialized by the _PyUnicode_Init() API and should
88 not be used before calling that API.
89
90*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000091
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000092
93#ifdef __cplusplus
94extern "C" {
95#endif
96
Walter Dörwald16807132007-05-25 13:52:07 +000097/* This dictionary holds all interned unicode strings. Note that references
98 to strings in this dictionary are *not* counted in the string's ob_refcnt.
99 When the interned string reaches a refcnt of 0 the string deallocation
100 function will delete the reference from this dictionary.
101
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->ob_sstate?2:0)
104*/
105static PyObject *interned;
106
Guido van Rossumd57fd912000-03-10 22:53:23 +0000107/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000108static PyUnicodeObject *unicode_freelist;
109static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000110
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000111/* The empty Unicode object is shared to improve performance. */
112static PyUnicodeObject *unicode_empty;
113
114/* Single character Unicode strings in the Latin-1 range are being
115 shared as well. */
116static PyUnicodeObject *unicode_latin1[256];
117
Fred Drakee4315f52000-05-09 19:53:39 +0000118/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000119 parameter; it is fixed to "utf-8". Always use the
120 PyUnicode_GetDefaultEncoding() API to access this global. */
121static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000122
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000123Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000124PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000125{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000126#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000127 return 0x10FFFF;
128#else
129 /* This is actually an illegal character, so it should
130 not be passed to unichr. */
131 return 0xFFFF;
132#endif
133}
134
Thomas Wouters477c8d52006-05-27 19:21:47 +0000135/* --- Bloom Filters ----------------------------------------------------- */
136
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, with the 5 least
   significant bits of each Unicode character as the bit index. */
140
141/* the linebreak mask is set up by Unicode_Init below */
142
/* Integer type that holds a bloom bitmask. */
#define BLOOM_MASK unsigned long

/* Bloom mask used by BLOOM_LINEBREAK below. */
static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by the low 5 bits of `ch` is set in `mask`.
   The shifted constant is unsigned long (1UL) because `1 << 31` overflows
   a 32-bit int, which is undefined behavior; `mask` is parenthesized so
   that compound argument expressions are tested as a whole. */
#define BLOOM(mask, ch) (((mask) & (1UL << ((ch) & 0x1F))))

/* Cheap pre-filter through the bloom mask, then the exact line break
   test. */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
151
152Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
153{
154 /* calculate simple bloom-style bitmask for a given unicode string */
155
156 long mask;
157 Py_ssize_t i;
158
159 mask = 0;
160 for (i = 0; i < len; i++)
161 mask |= (1 << (ptr[i] & 0x1F));
162
163 return mask;
164}
165
166Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
167{
168 Py_ssize_t i;
169
170 for (i = 0; i < setlen; i++)
171 if (set[i] == chr)
172 return 1;
173
174 return 0;
175}
176
/* Exact membership test for `chr` in `set`, pre-filtered through the bloom
   mask.  The whole expansion is parenthesized so the macro behaves as a
   single operand (e.g. under `!` or next to `||`). */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM((mask), (chr)) && unicode_member((chr), (set), (setlen)))
179
Guido van Rossumd57fd912000-03-10 22:53:23 +0000180/* --- Unicode Object ----------------------------------------------------- */
181
182static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000183int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000184 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000185{
186 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000187
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000188 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000190 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000191
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000192 /* Resizing shared object (unicode_empty or single character
193 objects) in-place is not allowed. Use PyUnicode_Resize()
194 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000195
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000196 if (unicode == unicode_empty ||
197 (unicode->length == 1 &&
198 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000200 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000201 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 return -1;
203 }
204
Thomas Wouters477c8d52006-05-27 19:21:47 +0000205 /* We allocate one more byte to make sure the string is Ux0000 terminated.
206 The overallocation is also used by fastsearch, which assumes that it's
207 safe to look at str[length] (without making any assumptions about what
208 it contains). */
209
Guido van Rossumd57fd912000-03-10 22:53:23 +0000210 oldstr = unicode->str;
211 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
212 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000213 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000214 PyErr_NoMemory();
215 return -1;
216 }
217 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000218 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000219
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000220 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000221 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000222 if (unicode->defenc) {
223 Py_DECREF(unicode->defenc);
224 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000225 }
226 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000227
Guido van Rossumd57fd912000-03-10 22:53:23 +0000228 return 0;
229}
230
/* We allocate one more Py_UNICODE element to make sure the string is
   Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000234
235 XXX This allocator could further be enhanced by assuring that the
236 free list never reduces its size below 1.
237
238*/
239
240static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000241PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242{
243 register PyUnicodeObject *unicode;
244
Thomas Wouters477c8d52006-05-27 19:21:47 +0000245 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000246 if (length == 0 && unicode_empty != NULL) {
247 Py_INCREF(unicode_empty);
248 return unicode_empty;
249 }
250
251 /* Unicode freelist & memory allocation */
252 if (unicode_freelist) {
253 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000254 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000255 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000256 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000257 /* Keep-Alive optimization: we only upsize the buffer,
258 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000259 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000260 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000261 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000262 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 }
264 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000265 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000267 }
268 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000269 }
270 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000271 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000272 if (unicode == NULL)
273 return NULL;
274 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
275 }
276
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000277 if (!unicode->str) {
278 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000279 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000280 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000281 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000282 * the caller fails before initializing str -- unicode_resize()
283 * reads str[0], and the Keep-Alive optimization can keep memory
284 * allocated for str alive across a call to unicode_dealloc(unicode).
285 * We don't want unicode_resize to read uninitialized memory in
286 * that case.
287 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000288 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000289 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000290 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000292 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000293 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000294 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000295
296 onError:
297 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000298 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000299 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000300}
301
302static
Guido van Rossum9475a232001-10-05 20:51:39 +0000303void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000304{
Walter Dörwald16807132007-05-25 13:52:07 +0000305 switch (PyUnicode_CHECK_INTERNED(unicode)) {
306 case SSTATE_NOT_INTERNED:
307 break;
308
309 case SSTATE_INTERNED_MORTAL:
310 /* revive dead object temporarily for DelItem */
Martin v. Löwis5d7428b2007-07-21 18:47:48 +0000311 Py_Refcnt(unicode) = 3;
Walter Dörwald16807132007-05-25 13:52:07 +0000312 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
313 Py_FatalError(
314 "deletion of interned unicode string failed");
315 break;
316
317 case SSTATE_INTERNED_IMMORTAL:
318 Py_FatalError("Immortal interned unicode string died.");
319
320 default:
321 Py_FatalError("Inconsistent interned unicode string state.");
322 }
323
Guido van Rossum604ddf82001-12-06 20:03:56 +0000324 if (PyUnicode_CheckExact(unicode) &&
325 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000326 /* Keep-Alive optimization */
327 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000328 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000329 unicode->str = NULL;
330 unicode->length = 0;
331 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000332 if (unicode->defenc) {
333 Py_DECREF(unicode->defenc);
334 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000335 }
336 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000337 *(PyUnicodeObject **)unicode = unicode_freelist;
338 unicode_freelist = unicode;
339 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000340 }
341 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000342 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000343 Py_XDECREF(unicode->defenc);
Martin v. Löwis9f2e3462007-07-21 17:22:18 +0000344 Py_Type(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000345 }
346}
347
Martin v. Löwis18e16552006-02-15 17:27:45 +0000348int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000349{
350 register PyUnicodeObject *v;
351
352 /* Argument checks */
353 if (unicode == NULL) {
354 PyErr_BadInternalCall();
355 return -1;
356 }
357 v = (PyUnicodeObject *)*unicode;
Martin v. Löwis9f2e3462007-07-21 17:22:18 +0000358 if (v == NULL || !PyUnicode_Check(v) || Py_Refcnt(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000359 PyErr_BadInternalCall();
360 return -1;
361 }
362
363 /* Resizing unicode_empty and single character objects is not
364 possible since these are being shared. We simply return a fresh
365 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000366 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000367 (v == unicode_empty || v->length == 1)) {
368 PyUnicodeObject *w = _PyUnicode_New(length);
369 if (w == NULL)
370 return -1;
371 Py_UNICODE_COPY(w->str, v->str,
372 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000373 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000374 *unicode = (PyObject *)w;
375 return 0;
376 }
377
378 /* Note that we don't have to modify *unicode for unshared Unicode
379 objects, since we can modify them in-place. */
380 return unicode_resize(v, length);
381}
382
383/* Internal API for use in unicodeobject.c only ! */
/* Convenience wrapper around PyUnicode_Resize() that casts the address of
   a PyUnicodeObject* variable to PyObject**.  Both arguments are
   parenthesized so that expression arguments are passed through intact. */
#define _PyUnicode_Resize(unicodevar, length) \
        PyUnicode_Resize(((PyObject **)(unicodevar)), (length))
386
Guido van Rossumd57fd912000-03-10 22:53:23 +0000387PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000388 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389{
390 PyUnicodeObject *unicode;
391
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000392 /* If the Unicode data is known at construction time, we can apply
393 some optimizations which share commonly used objects. */
394 if (u != NULL) {
395
396 /* Optimization for empty strings */
397 if (size == 0 && unicode_empty != NULL) {
398 Py_INCREF(unicode_empty);
399 return (PyObject *)unicode_empty;
400 }
401
402 /* Single character Unicode objects in the Latin-1 range are
403 shared when using this constructor */
404 if (size == 1 && *u < 256) {
405 unicode = unicode_latin1[*u];
406 if (!unicode) {
407 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000408 if (!unicode)
409 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000410 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000411 unicode_latin1[*u] = unicode;
412 }
413 Py_INCREF(unicode);
414 return (PyObject *)unicode;
415 }
416 }
Tim Petersced69f82003-09-16 20:30:58 +0000417
Guido van Rossumd57fd912000-03-10 22:53:23 +0000418 unicode = _PyUnicode_New(size);
419 if (!unicode)
420 return NULL;
421
422 /* Copy the Unicode data into the new object */
423 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000424 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000425
426 return (PyObject *)unicode;
427}
428
Walter Dörwaldd2034312007-05-18 16:29:38 +0000429PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000430{
431 PyUnicodeObject *unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000432 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000433 some optimizations which share commonly used objects.
434 Also, this means the input must be UTF-8, so fall back to the
435 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000436 if (u != NULL) {
437
438 /* Optimization for empty strings */
439 if (size == 0 && unicode_empty != NULL) {
440 Py_INCREF(unicode_empty);
441 return (PyObject *)unicode_empty;
442 }
443
Martin v. Löwis9c121062007-08-05 20:26:11 +0000444 /* Single characters are shared when using this constructor.
445 Restrict to ASCII, since the input must be UTF-8. */
446 if (size == 1 && Py_CHARMASK(*u) < 128) {
Guido van Rossum00058aa2007-07-19 18:21:28 +0000447 unicode = unicode_latin1[Py_CHARMASK(*u)];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000448 if (!unicode) {
449 unicode = _PyUnicode_New(1);
450 if (!unicode)
451 return NULL;
Guido van Rossum00058aa2007-07-19 18:21:28 +0000452 unicode->str[0] = Py_CHARMASK(*u);
453 unicode_latin1[Py_CHARMASK(*u)] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000454 }
455 Py_INCREF(unicode);
456 return (PyObject *)unicode;
457 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000458
459 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000460 }
461
Walter Dörwald55507312007-05-18 13:12:10 +0000462 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000463 if (!unicode)
464 return NULL;
465
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000466 return (PyObject *)unicode;
467}
468
Walter Dörwaldd2034312007-05-18 16:29:38 +0000469PyObject *PyUnicode_FromString(const char *u)
470{
471 size_t size = strlen(u);
472 if (size > PY_SSIZE_T_MAX) {
473 PyErr_SetString(PyExc_OverflowError, "input too long");
474 return NULL;
475 }
476
477 return PyUnicode_FromStringAndSize(u, size);
478}
479
Guido van Rossumd57fd912000-03-10 22:53:23 +0000480#ifdef HAVE_WCHAR_H
481
482PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000483 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000484{
485 PyUnicodeObject *unicode;
486
487 if (w == NULL) {
488 PyErr_BadInternalCall();
489 return NULL;
490 }
491
492 unicode = _PyUnicode_New(size);
493 if (!unicode)
494 return NULL;
495
496 /* Copy the wchar_t data into the new object */
497#ifdef HAVE_USABLE_WCHAR_T
498 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000499#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000500 {
501 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000502 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000503 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000504 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000505 *u++ = *w++;
506 }
507#endif
508
509 return (PyObject *)unicode;
510}
511
Walter Dörwald346737f2007-05-31 10:44:43 +0000512static void
513makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
514{
515 *fmt++ = '%';
516 if (width) {
517 if (zeropad)
518 *fmt++ = '0';
519 fmt += sprintf(fmt, "%d", width);
520 }
521 if (precision)
522 fmt += sprintf(fmt, ".%d", precision);
523 if (longflag)
524 *fmt++ = 'l';
525 else if (size_tflag) {
526 char *f = PY_FORMAT_SIZE_T;
527 while (*f)
528 *fmt++ = *f++;
529 }
530 *fmt++ = c;
531 *fmt = '\0';
532}
533
/* Append the NUL-terminated char string `string` to the output position
   `s`, advancing `s`.  Relies on `copy` (scratch pointer) and `s` (output
   cursor) being declared in the enclosing scope.  Wrapped in
   do { } while (0) so that `appendstring(x);` is a single statement, also
   in unbraced if/else bodies. */
#define appendstring(string) \
    do { for (copy = (string); *copy;) *s++ = *copy++; } while (0)
535
536PyObject *
537PyUnicode_FromFormatV(const char *format, va_list vargs)
538{
539 va_list count;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000540 Py_ssize_t callcount = 0;
541 PyObject **callresults = NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +0000542 PyObject **callresult = NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000543 Py_ssize_t n = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000544 int width = 0;
545 int precision = 0;
546 int zeropad;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000547 const char* f;
548 Py_UNICODE *s;
549 PyObject *string;
550 /* used by sprintf */
551 char buffer[21];
Walter Dörwald346737f2007-05-31 10:44:43 +0000552 /* use abuffer instead of buffer, if we need more space
553 * (which can happen if there's a format specifier with width). */
554 char *abuffer = NULL;
555 char *realbuffer;
556 Py_ssize_t abuffersize = 0;
557 char fmt[60]; /* should be enough for %0width.precisionld */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000558 const char *copy;
559
560#ifdef VA_LIST_IS_ARRAY
561 Py_MEMCPY(count, vargs, sizeof(va_list));
562#else
563#ifdef __va_copy
564 __va_copy(count, vargs);
565#else
566 count = vargs;
567#endif
568#endif
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000569 /* step 1: count the number of %S/%R format specifications
570 * (we call PyObject_Unicode()/PyObject_Repr() for these objects
571 * once during step 3 and put the result in an array) */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000572 for (f = format; *f; f++) {
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000573 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000574 ++callcount;
575 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000576 /* step 2: allocate memory for the results of
577 * PyObject_Unicode()/PyObject_Repr() calls */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000578 if (callcount) {
579 callresults = PyMem_Malloc(sizeof(PyObject *)*callcount);
580 if (!callresults) {
581 PyErr_NoMemory();
582 return NULL;
583 }
584 callresult = callresults;
585 }
586 /* step 3: figure out how large a buffer we need */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000587 for (f = format; *f; f++) {
588 if (*f == '%') {
589 const char* p = f;
Walter Dörwald346737f2007-05-31 10:44:43 +0000590 width = 0;
591 while (isdigit(Py_CHARMASK(*f)))
592 width = (width*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000593 while (*++f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
594 ;
595
596 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
597 * they don't affect the amount of space we reserve.
598 */
599 if ((*f == 'l' || *f == 'z') &&
600 (f[1] == 'd' || f[1] == 'u'))
Eric Smithddd25822007-08-27 11:33:42 +0000601 ++f;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000602
603 switch (*f) {
604 case 'c':
605 (void)va_arg(count, int);
606 /* fall through... */
607 case '%':
608 n++;
609 break;
610 case 'd': case 'u': case 'i': case 'x':
611 (void) va_arg(count, int);
612 /* 20 bytes is enough to hold a 64-bit
613 integer. Decimal takes the most space.
Walter Dörwald346737f2007-05-31 10:44:43 +0000614 This isn't enough for octal.
615 If a width is specified we need more
616 (which we allocate later). */
617 if (width < 20)
618 width = 20;
619 n += width;
620 if (abuffersize < width)
621 abuffersize = width;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000622 break;
623 case 's':
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000624 {
625 /* UTF-8 */
626 unsigned char*s;
627 s = va_arg(count, unsigned char*);
628 while (*s) {
629 if (*s < 128) {
630 n++; s++;
631 } else if (*s < 0xc0) {
632 /* invalid UTF-8 */
633 n++; s++;
634 } else if (*s < 0xc0) {
635 n++;
636 s++; if(!*s)break;
637 s++;
638 } else if (*s < 0xe0) {
639 n++;
640 s++; if(!*s)break;
641 s++; if(!*s)break;
642 s++;
643 } else {
644 #ifdef Py_UNICODE_WIDE
645 n++;
646 #else
647 n+=2;
648 #endif
649 s++; if(!*s)break;
650 s++; if(!*s)break;
651 s++; if(!*s)break;
652 s++;
653 }
654 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000655 break;
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000656 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000657 case 'U':
658 {
659 PyObject *obj = va_arg(count, PyObject *);
660 assert(obj && PyUnicode_Check(obj));
661 n += PyUnicode_GET_SIZE(obj);
662 break;
663 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000664 case 'V':
665 {
666 PyObject *obj = va_arg(count, PyObject *);
667 const char *str = va_arg(count, const char *);
668 assert(obj || str);
669 assert(!obj || PyUnicode_Check(obj));
670 if (obj)
671 n += PyUnicode_GET_SIZE(obj);
672 else
673 n += strlen(str);
674 break;
675 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000676 case 'S':
677 {
678 PyObject *obj = va_arg(count, PyObject *);
679 PyObject *str;
680 assert(obj);
681 str = PyObject_Unicode(obj);
682 if (!str)
683 goto fail;
684 n += PyUnicode_GET_SIZE(str);
685 /* Remember the str and switch to the next slot */
686 *callresult++ = str;
687 break;
688 }
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000689 case 'R':
690 {
691 PyObject *obj = va_arg(count, PyObject *);
692 PyObject *repr;
693 assert(obj);
694 repr = PyObject_Repr(obj);
695 if (!repr)
696 goto fail;
697 n += PyUnicode_GET_SIZE(repr);
698 /* Remember the repr and switch to the next slot */
699 *callresult++ = repr;
700 break;
701 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000702 case 'p':
703 (void) va_arg(count, int);
704 /* maximum 64-bit pointer representation:
705 * 0xffffffffffffffff
706 * so 19 characters is enough.
707 * XXX I count 18 -- what's the extra for?
708 */
709 n += 19;
710 break;
711 default:
712 /* if we stumble upon an unknown
713 formatting code, copy the rest of
714 the format string to the output
715 string. (we cannot just skip the
716 code, since there's no way to know
717 what's in the argument list) */
718 n += strlen(p);
719 goto expand;
720 }
721 } else
722 n++;
723 }
724 expand:
Walter Dörwald346737f2007-05-31 10:44:43 +0000725 if (abuffersize > 20) {
726 abuffer = PyMem_Malloc(abuffersize);
727 if (!abuffer) {
728 PyErr_NoMemory();
729 goto fail;
730 }
731 realbuffer = abuffer;
732 }
733 else
734 realbuffer = buffer;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000735 /* step 4: fill the buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000736 /* Since we've analyzed how much space we need for the worst case,
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000737 we don't have to resize the string.
738 There can be no errors beyond this point. */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000739 string = PyUnicode_FromUnicode(NULL, n);
740 if (!string)
Walter Dörwald346737f2007-05-31 10:44:43 +0000741 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000742
743 s = PyUnicode_AS_UNICODE(string);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000744 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000745
746 for (f = format; *f; f++) {
747 if (*f == '%') {
748 const char* p = f++;
749 int longflag = 0;
750 int size_tflag = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000751 zeropad = (*f == '0');
752 /* parse the width.precision part */
753 width = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000754 while (isdigit(Py_CHARMASK(*f)))
Walter Dörwald346737f2007-05-31 10:44:43 +0000755 width = (width*10) + *f++ - '0';
756 precision = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000757 if (*f == '.') {
758 f++;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000759 while (isdigit(Py_CHARMASK(*f)))
Walter Dörwald346737f2007-05-31 10:44:43 +0000760 precision = (precision*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000761 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000762 /* handle the long flag, but only for %ld and %lu.
763 others can be added when necessary. */
764 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
765 longflag = 1;
766 ++f;
767 }
768 /* handle the size_t flag. */
769 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
770 size_tflag = 1;
771 ++f;
772 }
773
774 switch (*f) {
775 case 'c':
776 *s++ = va_arg(vargs, int);
777 break;
778 case 'd':
Walter Dörwald346737f2007-05-31 10:44:43 +0000779 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000780 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000781 sprintf(realbuffer, fmt, va_arg(vargs, long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000782 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000783 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000784 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000785 sprintf(realbuffer, fmt, va_arg(vargs, int));
786 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000787 break;
788 case 'u':
Walter Dörwald346737f2007-05-31 10:44:43 +0000789 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000790 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000791 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000792 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000793 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000794 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000795 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
796 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000797 break;
798 case 'i':
Walter Dörwald346737f2007-05-31 10:44:43 +0000799 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
800 sprintf(realbuffer, fmt, va_arg(vargs, int));
801 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000802 break;
803 case 'x':
Walter Dörwald346737f2007-05-31 10:44:43 +0000804 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
805 sprintf(realbuffer, fmt, va_arg(vargs, int));
806 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000807 break;
808 case 's':
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000809 {
810 /* Parameter must be UTF-8 encoded.
811 In case of encoding errors, use
812 the replacement character. */
813 PyObject *u;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000814 p = va_arg(vargs, char*);
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000815 u = PyUnicode_DecodeUTF8(p, strlen(p),
816 "replace");
817 if (!u)
818 goto fail;
819 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
820 PyUnicode_GET_SIZE(u));
821 s += PyUnicode_GET_SIZE(u);
822 Py_DECREF(u);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000823 break;
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000824 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000825 case 'U':
826 {
827 PyObject *obj = va_arg(vargs, PyObject *);
Walter Dörwald5c2fab62007-05-24 19:51:02 +0000828 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
829 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
830 s += size;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000831 break;
832 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000833 case 'V':
834 {
835 PyObject *obj = va_arg(vargs, PyObject *);
836 const char *str = va_arg(vargs, const char *);
837 if (obj) {
838 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
839 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
840 s += size;
841 } else {
842 appendstring(str);
843 }
844 break;
845 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000846 case 'S':
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000847 case 'R':
848 {
Guido van Rossum755114a2007-06-13 01:04:27 +0000849 Py_UNICODE *ucopy;
850 Py_ssize_t usize;
851 Py_ssize_t upos;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000852 /* unused, since we already have the result */
853 (void) va_arg(vargs, PyObject *);
Guido van Rossum755114a2007-06-13 01:04:27 +0000854 ucopy = PyUnicode_AS_UNICODE(*callresult);
855 usize = PyUnicode_GET_SIZE(*callresult);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000856 for (upos = 0; upos<usize;)
857 *s++ = ucopy[upos++];
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000858 /* We're done with the unicode()/repr() => forget it */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000859 Py_DECREF(*callresult);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000860 /* switch to next unicode()/repr() result */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000861 ++callresult;
862 break;
863 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000864 case 'p':
865 sprintf(buffer, "%p", va_arg(vargs, void*));
866 /* %p is ill-defined: ensure leading 0x. */
867 if (buffer[1] == 'X')
868 buffer[1] = 'x';
869 else if (buffer[1] != 'x') {
870 memmove(buffer+2, buffer, strlen(buffer)+1);
871 buffer[0] = '0';
872 buffer[1] = 'x';
873 }
874 appendstring(buffer);
875 break;
876 case '%':
877 *s++ = '%';
878 break;
879 default:
880 appendstring(p);
881 goto end;
882 }
883 } else
884 *s++ = *f;
885 }
886
887 end:
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000888 if (callresults)
889 PyMem_Free(callresults);
Walter Dörwald346737f2007-05-31 10:44:43 +0000890 if (abuffer)
891 PyMem_Free(abuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000892 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
893 return string;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000894 fail:
895 if (callresults) {
896 PyObject **callresult2 = callresults;
Guido van Rossum307fa8c2007-07-16 20:46:27 +0000897 while (callresult2 < callresult) {
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000898 Py_DECREF(*callresult2);
899 ++callresult2;
900 }
901 PyMem_Free(callresults);
902 }
Walter Dörwald346737f2007-05-31 10:44:43 +0000903 if (abuffer)
904 PyMem_Free(abuffer);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000905 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000906}
907
908#undef appendstring
909
910PyObject *
911PyUnicode_FromFormat(const char *format, ...)
912{
913 PyObject* ret;
914 va_list vargs;
915
916#ifdef HAVE_STDARG_PROTOTYPES
917 va_start(vargs, format);
918#else
919 va_start(vargs);
920#endif
921 ret = PyUnicode_FromFormatV(format, vargs);
922 va_end(vargs);
923 return ret;
924}
925
Martin v. Löwis18e16552006-02-15 17:27:45 +0000926Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
927 wchar_t *w,
928 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000929{
930 if (unicode == NULL) {
931 PyErr_BadInternalCall();
932 return -1;
933 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000934
935 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000936 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000937 size = PyUnicode_GET_SIZE(unicode) + 1;
938
Guido van Rossumd57fd912000-03-10 22:53:23 +0000939#ifdef HAVE_USABLE_WCHAR_T
940 memcpy(w, unicode->str, size * sizeof(wchar_t));
941#else
942 {
943 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000944 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000945 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000946 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000947 *w++ = *u++;
948 }
949#endif
950
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000951 if (size > PyUnicode_GET_SIZE(unicode))
952 return PyUnicode_GET_SIZE(unicode);
953 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000954 return size;
955}
956
957#endif
958
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000959PyObject *PyUnicode_FromOrdinal(int ordinal)
960{
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000961 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000962
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000963 if (ordinal < 0 || ordinal > 0x10ffff) {
964 PyErr_SetString(PyExc_ValueError,
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000965 "chr() arg not in range(0x110000)");
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000966 return NULL;
967 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000968
969#ifndef Py_UNICODE_WIDE
970 if (ordinal > 0xffff) {
971 ordinal -= 0x10000;
972 s[0] = 0xD800 | (ordinal >> 10);
973 s[1] = 0xDC00 | (ordinal & 0x3FF);
974 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000975 }
976#endif
977
Hye-Shik Chang40574832004-04-06 07:24:51 +0000978 s[0] = (Py_UNICODE)ordinal;
979 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000980}
981
Guido van Rossumd57fd912000-03-10 22:53:23 +0000982PyObject *PyUnicode_FromObject(register PyObject *obj)
983{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000984 /* XXX Perhaps we should make this API an alias of
985 PyObject_Unicode() instead ?! */
986 if (PyUnicode_CheckExact(obj)) {
987 Py_INCREF(obj);
988 return obj;
989 }
990 if (PyUnicode_Check(obj)) {
991 /* For a Unicode subtype that's not a Unicode object,
992 return a true Unicode object with the same data. */
993 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
994 PyUnicode_GET_SIZE(obj));
995 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000996 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
997}
998
999PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1000 const char *encoding,
1001 const char *errors)
1002{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001003 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001004 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001005 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001006
Guido van Rossumd57fd912000-03-10 22:53:23 +00001007 if (obj == NULL) {
1008 PyErr_BadInternalCall();
1009 return NULL;
1010 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001011
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001012 if (PyUnicode_Check(obj)) {
1013 PyErr_SetString(PyExc_TypeError,
1014 "decoding Unicode is not supported");
1015 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001016 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001017
1018 /* Coerce object */
1019 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001020 s = PyString_AS_STRING(obj);
1021 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001022 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001023 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1024 /* Overwrite the error message with something more useful in
1025 case of a TypeError. */
1026 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001027 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001028 "coercing to Unicode: need string or buffer, "
1029 "%.80s found",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00001030 Py_Type(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001031 goto onError;
1032 }
Tim Petersced69f82003-09-16 20:30:58 +00001033
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001034 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001035 if (len == 0) {
1036 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001037 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001038 }
Tim Petersced69f82003-09-16 20:30:58 +00001039 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001040 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001041
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001042 return v;
1043
1044 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001045 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001046}
1047
1048PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001049 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001050 const char *encoding,
1051 const char *errors)
1052{
1053 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001054
1055 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001056 encoding = PyUnicode_GetDefaultEncoding();
1057
1058 /* Shortcuts for common default encodings */
1059 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001060 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001061 else if (strcmp(encoding, "latin-1") == 0)
1062 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001063#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1064 else if (strcmp(encoding, "mbcs") == 0)
1065 return PyUnicode_DecodeMBCS(s, size, errors);
1066#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001067 else if (strcmp(encoding, "ascii") == 0)
1068 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001069
1070 /* Decode via the codec registry */
1071 buffer = PyBuffer_FromMemory((void *)s, size);
1072 if (buffer == NULL)
1073 goto onError;
1074 unicode = PyCodec_Decode(buffer, encoding, errors);
1075 if (unicode == NULL)
1076 goto onError;
1077 if (!PyUnicode_Check(unicode)) {
1078 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001079 "decoder did not return an unicode object (type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00001080 Py_Type(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001081 Py_DECREF(unicode);
1082 goto onError;
1083 }
1084 Py_DECREF(buffer);
1085 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001086
Guido van Rossumd57fd912000-03-10 22:53:23 +00001087 onError:
1088 Py_XDECREF(buffer);
1089 return NULL;
1090}
1091
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001092PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1093 const char *encoding,
1094 const char *errors)
1095{
1096 PyObject *v;
1097
1098 if (!PyUnicode_Check(unicode)) {
1099 PyErr_BadArgument();
1100 goto onError;
1101 }
1102
1103 if (encoding == NULL)
1104 encoding = PyUnicode_GetDefaultEncoding();
1105
1106 /* Decode via the codec registry */
1107 v = PyCodec_Decode(unicode, encoding, errors);
1108 if (v == NULL)
1109 goto onError;
1110 return v;
1111
1112 onError:
1113 return NULL;
1114}
1115
Guido van Rossumd57fd912000-03-10 22:53:23 +00001116PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001117 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001118 const char *encoding,
1119 const char *errors)
1120{
1121 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001122
Guido van Rossumd57fd912000-03-10 22:53:23 +00001123 unicode = PyUnicode_FromUnicode(s, size);
1124 if (unicode == NULL)
1125 return NULL;
1126 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1127 Py_DECREF(unicode);
1128 return v;
1129}
1130
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001131PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1132 const char *encoding,
1133 const char *errors)
1134{
1135 PyObject *v;
1136
1137 if (!PyUnicode_Check(unicode)) {
1138 PyErr_BadArgument();
1139 goto onError;
1140 }
1141
1142 if (encoding == NULL)
1143 encoding = PyUnicode_GetDefaultEncoding();
1144
1145 /* Encode via the codec registry */
1146 v = PyCodec_Encode(unicode, encoding, errors);
1147 if (v == NULL)
1148 goto onError;
1149 return v;
1150
1151 onError:
1152 return NULL;
1153}
1154
Guido van Rossumd57fd912000-03-10 22:53:23 +00001155PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1156 const char *encoding,
1157 const char *errors)
1158{
1159 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001160
Guido van Rossumd57fd912000-03-10 22:53:23 +00001161 if (!PyUnicode_Check(unicode)) {
1162 PyErr_BadArgument();
1163 goto onError;
1164 }
Fred Drakee4315f52000-05-09 19:53:39 +00001165
Tim Petersced69f82003-09-16 20:30:58 +00001166 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001167 encoding = PyUnicode_GetDefaultEncoding();
1168
1169 /* Shortcuts for common default encodings */
1170 if (errors == NULL) {
1171 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001172 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001173 else if (strcmp(encoding, "latin-1") == 0)
1174 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001175#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1176 else if (strcmp(encoding, "mbcs") == 0)
1177 return PyUnicode_AsMBCSString(unicode);
1178#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001179 else if (strcmp(encoding, "ascii") == 0)
1180 return PyUnicode_AsASCIIString(unicode);
1181 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001182
1183 /* Encode via the codec registry */
1184 v = PyCodec_Encode(unicode, encoding, errors);
1185 if (v == NULL)
1186 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001187 if (!PyBytes_Check(v)) {
1188 if (PyString_Check(v)) {
1189 /* Old codec, turn it into bytes */
1190 PyObject *b = PyBytes_FromObject(v);
1191 Py_DECREF(v);
1192 return b;
1193 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001194 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001195 "encoder did not return a bytes object "
1196 "(type=%.400s, encoding=%.20s, errors=%.20s)",
1197 v->ob_type->tp_name,
1198 encoding ? encoding : "NULL",
1199 errors ? errors : "NULL");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001200 Py_DECREF(v);
1201 goto onError;
1202 }
1203 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001204
Guido van Rossumd57fd912000-03-10 22:53:23 +00001205 onError:
1206 return NULL;
1207}
1208
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001209PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1210 const char *errors)
1211{
1212 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001213 PyObject *b;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001214 if (v)
1215 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001216 if (errors != NULL)
1217 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum06610092007-08-16 21:02:22 +00001218 b = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1219 PyUnicode_GET_SIZE(unicode),
1220 NULL);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001221 if (!b)
1222 return NULL;
1223 v = PyString_FromStringAndSize(PyBytes_AsString(b),
1224 PyBytes_Size(b));
1225 Py_DECREF(b);
Guido van Rossume7a0d392007-07-12 07:53:00 +00001226 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001227 return v;
1228}
1229
Martin v. Löwis5b222132007-06-10 09:51:05 +00001230char*
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001231PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001232{
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001233 PyObject *str8;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001234 if (!PyUnicode_Check(unicode)) {
1235 PyErr_BadArgument();
1236 return NULL;
1237 }
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001238 str8 = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1239 if (str8 == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001240 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001241 if (psize != NULL)
1242 *psize = PyString_GET_SIZE(str8);
1243 return PyString_AS_STRING(str8);
1244}
1245
1246char*
1247PyUnicode_AsString(PyObject *unicode)
1248{
1249 return PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001250}
1251
Guido van Rossumd57fd912000-03-10 22:53:23 +00001252Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1253{
1254 if (!PyUnicode_Check(unicode)) {
1255 PyErr_BadArgument();
1256 goto onError;
1257 }
1258 return PyUnicode_AS_UNICODE(unicode);
1259
1260 onError:
1261 return NULL;
1262}
1263
Martin v. Löwis18e16552006-02-15 17:27:45 +00001264Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001265{
1266 if (!PyUnicode_Check(unicode)) {
1267 PyErr_BadArgument();
1268 goto onError;
1269 }
1270 return PyUnicode_GET_SIZE(unicode);
1271
1272 onError:
1273 return -1;
1274}
1275
Thomas Wouters78890102000-07-22 19:25:51 +00001276const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001277{
1278 return unicode_default_encoding;
1279}
1280
1281int PyUnicode_SetDefaultEncoding(const char *encoding)
1282{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001283 if (strcmp(encoding, unicode_default_encoding) != 0) {
1284 PyErr_Format(PyExc_ValueError,
1285 "Can only set default encoding to %s",
1286 unicode_default_encoding);
1287 return -1;
1288 }
Fred Drakee4315f52000-05-09 19:53:39 +00001289 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001290}
1291
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001292/* error handling callback helper:
1293 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001294 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001295 and adjust various state variables.
1296 return 0 on success, -1 on error
1297*/
1298
1299static
1300int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1301 const char *encoding, const char *reason,
Walter Dörwalda651d3d2007-08-30 15:29:21 +00001302 const char **input, const char **inend, Py_ssize_t *startinpos,
1303 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001304 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001305{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001306 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001307
1308 PyObject *restuple = NULL;
1309 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001310 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001311 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001312 Py_ssize_t requiredsize;
1313 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001314 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001315 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001316 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001317 int res = -1;
1318
1319 if (*errorHandler == NULL) {
1320 *errorHandler = PyCodec_LookupError(errors);
1321 if (*errorHandler == NULL)
1322 goto onError;
1323 }
1324
1325 if (*exceptionObject == NULL) {
1326 *exceptionObject = PyUnicodeDecodeError_Create(
Walter Dörwalde78178e2007-07-30 13:31:40 +00001327 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001328 if (*exceptionObject == NULL)
1329 goto onError;
1330 }
1331 else {
1332 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1333 goto onError;
1334 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1335 goto onError;
1336 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1337 goto onError;
1338 }
1339
1340 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1341 if (restuple == NULL)
1342 goto onError;
1343 if (!PyTuple_Check(restuple)) {
1344 PyErr_Format(PyExc_TypeError, &argparse[4]);
1345 goto onError;
1346 }
1347 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1348 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001349
1350 /* Copy back the bytes variables, which might have been modified by the
1351 callback */
1352 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1353 if (!inputobj)
1354 goto onError;
1355 if (!PyBytes_Check(inputobj)) {
1356 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
1357 }
1358 *input = PyBytes_AS_STRING(inputobj);
1359 insize = PyBytes_GET_SIZE(inputobj);
1360 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001361 /* we can DECREF safely, as the exception has another reference,
1362 so the object won't go away. */
1363 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001364
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001365 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001366 newpos = insize+newpos;
1367 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001368 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001369 goto onError;
1370 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001371
1372 /* need more space? (at least enough for what we
1373 have+the replacement+the rest of the string (starting
1374 at the new input position), so we won't have to check space
1375 when there are no errors in the rest of the string) */
1376 repptr = PyUnicode_AS_UNICODE(repunicode);
1377 repsize = PyUnicode_GET_SIZE(repunicode);
1378 requiredsize = *outpos + repsize + insize-newpos;
1379 if (requiredsize > outsize) {
1380 if (requiredsize<2*outsize)
1381 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001382 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001383 goto onError;
1384 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1385 }
1386 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001387 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001388 Py_UNICODE_COPY(*outptr, repptr, repsize);
1389 *outptr += repsize;
1390 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001391
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001392 /* we made it! */
1393 res = 0;
1394
1395 onError:
1396 Py_XDECREF(restuple);
1397 return res;
1398}
1399
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001400/* --- UTF-7 Codec -------------------------------------------------------- */
1401
1402/* see RFC2152 for details */
1403
static
char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
	   0 - not special
	   1 - special
	   2 - whitespace (optional)
	   3 - RFC2152 Set O (optional) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};

/* SPECIAL(c, encodeO, encodeWS) is true when character c cannot be
   written directly: it is outside 1..127, is marked 1 in utf7_special,
   is whitespace while encodeWS is set, or is in Set O while encodeO is
   set.

   Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
    (encodeWS && (utf7_special[(c)] == 2)) || \
    (encodeO && (utf7_special[(c)] == 3)))

/* B64(n): base64 digit for the low six bits of n. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true when c is one of the 64 base64 digits.
   Explicit ASCII ranges are used instead of isalnum(): c may be a
   Py_UNICODE value well above UCHAR_MAX (undefined behaviour for the
   <ctype.h> functions), and isalnum() is locale dependent, so it could
   accept characters that UB64() cannot map. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || (c) == '+' || (c) == '/')

/* UB64(c): value (0..63) of base64 digit c; only meaningful when
   B64CHAR(c) holds.  Relies on ASCII code values. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE: emit base64 digits through `out` for as long as at least six
   bits are pending in `ch`; `bits` is reduced by six per digit. */
#define ENCODE(out, ch, bits)                   \
    while (bits >= 6) {                         \
        *out++ = B64(ch >> (bits-6));           \
        bits -= 6;                              \
    }

/* DECODE: emit 16-bit code units through `out` for as long as at least
   sixteen bits are pending in `ch`.  Expands to code that assigns
   `errmsg` and jumps to the label `utf7Error`, so both must exist at
   the point of use. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0;                                              \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */                              \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *out++ = outCh;                                             \
        }                                                               \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001465
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001466PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001467 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001468 const char *errors)
1469{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001470 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001471 Py_ssize_t startinpos;
1472 Py_ssize_t endinpos;
1473 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001474 const char *e;
1475 PyUnicodeObject *unicode;
1476 Py_UNICODE *p;
1477 const char *errmsg = "";
1478 int inShift = 0;
1479 unsigned int bitsleft = 0;
1480 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001481 int surrogate = 0;
1482 PyObject *errorHandler = NULL;
1483 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001484
1485 unicode = _PyUnicode_New(size);
1486 if (!unicode)
1487 return NULL;
1488 if (size == 0)
1489 return (PyObject *)unicode;
1490
1491 p = unicode->str;
1492 e = s + size;
1493
1494 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001495 Py_UNICODE ch;
1496 restart:
1497 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001498
1499 if (inShift) {
1500 if ((ch == '-') || !B64CHAR(ch)) {
1501 inShift = 0;
1502 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001503
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001504 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1505 if (bitsleft >= 6) {
1506 /* The shift sequence has a partial character in it. If
1507 bitsleft < 6 then we could just classify it as padding
1508 but that is not the case here */
1509
1510 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001511 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001512 }
1513 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001514 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001515 here so indicate the potential of a misencoded character. */
1516
1517 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1518 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1519 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001520 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001521 }
1522
1523 if (ch == '-') {
1524 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001525 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001526 inShift = 1;
1527 }
1528 } else if (SPECIAL(ch,0,0)) {
1529 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001530 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001531 } else {
1532 *p++ = ch;
1533 }
1534 } else {
1535 charsleft = (charsleft << 6) | UB64(ch);
1536 bitsleft += 6;
1537 s++;
1538 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1539 }
1540 }
1541 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001542 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001543 s++;
1544 if (s < e && *s == '-') {
1545 s++;
1546 *p++ = '+';
1547 } else
1548 {
1549 inShift = 1;
1550 bitsleft = 0;
1551 }
1552 }
1553 else if (SPECIAL(ch,0,0)) {
Walter Dörwald2b65c752007-08-30 15:35:26 +00001554 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001555 errmsg = "unexpected special character";
1556 s++;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001557 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001558 }
1559 else {
1560 *p++ = ch;
1561 s++;
1562 }
1563 continue;
1564 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001565 outpos = p-PyUnicode_AS_UNICODE(unicode);
1566 endinpos = s-starts;
1567 if (unicode_decode_call_errorhandler(
1568 errors, &errorHandler,
1569 "utf7", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001570 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001571 (PyObject **)&unicode, &outpos, &p))
1572 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001573 }
1574
1575 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001576 outpos = p-PyUnicode_AS_UNICODE(unicode);
1577 endinpos = size;
1578 if (unicode_decode_call_errorhandler(
1579 errors, &errorHandler,
1580 "utf7", "unterminated shift sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00001581 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001582 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001583 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001584 if (s < e)
1585 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001586 }
1587
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001588 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001589 goto onError;
1590
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001591 Py_XDECREF(errorHandler);
1592 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001593 return (PyObject *)unicode;
1594
1595onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001596 Py_XDECREF(errorHandler);
1597 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001598 Py_DECREF(unicode);
1599 return NULL;
1600}
1601
1602
1603PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001604 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001605 int encodeSetO,
1606 int encodeWhiteSpace,
1607 const char *errors)
1608{
1609 PyObject *v;
1610 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001611 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001612 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001613 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001614 unsigned int bitsleft = 0;
1615 unsigned long charsleft = 0;
1616 char * out;
1617 char * start;
1618
1619 if (size == 0)
Walter Dörwald51ab4142007-05-05 14:43:36 +00001620 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001621
Walter Dörwald51ab4142007-05-05 14:43:36 +00001622 v = PyBytes_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001623 if (v == NULL)
1624 return NULL;
1625
Walter Dörwald51ab4142007-05-05 14:43:36 +00001626 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001627 for (;i < size; ++i) {
1628 Py_UNICODE ch = s[i];
1629
1630 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001631 if (ch == '+') {
1632 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001633 *out++ = '-';
1634 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1635 charsleft = ch;
1636 bitsleft = 16;
1637 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001638 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001639 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001640 } else {
1641 *out++ = (char) ch;
1642 }
1643 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001644 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1645 *out++ = B64(charsleft << (6-bitsleft));
1646 charsleft = 0;
1647 bitsleft = 0;
1648 /* Characters not in the BASE64 set implicitly unshift the sequence
1649 so no '-' is required, except if the character is itself a '-' */
1650 if (B64CHAR(ch) || ch == '-') {
1651 *out++ = '-';
1652 }
1653 inShift = 0;
1654 *out++ = (char) ch;
1655 } else {
1656 bitsleft += 16;
1657 charsleft = (charsleft << 16) | ch;
1658 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1659
1660 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001661 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001662 or '-' then the shift sequence will be terminated implicitly and we
1663 don't have to insert a '-'. */
1664
1665 if (bitsleft == 0) {
1666 if (i + 1 < size) {
1667 Py_UNICODE ch2 = s[i+1];
1668
1669 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001670
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001671 } else if (B64CHAR(ch2) || ch2 == '-') {
1672 *out++ = '-';
1673 inShift = 0;
1674 } else {
1675 inShift = 0;
1676 }
1677
1678 }
1679 else {
1680 *out++ = '-';
1681 inShift = 0;
1682 }
1683 }
Tim Petersced69f82003-09-16 20:30:58 +00001684 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001685 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001686 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001687 if (bitsleft) {
1688 *out++= B64(charsleft << (6-bitsleft) );
1689 *out++ = '-';
1690 }
1691
Walter Dörwald51ab4142007-05-05 14:43:36 +00001692 if (PyBytes_Resize(v, out - start)) {
1693 Py_DECREF(v);
1694 return NULL;
1695 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001696 return v;
1697}
1698
1699#undef SPECIAL
1700#undef B64
1701#undef B64CHAR
1702#undef UB64
1703#undef ENCODE
1704#undef DECODE
1705
Guido van Rossumd57fd912000-03-10 22:53:23 +00001706/* --- UTF-8 Codec -------------------------------------------------------- */
1707
/* Total length, in bytes, of the UTF-8 sequence introduced by each possible
   lead byte (indexed by the byte value); see RFC 2279 for details.
   A length of zero means the byte is an illegal prefix. */
static char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

    /* 0x80 - 0xBF: not valid as the first byte of a sequence */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0xC0 - 0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

    /* 0xE0 - 0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

    /* 0xF0 - 0xF7: four bytes; 0xF8 - 0xFB: five; 0xFC - 0xFD: six;
       0xFE and 0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1729
Guido van Rossumd57fd912000-03-10 22:53:23 +00001730PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001731 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001732 const char *errors)
1733{
Walter Dörwald69652032004-09-07 20:24:22 +00001734 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1735}
1736
1737PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001738 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001739 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001740 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001741{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001742 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001743 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001744 Py_ssize_t startinpos;
1745 Py_ssize_t endinpos;
1746 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001747 const char *e;
1748 PyUnicodeObject *unicode;
1749 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001750 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001751 PyObject *errorHandler = NULL;
1752 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001753
1754 /* Note: size will always be longer than the resulting Unicode
1755 character count */
1756 unicode = _PyUnicode_New(size);
1757 if (!unicode)
1758 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001759 if (size == 0) {
1760 if (consumed)
1761 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001762 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001763 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001764
1765 /* Unpack UTF-8 encoded data */
1766 p = unicode->str;
1767 e = s + size;
1768
1769 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001770 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001771
1772 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001773 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001774 s++;
1775 continue;
1776 }
1777
1778 n = utf8_code_length[ch];
1779
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001780 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001781 if (consumed)
1782 break;
1783 else {
1784 errmsg = "unexpected end of data";
1785 startinpos = s-starts;
1786 endinpos = size;
1787 goto utf8Error;
1788 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001789 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001790
1791 switch (n) {
1792
1793 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001794 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001795 startinpos = s-starts;
1796 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001797 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001798
1799 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001800 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001801 startinpos = s-starts;
1802 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001803 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001804
1805 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001806 if ((s[1] & 0xc0) != 0x80) {
1807 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001808 startinpos = s-starts;
1809 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001810 goto utf8Error;
1811 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001812 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001813 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001814 startinpos = s-starts;
1815 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001816 errmsg = "illegal encoding";
1817 goto utf8Error;
1818 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001819 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001820 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001821 break;
1822
1823 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001824 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001825 (s[2] & 0xc0) != 0x80) {
1826 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001827 startinpos = s-starts;
1828 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001829 goto utf8Error;
1830 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001831 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001832 if (ch < 0x0800) {
1833 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001834 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001835
1836 XXX For wide builds (UCS-4) we should probably try
1837 to recombine the surrogates into a single code
1838 unit.
1839 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001840 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001841 startinpos = s-starts;
1842 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001843 goto utf8Error;
1844 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001845 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001846 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001847 break;
1848
1849 case 4:
1850 if ((s[1] & 0xc0) != 0x80 ||
1851 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001852 (s[3] & 0xc0) != 0x80) {
1853 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001854 startinpos = s-starts;
1855 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001856 goto utf8Error;
1857 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001858 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1859 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1860 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001861 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001862 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001863 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001864 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001865 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001866 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001867 startinpos = s-starts;
1868 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001869 goto utf8Error;
1870 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001871#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001872 *p++ = (Py_UNICODE)ch;
1873#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001874 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001875
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001876 /* translate from 10000..10FFFF to 0..FFFF */
1877 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001878
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001879 /* high surrogate = top 10 bits added to D800 */
1880 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001881
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001882 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001883 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001884#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001885 break;
1886
1887 default:
1888 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001889 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001890 startinpos = s-starts;
1891 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001892 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001893 }
1894 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001895 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001896
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001897 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001898 outpos = p-PyUnicode_AS_UNICODE(unicode);
1899 if (unicode_decode_call_errorhandler(
1900 errors, &errorHandler,
1901 "utf8", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001902 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001903 (PyObject **)&unicode, &outpos, &p))
1904 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001905 }
Walter Dörwald69652032004-09-07 20:24:22 +00001906 if (consumed)
1907 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001908
1909 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001910 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001911 goto onError;
1912
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001913 Py_XDECREF(errorHandler);
1914 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001915 return (PyObject *)unicode;
1916
1917onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001918 Py_XDECREF(errorHandler);
1919 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001920 Py_DECREF(unicode);
1921 return NULL;
1922}
1923
Tim Peters602f7402002-04-27 18:03:26 +00001924/* Allocation strategy: if the string is short, convert into a stack buffer
1925 and allocate exactly as much space needed at the end. Else allocate the
1926 maximum possible needed (4 result bytes per Unicode character), and return
1927 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001928*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001929PyObject *
1930PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001931 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001932 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001933{
Tim Peters602f7402002-04-27 18:03:26 +00001934#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001935
Martin v. Löwis18e16552006-02-15 17:27:45 +00001936 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001937 PyObject *v; /* result string object */
1938 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001939 Py_ssize_t nallocated; /* number of result bytes allocated */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001940 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001941 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001942
Tim Peters602f7402002-04-27 18:03:26 +00001943 assert(s != NULL);
1944 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001945
Tim Peters602f7402002-04-27 18:03:26 +00001946 if (size <= MAX_SHORT_UNICHARS) {
1947 /* Write into the stack buffer; nallocated can't overflow.
1948 * At the end, we'll allocate exactly as much heap space as it
1949 * turns out we need.
1950 */
1951 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1952 v = NULL; /* will allocate after we're done */
1953 p = stackbuf;
1954 }
1955 else {
1956 /* Overallocate on the heap, and give the excess back at the end. */
1957 nallocated = size * 4;
1958 if (nallocated / 4 != size) /* overflow! */
1959 return PyErr_NoMemory();
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001960 v = PyBytes_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00001961 if (v == NULL)
1962 return NULL;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001963 p = PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001964 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001965
Tim Peters602f7402002-04-27 18:03:26 +00001966 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001967 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001968
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001969 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001970 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001971 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001972
Guido van Rossumd57fd912000-03-10 22:53:23 +00001973 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001974 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001975 *p++ = (char)(0xc0 | (ch >> 6));
1976 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001977 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001978 else {
Tim Peters602f7402002-04-27 18:03:26 +00001979 /* Encode UCS2 Unicode ordinals */
1980 if (ch < 0x10000) {
1981 /* Special case: check for high surrogate */
1982 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1983 Py_UCS4 ch2 = s[i];
1984 /* Check for low surrogate and combine the two to
1985 form a UCS4 value */
1986 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001987 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001988 i++;
1989 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001990 }
Tim Peters602f7402002-04-27 18:03:26 +00001991 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001992 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001993 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001994 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1995 *p++ = (char)(0x80 | (ch & 0x3f));
1996 continue;
1997 }
1998encodeUCS4:
1999 /* Encode UCS4 Unicode ordinals */
2000 *p++ = (char)(0xf0 | (ch >> 18));
2001 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2002 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2003 *p++ = (char)(0x80 | (ch & 0x3f));
2004 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002005 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002006
Tim Peters602f7402002-04-27 18:03:26 +00002007 if (v == NULL) {
2008 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002009 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002010 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002011 v = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002012 }
2013 else {
2014 /* Cut back to size actually needed. */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002015 nneeded = p - PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002016 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002017 PyBytes_Resize(v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002018 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002019 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002020
Tim Peters602f7402002-04-27 18:03:26 +00002021#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002022}
2023
Guido van Rossumd57fd912000-03-10 22:53:23 +00002024PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2025{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002026 if (!PyUnicode_Check(unicode)) {
2027 PyErr_BadArgument();
2028 return NULL;
2029 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002030 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2031 PyUnicode_GET_SIZE(unicode),
2032 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002033}
2034
Walter Dörwald41980ca2007-08-16 21:55:45 +00002035/* --- UTF-32 Codec ------------------------------------------------------- */
2036
2037PyObject *
2038PyUnicode_DecodeUTF32(const char *s,
2039 Py_ssize_t size,
2040 const char *errors,
2041 int *byteorder)
2042{
2043 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2044}
2045
2046PyObject *
2047PyUnicode_DecodeUTF32Stateful(const char *s,
2048 Py_ssize_t size,
2049 const char *errors,
2050 int *byteorder,
2051 Py_ssize_t *consumed)
2052{
2053 const char *starts = s;
2054 Py_ssize_t startinpos;
2055 Py_ssize_t endinpos;
2056 Py_ssize_t outpos;
2057 PyUnicodeObject *unicode;
2058 Py_UNICODE *p;
2059#ifndef Py_UNICODE_WIDE
2060 int i, pairs;
2061#else
2062 const int pairs = 0;
2063#endif
2064 const unsigned char *q, *e;
2065 int bo = 0; /* assume native ordering by default */
2066 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002067 /* Offsets from q for retrieving bytes in the right order. */
2068#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2069 int iorder[] = {0, 1, 2, 3};
2070#else
2071 int iorder[] = {3, 2, 1, 0};
2072#endif
2073 PyObject *errorHandler = NULL;
2074 PyObject *exc = NULL;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002075 /* On narrow builds we split characters outside the BMP into two
2076 codepoints => count how much extra space we need. */
2077#ifndef Py_UNICODE_WIDE
2078 for (i = pairs = 0; i < size/4; i++)
2079 if (((Py_UCS4 *)s)[i] >= 0x10000)
2080 pairs++;
2081#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00002082
2083 /* This might be one to much, because of a BOM */
2084 unicode = _PyUnicode_New((size+3)/4+pairs);
2085 if (!unicode)
2086 return NULL;
2087 if (size == 0)
2088 return (PyObject *)unicode;
2089
2090 /* Unpack UTF-32 encoded data */
2091 p = unicode->str;
2092 q = (unsigned char *)s;
2093 e = q + size;
2094
2095 if (byteorder)
2096 bo = *byteorder;
2097
2098 /* Check for BOM marks (U+FEFF) in the input and adjust current
2099 byte order setting accordingly. In native mode, the leading BOM
2100 mark is skipped, in all other modes, it is copied to the output
2101 stream as-is (giving a ZWNBSP character). */
2102 if (bo == 0) {
2103 if (size >= 4) {
2104 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2105 (q[iorder[1]] << 8) | q[iorder[0]];
2106#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2107 if (bom == 0x0000FEFF) {
2108 q += 4;
2109 bo = -1;
2110 }
2111 else if (bom == 0xFFFE0000) {
2112 q += 4;
2113 bo = 1;
2114 }
2115#else
2116 if (bom == 0x0000FEFF) {
2117 q += 4;
2118 bo = 1;
2119 }
2120 else if (bom == 0xFFFE0000) {
2121 q += 4;
2122 bo = -1;
2123 }
2124#endif
2125 }
2126 }
2127
2128 if (bo == -1) {
2129 /* force LE */
2130 iorder[0] = 0;
2131 iorder[1] = 1;
2132 iorder[2] = 2;
2133 iorder[3] = 3;
2134 }
2135 else if (bo == 1) {
2136 /* force BE */
2137 iorder[0] = 3;
2138 iorder[1] = 2;
2139 iorder[2] = 1;
2140 iorder[3] = 0;
2141 }
2142
2143 while (q < e) {
2144 Py_UCS4 ch;
2145 /* remaining bytes at the end? (size should be divisible by 4) */
2146 if (e-q<4) {
2147 if (consumed)
2148 break;
2149 errmsg = "truncated data";
2150 startinpos = ((const char *)q)-starts;
2151 endinpos = ((const char *)e)-starts;
2152 goto utf32Error;
2153 /* The remaining input chars are ignored if the callback
2154 chooses to skip the input */
2155 }
2156 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2157 (q[iorder[1]] << 8) | q[iorder[0]];
2158
2159 if (ch >= 0x110000)
2160 {
2161 errmsg = "codepoint not in range(0x110000)";
2162 startinpos = ((const char *)q)-starts;
2163 endinpos = startinpos+4;
2164 goto utf32Error;
2165 }
2166#ifndef Py_UNICODE_WIDE
2167 if (ch >= 0x10000)
2168 {
2169 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2170 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2171 }
2172 else
2173#endif
2174 *p++ = ch;
2175 q += 4;
2176 continue;
2177 utf32Error:
2178 outpos = p-PyUnicode_AS_UNICODE(unicode);
2179 if (unicode_decode_call_errorhandler(
2180 errors, &errorHandler,
2181 "utf32", errmsg,
2182 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2183 (PyObject **)&unicode, &outpos, &p))
2184 goto onError;
2185 }
2186
2187 if (byteorder)
2188 *byteorder = bo;
2189
2190 if (consumed)
2191 *consumed = (const char *)q-starts;
2192
2193 /* Adjust length */
2194 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2195 goto onError;
2196
2197 Py_XDECREF(errorHandler);
2198 Py_XDECREF(exc);
2199 return (PyObject *)unicode;
2200
2201onError:
2202 Py_DECREF(unicode);
2203 Py_XDECREF(errorHandler);
2204 Py_XDECREF(exc);
2205 return NULL;
2206}
2207
2208PyObject *
2209PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2210 Py_ssize_t size,
2211 const char *errors,
2212 int byteorder)
2213{
2214 PyObject *v;
2215 unsigned char *p;
2216#ifndef Py_UNICODE_WIDE
2217 int i, pairs;
2218#else
2219 const int pairs = 0;
2220#endif
2221 /* Offsets from p for storing byte pairs in the right order. */
2222#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2223 int iorder[] = {0, 1, 2, 3};
2224#else
2225 int iorder[] = {3, 2, 1, 0};
2226#endif
2227
2228#define STORECHAR(CH) \
2229 do { \
2230 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2231 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2232 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2233 p[iorder[0]] = (CH) & 0xff; \
2234 p += 4; \
2235 } while(0)
2236
2237 /* In narrow builds we can output surrogate pairs as one codepoint,
2238 so we need less space. */
2239#ifndef Py_UNICODE_WIDE
2240 for (i = pairs = 0; i < size-1; i++)
2241 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2242 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2243 pairs++;
2244#endif
2245 v = PyBytes_FromStringAndSize(NULL,
2246 4 * (size - pairs + (byteorder == 0)));
2247 if (v == NULL)
2248 return NULL;
2249
2250 p = (unsigned char *)PyBytes_AS_STRING(v);
2251 if (byteorder == 0)
2252 STORECHAR(0xFEFF);
2253 if (size == 0)
2254 return v;
2255
2256 if (byteorder == -1) {
2257 /* force LE */
2258 iorder[0] = 0;
2259 iorder[1] = 1;
2260 iorder[2] = 2;
2261 iorder[3] = 3;
2262 }
2263 else if (byteorder == 1) {
2264 /* force BE */
2265 iorder[0] = 3;
2266 iorder[1] = 2;
2267 iorder[2] = 1;
2268 iorder[3] = 0;
2269 }
2270
2271 while (size-- > 0) {
2272 Py_UCS4 ch = *s++;
2273#ifndef Py_UNICODE_WIDE
2274 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2275 Py_UCS4 ch2 = *s;
2276 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2277 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2278 s++;
2279 size--;
2280 }
2281 }
2282#endif
2283 STORECHAR(ch);
2284 }
2285 return v;
2286#undef STORECHAR
2287}
2288
2289PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2290{
2291 if (!PyUnicode_Check(unicode)) {
2292 PyErr_BadArgument();
2293 return NULL;
2294 }
2295 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2296 PyUnicode_GET_SIZE(unicode),
2297 NULL,
2298 0);
2299}
2300
Guido van Rossumd57fd912000-03-10 22:53:23 +00002301/* --- UTF-16 Codec ------------------------------------------------------- */
2302
Tim Peters772747b2001-08-09 22:21:55 +00002303PyObject *
2304PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002305 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002306 const char *errors,
2307 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002308{
Walter Dörwald69652032004-09-07 20:24:22 +00002309 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2310}
2311
2312PyObject *
2313PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002314 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002315 const char *errors,
2316 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002317 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002318{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002319 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002320 Py_ssize_t startinpos;
2321 Py_ssize_t endinpos;
2322 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002323 PyUnicodeObject *unicode;
2324 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002325 const unsigned char *q, *e;
2326 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002327 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002328 /* Offsets from q for retrieving byte pairs in the right order. */
2329#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2330 int ihi = 1, ilo = 0;
2331#else
2332 int ihi = 0, ilo = 1;
2333#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002334 PyObject *errorHandler = NULL;
2335 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002336
2337 /* Note: size will always be longer than the resulting Unicode
2338 character count */
2339 unicode = _PyUnicode_New(size);
2340 if (!unicode)
2341 return NULL;
2342 if (size == 0)
2343 return (PyObject *)unicode;
2344
2345 /* Unpack UTF-16 encoded data */
2346 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002347 q = (unsigned char *)s;
2348 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002349
2350 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002351 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002352
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002353 /* Check for BOM marks (U+FEFF) in the input and adjust current
2354 byte order setting accordingly. In native mode, the leading BOM
2355 mark is skipped, in all other modes, it is copied to the output
2356 stream as-is (giving a ZWNBSP character). */
2357 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002358 if (size >= 2) {
2359 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002360#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002361 if (bom == 0xFEFF) {
2362 q += 2;
2363 bo = -1;
2364 }
2365 else if (bom == 0xFFFE) {
2366 q += 2;
2367 bo = 1;
2368 }
Tim Petersced69f82003-09-16 20:30:58 +00002369#else
Walter Dörwald69652032004-09-07 20:24:22 +00002370 if (bom == 0xFEFF) {
2371 q += 2;
2372 bo = 1;
2373 }
2374 else if (bom == 0xFFFE) {
2375 q += 2;
2376 bo = -1;
2377 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002378#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002379 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002380 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002381
Tim Peters772747b2001-08-09 22:21:55 +00002382 if (bo == -1) {
2383 /* force LE */
2384 ihi = 1;
2385 ilo = 0;
2386 }
2387 else if (bo == 1) {
2388 /* force BE */
2389 ihi = 0;
2390 ilo = 1;
2391 }
2392
2393 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002394 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002395 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002396 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002397 if (consumed)
2398 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002399 errmsg = "truncated data";
2400 startinpos = ((const char *)q)-starts;
2401 endinpos = ((const char *)e)-starts;
2402 goto utf16Error;
2403 /* The remaining input chars are ignored if the callback
2404 chooses to skip the input */
2405 }
2406 ch = (q[ihi] << 8) | q[ilo];
2407
Tim Peters772747b2001-08-09 22:21:55 +00002408 q += 2;
2409
Guido van Rossumd57fd912000-03-10 22:53:23 +00002410 if (ch < 0xD800 || ch > 0xDFFF) {
2411 *p++ = ch;
2412 continue;
2413 }
2414
2415 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002416 if (q >= e) {
2417 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002418 startinpos = (((const char *)q)-2)-starts;
2419 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002420 goto utf16Error;
2421 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002422 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002423 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2424 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002425 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002426#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002427 *p++ = ch;
2428 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002429#else
2430 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002431#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002432 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002433 }
2434 else {
2435 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002436 startinpos = (((const char *)q)-4)-starts;
2437 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002438 goto utf16Error;
2439 }
2440
Guido van Rossumd57fd912000-03-10 22:53:23 +00002441 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002442 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002443 startinpos = (((const char *)q)-2)-starts;
2444 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002445 /* Fall through to report the error */
2446
2447 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002448 outpos = p-PyUnicode_AS_UNICODE(unicode);
2449 if (unicode_decode_call_errorhandler(
2450 errors, &errorHandler,
2451 "utf16", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002452 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002453 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002454 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002455 }
2456
2457 if (byteorder)
2458 *byteorder = bo;
2459
Walter Dörwald69652032004-09-07 20:24:22 +00002460 if (consumed)
2461 *consumed = (const char *)q-starts;
2462
Guido van Rossumd57fd912000-03-10 22:53:23 +00002463 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002464 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002465 goto onError;
2466
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002467 Py_XDECREF(errorHandler);
2468 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002469 return (PyObject *)unicode;
2470
2471onError:
2472 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002473 Py_XDECREF(errorHandler);
2474 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002475 return NULL;
2476}
2477
Tim Peters772747b2001-08-09 22:21:55 +00002478PyObject *
2479PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002480 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002481 const char *errors,
2482 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002483{
2484 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002485 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002486#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00002487 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002488#else
2489 const int pairs = 0;
2490#endif
Tim Peters772747b2001-08-09 22:21:55 +00002491 /* Offsets from p for storing byte pairs in the right order. */
2492#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2493 int ihi = 1, ilo = 0;
2494#else
2495 int ihi = 0, ilo = 1;
2496#endif
2497
2498#define STORECHAR(CH) \
2499 do { \
2500 p[ihi] = ((CH) >> 8) & 0xff; \
2501 p[ilo] = (CH) & 0xff; \
2502 p += 2; \
2503 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002504
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002505#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002506 for (i = pairs = 0; i < size; i++)
2507 if (s[i] >= 0x10000)
2508 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002509#endif
Walter Dörwald3cc34522007-05-04 10:48:27 +00002510 v = PyBytes_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002511 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002512 if (v == NULL)
2513 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002514
Walter Dörwald3cc34522007-05-04 10:48:27 +00002515 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002516 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002517 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002518 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002519 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002520
2521 if (byteorder == -1) {
2522 /* force LE */
2523 ihi = 1;
2524 ilo = 0;
2525 }
2526 else if (byteorder == 1) {
2527 /* force BE */
2528 ihi = 0;
2529 ilo = 1;
2530 }
2531
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002532 while (size-- > 0) {
2533 Py_UNICODE ch = *s++;
2534 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002535#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002536 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002537 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2538 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002539 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002540#endif
Tim Peters772747b2001-08-09 22:21:55 +00002541 STORECHAR(ch);
2542 if (ch2)
2543 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002544 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002545 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002546#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002547}
2548
2549PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2550{
2551 if (!PyUnicode_Check(unicode)) {
2552 PyErr_BadArgument();
2553 return NULL;
2554 }
2555 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2556 PyUnicode_GET_SIZE(unicode),
2557 NULL,
2558 0);
2559}
2560
2561/* --- Unicode Escape Codec ----------------------------------------------- */
2562
Fredrik Lundh06d12682001-01-24 07:59:11 +00002563static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002564
Guido van Rossumd57fd912000-03-10 22:53:23 +00002565PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002566 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002567 const char *errors)
2568{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002569 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002570 Py_ssize_t startinpos;
2571 Py_ssize_t endinpos;
2572 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002573 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002574 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002575 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002576 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002577 char* message;
2578 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002579 PyObject *errorHandler = NULL;
2580 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002581
Guido van Rossumd57fd912000-03-10 22:53:23 +00002582 /* Escaped strings will always be longer than the resulting
2583 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002584 length after conversion to the true value.
2585 (but if the error callback returns a long replacement string
2586 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002587 v = _PyUnicode_New(size);
2588 if (v == NULL)
2589 goto onError;
2590 if (size == 0)
2591 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002592
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002593 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002594 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002595
Guido van Rossumd57fd912000-03-10 22:53:23 +00002596 while (s < end) {
2597 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002598 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002599 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002600
2601 /* Non-escape characters are interpreted as Unicode ordinals */
2602 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002603 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002604 continue;
2605 }
2606
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002607 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002608 /* \ - Escapes */
2609 s++;
2610 switch (*s++) {
2611
2612 /* \x escapes */
2613 case '\n': break;
2614 case '\\': *p++ = '\\'; break;
2615 case '\'': *p++ = '\''; break;
2616 case '\"': *p++ = '\"'; break;
2617 case 'b': *p++ = '\b'; break;
2618 case 'f': *p++ = '\014'; break; /* FF */
2619 case 't': *p++ = '\t'; break;
2620 case 'n': *p++ = '\n'; break;
2621 case 'r': *p++ = '\r'; break;
2622 case 'v': *p++ = '\013'; break; /* VT */
2623 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2624
2625 /* \OOO (octal) escapes */
2626 case '0': case '1': case '2': case '3':
2627 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002628 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002629 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002630 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002631 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002632 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002633 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002634 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002635 break;
2636
Fredrik Lundhccc74732001-02-18 22:13:49 +00002637 /* hex escapes */
2638 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002639 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002640 digits = 2;
2641 message = "truncated \\xXX escape";
2642 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002643
Fredrik Lundhccc74732001-02-18 22:13:49 +00002644 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002645 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002646 digits = 4;
2647 message = "truncated \\uXXXX escape";
2648 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002649
Fredrik Lundhccc74732001-02-18 22:13:49 +00002650 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002651 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002652 digits = 8;
2653 message = "truncated \\UXXXXXXXX escape";
2654 hexescape:
2655 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002656 outpos = p-PyUnicode_AS_UNICODE(v);
2657 if (s+digits>end) {
2658 endinpos = size;
2659 if (unicode_decode_call_errorhandler(
2660 errors, &errorHandler,
2661 "unicodeescape", "end of string in escape sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002662 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002663 (PyObject **)&v, &outpos, &p))
2664 goto onError;
2665 goto nextByte;
2666 }
2667 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002668 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002669 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002670 endinpos = (s+i+1)-starts;
2671 if (unicode_decode_call_errorhandler(
2672 errors, &errorHandler,
2673 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002674 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002675 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002676 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002677 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002678 }
2679 chr = (chr<<4) & ~0xF;
2680 if (c >= '0' && c <= '9')
2681 chr += c - '0';
2682 else if (c >= 'a' && c <= 'f')
2683 chr += 10 + c - 'a';
2684 else
2685 chr += 10 + c - 'A';
2686 }
2687 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002688 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002689 /* _decoding_error will have already written into the
2690 target buffer. */
2691 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002692 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002693 /* when we get here, chr is a 32-bit unicode character */
2694 if (chr <= 0xffff)
2695 /* UCS-2 character */
2696 *p++ = (Py_UNICODE) chr;
2697 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002698 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002699 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002700#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002701 *p++ = chr;
2702#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002703 chr -= 0x10000L;
2704 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002705 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002706#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002707 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002708 endinpos = s-starts;
2709 outpos = p-PyUnicode_AS_UNICODE(v);
2710 if (unicode_decode_call_errorhandler(
2711 errors, &errorHandler,
2712 "unicodeescape", "illegal Unicode character",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002713 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002714 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002715 goto onError;
2716 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002717 break;
2718
2719 /* \N{name} */
2720 case 'N':
2721 message = "malformed \\N character escape";
2722 if (ucnhash_CAPI == NULL) {
2723 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002724 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002725 m = PyImport_ImportModule("unicodedata");
2726 if (m == NULL)
2727 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002728 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002729 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002730 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002731 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002732 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002733 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002734 if (ucnhash_CAPI == NULL)
2735 goto ucnhashError;
2736 }
2737 if (*s == '{') {
2738 const char *start = s+1;
2739 /* look for the closing brace */
2740 while (*s != '}' && s < end)
2741 s++;
2742 if (s > start && s < end && *s == '}') {
2743 /* found a name. look it up in the unicode database */
2744 message = "unknown Unicode character name";
2745 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002746 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002747 goto store;
2748 }
2749 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002750 endinpos = s-starts;
2751 outpos = p-PyUnicode_AS_UNICODE(v);
2752 if (unicode_decode_call_errorhandler(
2753 errors, &errorHandler,
2754 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002755 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002756 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002757 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002758 break;
2759
2760 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002761 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002762 message = "\\ at end of string";
2763 s--;
2764 endinpos = s-starts;
2765 outpos = p-PyUnicode_AS_UNICODE(v);
2766 if (unicode_decode_call_errorhandler(
2767 errors, &errorHandler,
2768 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002769 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002770 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002771 goto onError;
2772 }
2773 else {
2774 *p++ = '\\';
2775 *p++ = (unsigned char)s[-1];
2776 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002777 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002778 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002779 nextByte:
2780 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002781 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002782 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002783 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002784 Py_XDECREF(errorHandler);
2785 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002786 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002787
Fredrik Lundhccc74732001-02-18 22:13:49 +00002788ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002789 PyErr_SetString(
2790 PyExc_UnicodeError,
2791 "\\N escapes not supported (can't load unicodedata module)"
2792 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002793 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002794 Py_XDECREF(errorHandler);
2795 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002796 return NULL;
2797
Fredrik Lundhccc74732001-02-18 22:13:49 +00002798onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002799 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002800 Py_XDECREF(errorHandler);
2801 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002802 return NULL;
2803}
2804
2805/* Return a Unicode-Escape string version of the Unicode object.
2806
2807 If quotes is true, the string is enclosed in u"" or u'' quotes as
2808 appropriate.
2809
2810*/
2811
Thomas Wouters477c8d52006-05-27 19:21:47 +00002812Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2813 Py_ssize_t size,
2814 Py_UNICODE ch)
2815{
2816 /* like wcschr, but doesn't stop at NULL characters */
2817
2818 while (size-- > 0) {
2819 if (*s == ch)
2820 return s;
2821 s++;
2822 }
2823
2824 return NULL;
2825}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002826
Walter Dörwald79e913e2007-05-12 11:08:06 +00002827static const char *hexdigits = "0123456789abcdef";
2828
2829PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2830 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002831{
2832 PyObject *repr;
2833 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002834
Thomas Wouters89f507f2006-12-13 04:49:30 +00002835 /* XXX(nnorwitz): rather than over-allocating, it would be
2836 better to choose a different scheme. Perhaps scan the
2837 first N-chars of the string and allocate based on that size.
2838 */
2839 /* Initial allocation is based on the longest-possible unichr
2840 escape.
2841
2842 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2843 unichr, so in this case it's the longest unichr escape. In
2844 narrow (UTF-16) builds this is five chars per source unichr
2845 since there are two unichrs in the surrogate pair, so in narrow
2846 (UTF-16) builds it's not the longest unichr escape.
2847
2848 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2849 so in the narrow (UTF-16) build case it's the longest unichr
2850 escape.
2851 */
2852
Walter Dörwald79e913e2007-05-12 11:08:06 +00002853 repr = PyBytes_FromStringAndSize(NULL,
Thomas Wouters89f507f2006-12-13 04:49:30 +00002854#ifdef Py_UNICODE_WIDE
2855 + 10*size
2856#else
2857 + 6*size
2858#endif
2859 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002860 if (repr == NULL)
2861 return NULL;
2862
Walter Dörwald79e913e2007-05-12 11:08:06 +00002863 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002864
Guido van Rossumd57fd912000-03-10 22:53:23 +00002865 while (size-- > 0) {
2866 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002867
Walter Dörwald79e913e2007-05-12 11:08:06 +00002868 /* Escape backslashes */
2869 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002870 *p++ = '\\';
2871 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00002872 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002873 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002874
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002875#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002876 /* Map 21-bit characters to '\U00xxxxxx' */
2877 else if (ch >= 0x10000) {
2878 *p++ = '\\';
2879 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002880 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
2881 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
2882 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
2883 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
2884 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
2885 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
2886 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
2887 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002888 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002889 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002890#else
2891 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002892 else if (ch >= 0xD800 && ch < 0xDC00) {
2893 Py_UNICODE ch2;
2894 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002895
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002896 ch2 = *s++;
2897 size--;
2898 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2899 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2900 *p++ = '\\';
2901 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002902 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
2903 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
2904 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
2905 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
2906 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
2907 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
2908 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
2909 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002910 continue;
2911 }
2912 /* Fall through: isolated surrogates are copied as-is */
2913 s--;
2914 size++;
2915 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002916#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002917
Guido van Rossumd57fd912000-03-10 22:53:23 +00002918 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002919 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002920 *p++ = '\\';
2921 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002922 *p++ = hexdigits[(ch >> 12) & 0x000F];
2923 *p++ = hexdigits[(ch >> 8) & 0x000F];
2924 *p++ = hexdigits[(ch >> 4) & 0x000F];
2925 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002926 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002927
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002928 /* Map special whitespace to '\t', \n', '\r' */
2929 else if (ch == '\t') {
2930 *p++ = '\\';
2931 *p++ = 't';
2932 }
2933 else if (ch == '\n') {
2934 *p++ = '\\';
2935 *p++ = 'n';
2936 }
2937 else if (ch == '\r') {
2938 *p++ = '\\';
2939 *p++ = 'r';
2940 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002941
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002942 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002943 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002944 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002945 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002946 *p++ = hexdigits[(ch >> 4) & 0x000F];
2947 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002948 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002949
Guido van Rossumd57fd912000-03-10 22:53:23 +00002950 /* Copy everything else as-is */
2951 else
2952 *p++ = (char) ch;
2953 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002954
2955 *p = '\0';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002956 if (PyBytes_Resize(repr, p - PyBytes_AS_STRING(repr))) {
2957 Py_DECREF(repr);
2958 return NULL;
2959 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002960 return repr;
2961}
2962
Guido van Rossumd57fd912000-03-10 22:53:23 +00002963PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2964{
Walter Dörwald79e913e2007-05-12 11:08:06 +00002965 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002966 if (!PyUnicode_Check(unicode)) {
2967 PyErr_BadArgument();
2968 return NULL;
2969 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00002970 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2971 PyUnicode_GET_SIZE(unicode));
2972
2973 if (!s)
2974 return NULL;
2975 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2976 PyBytes_GET_SIZE(s));
2977 Py_DECREF(s);
2978 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002979}
2980
2981/* --- Raw Unicode Escape Codec ------------------------------------------- */
2982
2983PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002984 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002985 const char *errors)
2986{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002987 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002988 Py_ssize_t startinpos;
2989 Py_ssize_t endinpos;
2990 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002991 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002992 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002993 const char *end;
2994 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002995 PyObject *errorHandler = NULL;
2996 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002997
Guido van Rossumd57fd912000-03-10 22:53:23 +00002998 /* Escaped strings will always be longer than the resulting
2999 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003000 length after conversion to the true value. (But decoding error
3001 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003002 v = _PyUnicode_New(size);
3003 if (v == NULL)
3004 goto onError;
3005 if (size == 0)
3006 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003007 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003008 end = s + size;
3009 while (s < end) {
3010 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00003011 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003012 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003013 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003014
3015 /* Non-escape characters are interpreted as Unicode ordinals */
3016 if (*s != '\\') {
3017 *p++ = (unsigned char)*s++;
3018 continue;
3019 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003020 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003021
3022 /* \u-escapes are only interpreted iff the number of leading
3023 backslashes if odd */
3024 bs = s;
3025 for (;s < end;) {
3026 if (*s != '\\')
3027 break;
3028 *p++ = (unsigned char)*s++;
3029 }
3030 if (((s - bs) & 1) == 0 ||
3031 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003032 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003033 continue;
3034 }
3035 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003036 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003037 s++;
3038
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003039 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003040 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003041 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003042 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003043 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003044 endinpos = s-starts;
3045 if (unicode_decode_call_errorhandler(
3046 errors, &errorHandler,
3047 "rawunicodeescape", "truncated \\uXXXX",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003048 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003049 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003050 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003051 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003052 }
3053 x = (x<<4) & ~0xF;
3054 if (c >= '0' && c <= '9')
3055 x += c - '0';
3056 else if (c >= 'a' && c <= 'f')
3057 x += 10 + c - 'a';
3058 else
3059 x += 10 + c - 'A';
3060 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003061#ifndef Py_UNICODE_WIDE
3062 if (x > 0x10000) {
3063 if (unicode_decode_call_errorhandler(
3064 errors, &errorHandler,
3065 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003066 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003067 (PyObject **)&v, &outpos, &p))
3068 goto onError;
3069 }
3070#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003071 *p++ = x;
3072 nextByte:
3073 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003074 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003075 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003076 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003077 Py_XDECREF(errorHandler);
3078 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003079 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003080
Guido van Rossumd57fd912000-03-10 22:53:23 +00003081 onError:
3082 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003083 Py_XDECREF(errorHandler);
3084 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003085 return NULL;
3086}
3087
3088PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003089 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003090{
3091 PyObject *repr;
3092 char *p;
3093 char *q;
3094
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003095#ifdef Py_UNICODE_WIDE
Walter Dörwald711005d2007-05-12 12:03:26 +00003096 repr = PyBytes_FromStringAndSize(NULL, 10 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003097#else
Walter Dörwald711005d2007-05-12 12:03:26 +00003098 repr = PyBytes_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003099#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003100 if (repr == NULL)
3101 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003102 if (size == 0)
3103 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003104
Walter Dörwald711005d2007-05-12 12:03:26 +00003105 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003106 while (size-- > 0) {
3107 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003108#ifdef Py_UNICODE_WIDE
3109 /* Map 32-bit characters to '\Uxxxxxxxx' */
3110 if (ch >= 0x10000) {
3111 *p++ = '\\';
3112 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003113 *p++ = hexdigits[(ch >> 28) & 0xf];
3114 *p++ = hexdigits[(ch >> 24) & 0xf];
3115 *p++ = hexdigits[(ch >> 20) & 0xf];
3116 *p++ = hexdigits[(ch >> 16) & 0xf];
3117 *p++ = hexdigits[(ch >> 12) & 0xf];
3118 *p++ = hexdigits[(ch >> 8) & 0xf];
3119 *p++ = hexdigits[(ch >> 4) & 0xf];
3120 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003121 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003122 else
3123#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003124 /* Map 16-bit characters to '\uxxxx' */
3125 if (ch >= 256) {
3126 *p++ = '\\';
3127 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003128 *p++ = hexdigits[(ch >> 12) & 0xf];
3129 *p++ = hexdigits[(ch >> 8) & 0xf];
3130 *p++ = hexdigits[(ch >> 4) & 0xf];
3131 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003132 }
3133 /* Copy everything else as-is */
3134 else
3135 *p++ = (char) ch;
3136 }
3137 *p = '\0';
Walter Dörwald711005d2007-05-12 12:03:26 +00003138 if (PyBytes_Resize(repr, p - q)) {
3139 Py_DECREF(repr);
3140 return NULL;
3141 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003142 return repr;
3143}
3144
3145PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3146{
Walter Dörwald711005d2007-05-12 12:03:26 +00003147 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003148 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003149 PyErr_BadArgument();
3150 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003151 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003152 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3153 PyUnicode_GET_SIZE(unicode));
3154
3155 if (!s)
3156 return NULL;
3157 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
3158 PyBytes_GET_SIZE(s));
3159 Py_DECREF(s);
3160 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003161}
3162
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003163/* --- Unicode Internal Codec ------------------------------------------- */
3164
3165PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003166 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003167 const char *errors)
3168{
3169 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003170 Py_ssize_t startinpos;
3171 Py_ssize_t endinpos;
3172 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003173 PyUnicodeObject *v;
3174 Py_UNICODE *p;
3175 const char *end;
3176 const char *reason;
3177 PyObject *errorHandler = NULL;
3178 PyObject *exc = NULL;
3179
Neal Norwitzd43069c2006-01-08 01:12:10 +00003180#ifdef Py_UNICODE_WIDE
3181 Py_UNICODE unimax = PyUnicode_GetMax();
3182#endif
3183
Thomas Wouters89f507f2006-12-13 04:49:30 +00003184 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003185 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3186 if (v == NULL)
3187 goto onError;
3188 if (PyUnicode_GetSize((PyObject *)v) == 0)
3189 return (PyObject *)v;
3190 p = PyUnicode_AS_UNICODE(v);
3191 end = s + size;
3192
3193 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00003194 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003195 /* We have to sanity check the raw data, otherwise doom looms for
3196 some malformed UCS-4 data. */
3197 if (
3198 #ifdef Py_UNICODE_WIDE
3199 *p > unimax || *p < 0 ||
3200 #endif
3201 end-s < Py_UNICODE_SIZE
3202 )
3203 {
3204 startinpos = s - starts;
3205 if (end-s < Py_UNICODE_SIZE) {
3206 endinpos = end-starts;
3207 reason = "truncated input";
3208 }
3209 else {
3210 endinpos = s - starts + Py_UNICODE_SIZE;
3211 reason = "illegal code point (> 0x10FFFF)";
3212 }
3213 outpos = p - PyUnicode_AS_UNICODE(v);
3214 if (unicode_decode_call_errorhandler(
3215 errors, &errorHandler,
3216 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003217 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003218 (PyObject **)&v, &outpos, &p)) {
3219 goto onError;
3220 }
3221 }
3222 else {
3223 p++;
3224 s += Py_UNICODE_SIZE;
3225 }
3226 }
3227
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003228 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003229 goto onError;
3230 Py_XDECREF(errorHandler);
3231 Py_XDECREF(exc);
3232 return (PyObject *)v;
3233
3234 onError:
3235 Py_XDECREF(v);
3236 Py_XDECREF(errorHandler);
3237 Py_XDECREF(exc);
3238 return NULL;
3239}
3240
Guido van Rossumd57fd912000-03-10 22:53:23 +00003241/* --- Latin-1 Codec ------------------------------------------------------ */
3242
3243PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003244 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003245 const char *errors)
3246{
3247 PyUnicodeObject *v;
3248 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003249
Guido van Rossumd57fd912000-03-10 22:53:23 +00003250 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003251 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003252 Py_UNICODE r = *(unsigned char*)s;
3253 return PyUnicode_FromUnicode(&r, 1);
3254 }
3255
Guido van Rossumd57fd912000-03-10 22:53:23 +00003256 v = _PyUnicode_New(size);
3257 if (v == NULL)
3258 goto onError;
3259 if (size == 0)
3260 return (PyObject *)v;
3261 p = PyUnicode_AS_UNICODE(v);
3262 while (size-- > 0)
3263 *p++ = (unsigned char)*s++;
3264 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003265
Guido van Rossumd57fd912000-03-10 22:53:23 +00003266 onError:
3267 Py_XDECREF(v);
3268 return NULL;
3269}
3270
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003271/* create or adjust a UnicodeEncodeError */
3272static void make_encode_exception(PyObject **exceptionObject,
3273 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003274 const Py_UNICODE *unicode, Py_ssize_t size,
3275 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003276 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003277{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003278 if (*exceptionObject == NULL) {
3279 *exceptionObject = PyUnicodeEncodeError_Create(
3280 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003281 }
3282 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003283 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3284 goto onError;
3285 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3286 goto onError;
3287 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3288 goto onError;
3289 return;
3290 onError:
3291 Py_DECREF(*exceptionObject);
3292 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003293 }
3294}
3295
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003296/* raises a UnicodeEncodeError */
3297static void raise_encode_exception(PyObject **exceptionObject,
3298 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003299 const Py_UNICODE *unicode, Py_ssize_t size,
3300 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003301 const char *reason)
3302{
3303 make_encode_exception(exceptionObject,
3304 encoding, unicode, size, startpos, endpos, reason);
3305 if (*exceptionObject != NULL)
3306 PyCodec_StrictErrors(*exceptionObject);
3307}
3308
3309/* error handling callback helper:
3310 build arguments, call the callback and check the arguments,
3311 put the result into newpos and return the replacement string, which
3312 has to be freed by the caller */
3313static PyObject *unicode_encode_call_errorhandler(const char *errors,
3314 PyObject **errorHandler,
3315 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003316 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3317 Py_ssize_t startpos, Py_ssize_t endpos,
3318 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003319{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003320 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003321
3322 PyObject *restuple;
3323 PyObject *resunicode;
3324
3325 if (*errorHandler == NULL) {
3326 *errorHandler = PyCodec_LookupError(errors);
3327 if (*errorHandler == NULL)
3328 return NULL;
3329 }
3330
3331 make_encode_exception(exceptionObject,
3332 encoding, unicode, size, startpos, endpos, reason);
3333 if (*exceptionObject == NULL)
3334 return NULL;
3335
3336 restuple = PyObject_CallFunctionObjArgs(
3337 *errorHandler, *exceptionObject, NULL);
3338 if (restuple == NULL)
3339 return NULL;
3340 if (!PyTuple_Check(restuple)) {
3341 PyErr_Format(PyExc_TypeError, &argparse[4]);
3342 Py_DECREF(restuple);
3343 return NULL;
3344 }
3345 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3346 &resunicode, newpos)) {
3347 Py_DECREF(restuple);
3348 return NULL;
3349 }
3350 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003351 *newpos = size+*newpos;
3352 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003353 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003354 Py_DECREF(restuple);
3355 return NULL;
3356 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003357 Py_INCREF(resunicode);
3358 Py_DECREF(restuple);
3359 return resunicode;
3360}
3361
3362static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003363 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003364 const char *errors,
3365 int limit)
3366{
3367 /* output object */
3368 PyObject *res;
3369 /* pointers to the beginning and end+1 of input */
3370 const Py_UNICODE *startp = p;
3371 const Py_UNICODE *endp = p + size;
3372 /* pointer to the beginning of the unencodable characters */
3373 /* const Py_UNICODE *badp = NULL; */
3374 /* pointer into the output */
3375 char *str;
3376 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003377 Py_ssize_t respos = 0;
3378 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003379 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3380 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003381 PyObject *errorHandler = NULL;
3382 PyObject *exc = NULL;
3383 /* the following variable is used for caching string comparisons
3384 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3385 int known_errorHandler = -1;
3386
3387 /* allocate enough for a simple encoding without
3388 replacements, if we need more, we'll resize */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003389 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003390 if (res == NULL)
3391 goto onError;
3392 if (size == 0)
3393 return res;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003394 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003395 ressize = size;
3396
3397 while (p<endp) {
3398 Py_UNICODE c = *p;
3399
3400 /* can we encode this? */
3401 if (c<limit) {
3402 /* no overflow check, because we know that the space is enough */
3403 *str++ = (char)c;
3404 ++p;
3405 }
3406 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003407 Py_ssize_t unicodepos = p-startp;
3408 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003409 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003410 Py_ssize_t repsize;
3411 Py_ssize_t newpos;
3412 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003413 Py_UNICODE *uni2;
3414 /* startpos for collecting unencodable chars */
3415 const Py_UNICODE *collstart = p;
3416 const Py_UNICODE *collend = p;
3417 /* find all unecodable characters */
3418 while ((collend < endp) && ((*collend)>=limit))
3419 ++collend;
3420 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3421 if (known_errorHandler==-1) {
3422 if ((errors==NULL) || (!strcmp(errors, "strict")))
3423 known_errorHandler = 1;
3424 else if (!strcmp(errors, "replace"))
3425 known_errorHandler = 2;
3426 else if (!strcmp(errors, "ignore"))
3427 known_errorHandler = 3;
3428 else if (!strcmp(errors, "xmlcharrefreplace"))
3429 known_errorHandler = 4;
3430 else
3431 known_errorHandler = 0;
3432 }
3433 switch (known_errorHandler) {
3434 case 1: /* strict */
3435 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3436 goto onError;
3437 case 2: /* replace */
3438 while (collstart++<collend)
3439 *str++ = '?'; /* fall through */
3440 case 3: /* ignore */
3441 p = collend;
3442 break;
3443 case 4: /* xmlcharrefreplace */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003444 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003445 /* determine replacement size (temporarily (mis)uses p) */
3446 for (p = collstart, repsize = 0; p < collend; ++p) {
3447 if (*p<10)
3448 repsize += 2+1+1;
3449 else if (*p<100)
3450 repsize += 2+2+1;
3451 else if (*p<1000)
3452 repsize += 2+3+1;
3453 else if (*p<10000)
3454 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003455#ifndef Py_UNICODE_WIDE
3456 else
3457 repsize += 2+5+1;
3458#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003459 else if (*p<100000)
3460 repsize += 2+5+1;
3461 else if (*p<1000000)
3462 repsize += 2+6+1;
3463 else
3464 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003465#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003466 }
3467 requiredsize = respos+repsize+(endp-collend);
3468 if (requiredsize > ressize) {
3469 if (requiredsize<2*ressize)
3470 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003471 if (PyBytes_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003472 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003473 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003474 ressize = requiredsize;
3475 }
3476 /* generate replacement (temporarily (mis)uses p) */
3477 for (p = collstart; p < collend; ++p) {
3478 str += sprintf(str, "&#%d;", (int)*p);
3479 }
3480 p = collend;
3481 break;
3482 default:
3483 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3484 encoding, reason, startp, size, &exc,
3485 collstart-startp, collend-startp, &newpos);
3486 if (repunicode == NULL)
3487 goto onError;
3488 /* need more space? (at least enough for what we
3489 have+the replacement+the rest of the string, so
3490 we won't have to check space for encodable characters) */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003491 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003492 repsize = PyUnicode_GET_SIZE(repunicode);
3493 requiredsize = respos+repsize+(endp-collend);
3494 if (requiredsize > ressize) {
3495 if (requiredsize<2*ressize)
3496 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003497 if (PyBytes_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003498 Py_DECREF(repunicode);
3499 goto onError;
3500 }
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003501 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003502 ressize = requiredsize;
3503 }
3504 /* check if there is anything unencodable in the replacement
3505 and copy it to the output */
3506 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3507 c = *uni2;
3508 if (c >= limit) {
3509 raise_encode_exception(&exc, encoding, startp, size,
3510 unicodepos, unicodepos+1, reason);
3511 Py_DECREF(repunicode);
3512 goto onError;
3513 }
3514 *str = (char)c;
3515 }
3516 p = startp + newpos;
3517 Py_DECREF(repunicode);
3518 }
3519 }
3520 }
3521 /* Resize if we allocated to much */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003522 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003523 if (respos<ressize)
3524 /* If this falls res will be NULL */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003525 PyBytes_Resize(res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003526 Py_XDECREF(errorHandler);
3527 Py_XDECREF(exc);
3528 return res;
3529
3530 onError:
3531 Py_XDECREF(res);
3532 Py_XDECREF(errorHandler);
3533 Py_XDECREF(exc);
3534 return NULL;
3535}
3536
Guido van Rossumd57fd912000-03-10 22:53:23 +00003537PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003538 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003539 const char *errors)
3540{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003541 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003542}
3543
3544PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3545{
3546 if (!PyUnicode_Check(unicode)) {
3547 PyErr_BadArgument();
3548 return NULL;
3549 }
3550 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3551 PyUnicode_GET_SIZE(unicode),
3552 NULL);
3553}
3554
3555/* --- 7-bit ASCII Codec -------------------------------------------------- */
3556
Guido van Rossumd57fd912000-03-10 22:53:23 +00003557PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003558 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003559 const char *errors)
3560{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003561 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003562 PyUnicodeObject *v;
3563 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003564 Py_ssize_t startinpos;
3565 Py_ssize_t endinpos;
3566 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003567 const char *e;
3568 PyObject *errorHandler = NULL;
3569 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003570
Guido van Rossumd57fd912000-03-10 22:53:23 +00003571 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003572 if (size == 1 && *(unsigned char*)s < 128) {
3573 Py_UNICODE r = *(unsigned char*)s;
3574 return PyUnicode_FromUnicode(&r, 1);
3575 }
Tim Petersced69f82003-09-16 20:30:58 +00003576
Guido van Rossumd57fd912000-03-10 22:53:23 +00003577 v = _PyUnicode_New(size);
3578 if (v == NULL)
3579 goto onError;
3580 if (size == 0)
3581 return (PyObject *)v;
3582 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003583 e = s + size;
3584 while (s < e) {
3585 register unsigned char c = (unsigned char)*s;
3586 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003587 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003588 ++s;
3589 }
3590 else {
3591 startinpos = s-starts;
3592 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003593 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003594 if (unicode_decode_call_errorhandler(
3595 errors, &errorHandler,
3596 "ascii", "ordinal not in range(128)",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003597 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003598 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003599 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003600 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003601 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00003602 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003603 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003604 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003605 Py_XDECREF(errorHandler);
3606 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003607 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003608
Guido van Rossumd57fd912000-03-10 22:53:23 +00003609 onError:
3610 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003611 Py_XDECREF(errorHandler);
3612 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003613 return NULL;
3614}
3615
Guido van Rossumd57fd912000-03-10 22:53:23 +00003616PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003617 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003618 const char *errors)
3619{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003620 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003621}
3622
3623PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3624{
3625 if (!PyUnicode_Check(unicode)) {
3626 PyErr_BadArgument();
3627 return NULL;
3628 }
3629 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3630 PyUnicode_GET_SIZE(unicode),
3631 NULL);
3632}
3633
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003634#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003635
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003636/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003637
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003638#if SIZEOF_INT < SIZEOF_SSIZE_T
3639#define NEED_RETRY
3640#endif
3641
3642/* XXX This code is limited to "true" double-byte encodings, as
3643 a) it assumes an incomplete character consists of a single byte, and
3644 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3645 encodings, see IsDBCSLeadByteEx documentation. */
3646
/* Return non-zero when the byte at s[offset] passes IsDBCSLeadByte() and
   the CharPrev() check below does not identify it as the second byte of
   the preceding character; return 0 otherwise. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
}
3657
3658/*
3659 * Decode MBCS string into unicode object. If 'final' is set, converts
3660 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3661 */
3662static int decode_mbcs(PyUnicodeObject **v,
3663 const char *s, /* MBCS string */
3664 int size, /* sizeof MBCS string */
3665 int final)
3666{
3667 Py_UNICODE *p;
3668 Py_ssize_t n = 0;
3669 int usize = 0;
3670
3671 assert(size >= 0);
3672
3673 /* Skip trailing lead-byte unless 'final' is set */
3674 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3675 --size;
3676
3677 /* First get the size of the result */
3678 if (size > 0) {
3679 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3680 if (usize == 0) {
3681 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3682 return -1;
3683 }
3684 }
3685
3686 if (*v == NULL) {
3687 /* Create unicode object */
3688 *v = _PyUnicode_New(usize);
3689 if (*v == NULL)
3690 return -1;
3691 }
3692 else {
3693 /* Extend unicode object */
3694 n = PyUnicode_GET_SIZE(*v);
3695 if (_PyUnicode_Resize(v, n + usize) < 0)
3696 return -1;
3697 }
3698
3699 /* Do the conversion */
3700 if (size > 0) {
3701 p = PyUnicode_AS_UNICODE(*v) + n;
3702 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3703 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3704 return -1;
3705 }
3706 }
3707
3708 return size;
3709}
3710
3711PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3712 Py_ssize_t size,
3713 const char *errors,
3714 Py_ssize_t *consumed)
3715{
3716 PyUnicodeObject *v = NULL;
3717 int done;
3718
3719 if (consumed)
3720 *consumed = 0;
3721
3722#ifdef NEED_RETRY
3723 retry:
3724 if (size > INT_MAX)
3725 done = decode_mbcs(&v, s, INT_MAX, 0);
3726 else
3727#endif
3728 done = decode_mbcs(&v, s, (int)size, !consumed);
3729
3730 if (done < 0) {
3731 Py_XDECREF(v);
3732 return NULL;
3733 }
3734
3735 if (consumed)
3736 *consumed += done;
3737
3738#ifdef NEED_RETRY
3739 if (size > INT_MAX) {
3740 s += done;
3741 size -= done;
3742 goto retry;
3743 }
3744#endif
3745
3746 return (PyObject *)v;
3747}
3748
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003749PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003750 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003751 const char *errors)
3752{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003753 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3754}
3755
3756/*
3757 * Convert unicode into string object (MBCS).
3758 * Returns 0 if succeed, -1 otherwise.
3759 */
3760static int encode_mbcs(PyObject **repr,
3761 const Py_UNICODE *p, /* unicode */
3762 int size) /* size of unicode */
3763{
3764 int mbcssize = 0;
3765 Py_ssize_t n = 0;
3766
3767 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003768
3769 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003770 if (size > 0) {
3771 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3772 if (mbcssize == 0) {
3773 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3774 return -1;
3775 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003776 }
3777
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003778 if (*repr == NULL) {
3779 /* Create string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003780 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003781 if (*repr == NULL)
3782 return -1;
3783 }
3784 else {
3785 /* Extend string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003786 n = PyBytes_Size(*repr);
3787 if (PyBytes_Resize(*repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003788 return -1;
3789 }
3790
3791 /* Do the conversion */
3792 if (size > 0) {
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003793 char *s = PyBytes_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003794 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3795 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3796 return -1;
3797 }
3798 }
3799
3800 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003801}
3802
3803PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003804 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003805 const char *errors)
3806{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003807 PyObject *repr = NULL;
3808 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003809
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003810#ifdef NEED_RETRY
3811 retry:
3812 if (size > INT_MAX)
3813 ret = encode_mbcs(&repr, p, INT_MAX);
3814 else
3815#endif
3816 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003817
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003818 if (ret < 0) {
3819 Py_XDECREF(repr);
3820 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003821 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003822
3823#ifdef NEED_RETRY
3824 if (size > INT_MAX) {
3825 p += INT_MAX;
3826 size -= INT_MAX;
3827 goto retry;
3828 }
3829#endif
3830
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003831 return repr;
3832}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003833
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003834PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3835{
3836 if (!PyUnicode_Check(unicode)) {
3837 PyErr_BadArgument();
3838 return NULL;
3839 }
3840 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3841 PyUnicode_GET_SIZE(unicode),
3842 NULL);
3843}
3844
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003845#undef NEED_RETRY
3846
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003847#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003848
Guido van Rossumd57fd912000-03-10 22:53:23 +00003849/* --- Character Mapping Codec -------------------------------------------- */
3850
Guido van Rossumd57fd912000-03-10 22:53:23 +00003851PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003852 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003853 PyObject *mapping,
3854 const char *errors)
3855{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003856 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003857 Py_ssize_t startinpos;
3858 Py_ssize_t endinpos;
3859 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003860 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003861 PyUnicodeObject *v;
3862 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003863 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003864 PyObject *errorHandler = NULL;
3865 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003866 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003867 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003868
Guido van Rossumd57fd912000-03-10 22:53:23 +00003869 /* Default to Latin-1 */
3870 if (mapping == NULL)
3871 return PyUnicode_DecodeLatin1(s, size, errors);
3872
3873 v = _PyUnicode_New(size);
3874 if (v == NULL)
3875 goto onError;
3876 if (size == 0)
3877 return (PyObject *)v;
3878 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003879 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003880 if (PyUnicode_CheckExact(mapping)) {
3881 mapstring = PyUnicode_AS_UNICODE(mapping);
3882 maplen = PyUnicode_GET_SIZE(mapping);
3883 while (s < e) {
3884 unsigned char ch = *s;
3885 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003886
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003887 if (ch < maplen)
3888 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003889
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003890 if (x == 0xfffe) {
3891 /* undefined mapping */
3892 outpos = p-PyUnicode_AS_UNICODE(v);
3893 startinpos = s-starts;
3894 endinpos = startinpos+1;
3895 if (unicode_decode_call_errorhandler(
3896 errors, &errorHandler,
3897 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003898 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003899 (PyObject **)&v, &outpos, &p)) {
3900 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003901 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003902 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003903 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003904 *p++ = x;
3905 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003906 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003907 }
3908 else {
3909 while (s < e) {
3910 unsigned char ch = *s;
3911 PyObject *w, *x;
3912
3913 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3914 w = PyInt_FromLong((long)ch);
3915 if (w == NULL)
3916 goto onError;
3917 x = PyObject_GetItem(mapping, w);
3918 Py_DECREF(w);
3919 if (x == NULL) {
3920 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3921 /* No mapping found means: mapping is undefined. */
3922 PyErr_Clear();
3923 x = Py_None;
3924 Py_INCREF(x);
3925 } else
3926 goto onError;
3927 }
3928
3929 /* Apply mapping */
3930 if (PyInt_Check(x)) {
3931 long value = PyInt_AS_LONG(x);
3932 if (value < 0 || value > 65535) {
3933 PyErr_SetString(PyExc_TypeError,
3934 "character mapping must be in range(65536)");
3935 Py_DECREF(x);
3936 goto onError;
3937 }
3938 *p++ = (Py_UNICODE)value;
3939 }
3940 else if (x == Py_None) {
3941 /* undefined mapping */
3942 outpos = p-PyUnicode_AS_UNICODE(v);
3943 startinpos = s-starts;
3944 endinpos = startinpos+1;
3945 if (unicode_decode_call_errorhandler(
3946 errors, &errorHandler,
3947 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003948 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003949 (PyObject **)&v, &outpos, &p)) {
3950 Py_DECREF(x);
3951 goto onError;
3952 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003953 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003954 continue;
3955 }
3956 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003957 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003958
3959 if (targetsize == 1)
3960 /* 1-1 mapping */
3961 *p++ = *PyUnicode_AS_UNICODE(x);
3962
3963 else if (targetsize > 1) {
3964 /* 1-n mapping */
3965 if (targetsize > extrachars) {
3966 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003967 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3968 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003969 (targetsize << 2);
3970 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00003971 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003972 if (_PyUnicode_Resize(&v,
3973 PyUnicode_GET_SIZE(v) + needed) < 0) {
3974 Py_DECREF(x);
3975 goto onError;
3976 }
3977 p = PyUnicode_AS_UNICODE(v) + oldpos;
3978 }
3979 Py_UNICODE_COPY(p,
3980 PyUnicode_AS_UNICODE(x),
3981 targetsize);
3982 p += targetsize;
3983 extrachars -= targetsize;
3984 }
3985 /* 1-0 mapping: skip the character */
3986 }
3987 else {
3988 /* wrong return value */
3989 PyErr_SetString(PyExc_TypeError,
3990 "character mapping must return integer, None or unicode");
3991 Py_DECREF(x);
3992 goto onError;
3993 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003994 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003995 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003996 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003997 }
3998 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003999 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004000 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004001 Py_XDECREF(errorHandler);
4002 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004003 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004004
Guido van Rossumd57fd912000-03-10 22:53:23 +00004005 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004006 Py_XDECREF(errorHandler);
4007 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004008 Py_XDECREF(v);
4009 return NULL;
4010}
4011
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004012/* Charmap encoding: the lookup table */
4013
4014struct encoding_map{
4015 PyObject_HEAD
4016 unsigned char level1[32];
4017 int count2, count3;
4018 unsigned char level23[1];
4019};
4020
4021static PyObject*
4022encoding_map_size(PyObject *obj, PyObject* args)
4023{
4024 struct encoding_map *map = (struct encoding_map*)obj;
4025 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4026 128*map->count3);
4027}
4028
4029static PyMethodDef encoding_map_methods[] = {
4030 {"size", encoding_map_size, METH_NOARGS,
4031 PyDoc_STR("Return the size (in bytes) of this object") },
4032 { 0 }
4033};
4034
4035static void
4036encoding_map_dealloc(PyObject* o)
4037{
4038 PyObject_FREE(o);
4039}
4040
4041static PyTypeObject EncodingMapType = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004042 PyVarObject_HEAD_INIT(NULL, 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004043 "EncodingMap", /*tp_name*/
4044 sizeof(struct encoding_map), /*tp_basicsize*/
4045 0, /*tp_itemsize*/
4046 /* methods */
4047 encoding_map_dealloc, /*tp_dealloc*/
4048 0, /*tp_print*/
4049 0, /*tp_getattr*/
4050 0, /*tp_setattr*/
4051 0, /*tp_compare*/
4052 0, /*tp_repr*/
4053 0, /*tp_as_number*/
4054 0, /*tp_as_sequence*/
4055 0, /*tp_as_mapping*/
4056 0, /*tp_hash*/
4057 0, /*tp_call*/
4058 0, /*tp_str*/
4059 0, /*tp_getattro*/
4060 0, /*tp_setattro*/
4061 0, /*tp_as_buffer*/
4062 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4063 0, /*tp_doc*/
4064 0, /*tp_traverse*/
4065 0, /*tp_clear*/
4066 0, /*tp_richcompare*/
4067 0, /*tp_weaklistoffset*/
4068 0, /*tp_iter*/
4069 0, /*tp_iternext*/
4070 encoding_map_methods, /*tp_methods*/
4071 0, /*tp_members*/
4072 0, /*tp_getset*/
4073 0, /*tp_base*/
4074 0, /*tp_dict*/
4075 0, /*tp_descr_get*/
4076 0, /*tp_descr_set*/
4077 0, /*tp_dictoffset*/
4078 0, /*tp_init*/
4079 0, /*tp_alloc*/
4080 0, /*tp_new*/
4081 0, /*tp_free*/
4082 0, /*tp_is_gc*/
4083};
4084
4085PyObject*
4086PyUnicode_BuildEncodingMap(PyObject* string)
4087{
4088 Py_UNICODE *decode;
4089 PyObject *result;
4090 struct encoding_map *mresult;
4091 int i;
4092 int need_dict = 0;
4093 unsigned char level1[32];
4094 unsigned char level2[512];
4095 unsigned char *mlevel1, *mlevel2, *mlevel3;
4096 int count2 = 0, count3 = 0;
4097
4098 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4099 PyErr_BadArgument();
4100 return NULL;
4101 }
4102 decode = PyUnicode_AS_UNICODE(string);
4103 memset(level1, 0xFF, sizeof level1);
4104 memset(level2, 0xFF, sizeof level2);
4105
4106 /* If there isn't a one-to-one mapping of NULL to \0,
4107 or if there are non-BMP characters, we need to use
4108 a mapping dictionary. */
4109 if (decode[0] != 0)
4110 need_dict = 1;
4111 for (i = 1; i < 256; i++) {
4112 int l1, l2;
4113 if (decode[i] == 0
4114 #ifdef Py_UNICODE_WIDE
4115 || decode[i] > 0xFFFF
4116 #endif
4117 ) {
4118 need_dict = 1;
4119 break;
4120 }
4121 if (decode[i] == 0xFFFE)
4122 /* unmapped character */
4123 continue;
4124 l1 = decode[i] >> 11;
4125 l2 = decode[i] >> 7;
4126 if (level1[l1] == 0xFF)
4127 level1[l1] = count2++;
4128 if (level2[l2] == 0xFF)
4129 level2[l2] = count3++;
4130 }
4131
4132 if (count2 >= 0xFF || count3 >= 0xFF)
4133 need_dict = 1;
4134
4135 if (need_dict) {
4136 PyObject *result = PyDict_New();
4137 PyObject *key, *value;
4138 if (!result)
4139 return NULL;
4140 for (i = 0; i < 256; i++) {
4141 key = value = NULL;
4142 key = PyInt_FromLong(decode[i]);
4143 value = PyInt_FromLong(i);
4144 if (!key || !value)
4145 goto failed1;
4146 if (PyDict_SetItem(result, key, value) == -1)
4147 goto failed1;
4148 Py_DECREF(key);
4149 Py_DECREF(value);
4150 }
4151 return result;
4152 failed1:
4153 Py_XDECREF(key);
4154 Py_XDECREF(value);
4155 Py_DECREF(result);
4156 return NULL;
4157 }
4158
4159 /* Create a three-level trie */
4160 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4161 16*count2 + 128*count3 - 1);
4162 if (!result)
4163 return PyErr_NoMemory();
4164 PyObject_Init(result, &EncodingMapType);
4165 mresult = (struct encoding_map*)result;
4166 mresult->count2 = count2;
4167 mresult->count3 = count3;
4168 mlevel1 = mresult->level1;
4169 mlevel2 = mresult->level23;
4170 mlevel3 = mresult->level23 + 16*count2;
4171 memcpy(mlevel1, level1, 32);
4172 memset(mlevel2, 0xFF, 16*count2);
4173 memset(mlevel3, 0, 128*count3);
4174 count3 = 0;
4175 for (i = 1; i < 256; i++) {
4176 int o1, o2, o3, i2, i3;
4177 if (decode[i] == 0xFFFE)
4178 /* unmapped character */
4179 continue;
4180 o1 = decode[i]>>11;
4181 o2 = (decode[i]>>7) & 0xF;
4182 i2 = 16*mlevel1[o1] + o2;
4183 if (mlevel2[i2] == 0xFF)
4184 mlevel2[i2] = count3++;
4185 o3 = decode[i] & 0x7F;
4186 i3 = 128*mlevel2[i2] + o3;
4187 mlevel3[i3] = i;
4188 }
4189 return result;
4190}
4191
4192static int
4193encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4194{
4195 struct encoding_map *map = (struct encoding_map*)mapping;
4196 int l1 = c>>11;
4197 int l2 = (c>>7) & 0xF;
4198 int l3 = c & 0x7F;
4199 int i;
4200
4201#ifdef Py_UNICODE_WIDE
4202 if (c > 0xFFFF) {
4203 return -1;
4204 }
4205#endif
4206 if (c == 0)
4207 return 0;
4208 /* level 1*/
4209 i = map->level1[l1];
4210 if (i == 0xFF) {
4211 return -1;
4212 }
4213 /* level 2*/
4214 i = map->level23[16*i+l2];
4215 if (i == 0xFF) {
4216 return -1;
4217 }
4218 /* level 3 */
4219 i = map->level23[16*map->count2 + 128*i + l3];
4220 if (i == 0) {
4221 return -1;
4222 }
4223 return i;
4224}
4225
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004226/* Lookup the character ch in the mapping. If the character
4227 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004228 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004229static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004230{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004231 PyObject *w = PyInt_FromLong((long)c);
4232 PyObject *x;
4233
4234 if (w == NULL)
4235 return NULL;
4236 x = PyObject_GetItem(mapping, w);
4237 Py_DECREF(w);
4238 if (x == NULL) {
4239 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4240 /* No mapping found means: mapping is undefined. */
4241 PyErr_Clear();
4242 x = Py_None;
4243 Py_INCREF(x);
4244 return x;
4245 } else
4246 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004247 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004248 else if (x == Py_None)
4249 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004250 else if (PyInt_Check(x)) {
4251 long value = PyInt_AS_LONG(x);
4252 if (value < 0 || value > 255) {
4253 PyErr_SetString(PyExc_TypeError,
4254 "character mapping must be in range(256)");
4255 Py_DECREF(x);
4256 return NULL;
4257 }
4258 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004259 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004260 else if (PyString_Check(x))
4261 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004262 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004263 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00004264 PyErr_Format(PyExc_TypeError,
4265 "character mapping must return integer, None or str8, not %.400s",
4266 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004267 Py_DECREF(x);
4268 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004269 }
4270}
4271
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004272static int
Walter Dörwald827b0552007-05-12 13:23:53 +00004273charmapencode_resize(PyObject *outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004274{
Walter Dörwald827b0552007-05-12 13:23:53 +00004275 Py_ssize_t outsize = PyBytes_GET_SIZE( outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004276 /* exponentially overallocate to minimize reallocations */
4277 if (requiredsize < 2*outsize)
4278 requiredsize = 2*outsize;
Walter Dörwald827b0552007-05-12 13:23:53 +00004279 if (PyBytes_Resize(outobj, requiredsize)) {
4280 Py_DECREF(outobj);
4281 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004282 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004283 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004284}
4285
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* lookup or resize failed; an exception is set */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004289/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00004290 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004291 space is available. Return a new reference to the object that
4292 was put in the output buffer, or Py_None, if the mapping was undefined
4293 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004294 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004295static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004296charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Walter Dörwald827b0552007-05-12 13:23:53 +00004297 PyObject *outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004298{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004299 PyObject *rep;
4300 char *outstart;
Walter Dörwald827b0552007-05-12 13:23:53 +00004301 Py_ssize_t outsize = PyBytes_GET_SIZE(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004302
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004303 if (Py_Type(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004304 int res = encoding_map_lookup(c, mapping);
4305 Py_ssize_t requiredsize = *outpos+1;
4306 if (res == -1)
4307 return enc_FAILED;
4308 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004309 if (charmapencode_resize(outobj, outpos, requiredsize))
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004310 return enc_EXCEPTION;
Walter Dörwald827b0552007-05-12 13:23:53 +00004311 outstart = PyBytes_AS_STRING(outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004312 outstart[(*outpos)++] = (char)res;
4313 return enc_SUCCESS;
4314 }
4315
4316 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004317 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004318 return enc_EXCEPTION;
4319 else if (rep==Py_None) {
4320 Py_DECREF(rep);
4321 return enc_FAILED;
4322 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004323 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004324 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004325 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004326 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004327 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004328 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004329 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004330 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004331 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4332 }
4333 else {
4334 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004335 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4336 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004337 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004338 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004339 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004340 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004341 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004342 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004343 memcpy(outstart + *outpos, repchars, repsize);
4344 *outpos += repsize;
4345 }
4346 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004347 Py_DECREF(rep);
4348 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004349}
4350
4351/* handle an error in PyUnicode_EncodeCharmap
4352 Return 0 on success, -1 on error */
4353static
4354int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004355 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004356 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004357 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004358 PyObject *res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004359{
4360 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004361 Py_ssize_t repsize;
4362 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004363 Py_UNICODE *uni2;
4364 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004365 Py_ssize_t collstartpos = *inpos;
4366 Py_ssize_t collendpos = *inpos+1;
4367 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004368 char *encoding = "charmap";
4369 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004370 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004371
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004372 /* find all unencodable characters */
4373 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004374 PyObject *rep;
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004375 if (Py_Type(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004376 int res = encoding_map_lookup(p[collendpos], mapping);
4377 if (res != -1)
4378 break;
4379 ++collendpos;
4380 continue;
4381 }
4382
4383 rep = charmapencode_lookup(p[collendpos], mapping);
4384 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004385 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004386 else if (rep!=Py_None) {
4387 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004388 break;
4389 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004390 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004391 ++collendpos;
4392 }
4393 /* cache callback name lookup
4394 * (if not done yet, i.e. it's the first error) */
4395 if (*known_errorHandler==-1) {
4396 if ((errors==NULL) || (!strcmp(errors, "strict")))
4397 *known_errorHandler = 1;
4398 else if (!strcmp(errors, "replace"))
4399 *known_errorHandler = 2;
4400 else if (!strcmp(errors, "ignore"))
4401 *known_errorHandler = 3;
4402 else if (!strcmp(errors, "xmlcharrefreplace"))
4403 *known_errorHandler = 4;
4404 else
4405 *known_errorHandler = 0;
4406 }
4407 switch (*known_errorHandler) {
4408 case 1: /* strict */
4409 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4410 return -1;
4411 case 2: /* replace */
4412 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4413 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004414 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004415 return -1;
4416 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004417 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004418 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4419 return -1;
4420 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004421 }
4422 /* fall through */
4423 case 3: /* ignore */
4424 *inpos = collendpos;
4425 break;
4426 case 4: /* xmlcharrefreplace */
4427 /* generate replacement (temporarily (mis)uses p) */
4428 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4429 char buffer[2+29+1+1];
4430 char *cp;
4431 sprintf(buffer, "&#%d;", (int)p[collpos]);
4432 for (cp = buffer; *cp; ++cp) {
4433 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004434 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004435 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004436 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004437 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4438 return -1;
4439 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004440 }
4441 }
4442 *inpos = collendpos;
4443 break;
4444 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004445 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004446 encoding, reason, p, size, exceptionObject,
4447 collstartpos, collendpos, &newpos);
4448 if (repunicode == NULL)
4449 return -1;
4450 /* generate replacement */
4451 repsize = PyUnicode_GET_SIZE(repunicode);
4452 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4453 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004454 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004455 return -1;
4456 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004457 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004458 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004459 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4460 return -1;
4461 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004462 }
4463 *inpos = newpos;
4464 Py_DECREF(repunicode);
4465 }
4466 return 0;
4467}
4468
Guido van Rossumd57fd912000-03-10 22:53:23 +00004469PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004470 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004471 PyObject *mapping,
4472 const char *errors)
4473{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004474 /* output object */
4475 PyObject *res = NULL;
4476 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004477 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004478 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004479 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004480 PyObject *errorHandler = NULL;
4481 PyObject *exc = NULL;
4482 /* the following variable is used for caching string comparisons
4483 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4484 * 3=ignore, 4=xmlcharrefreplace */
4485 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004486
4487 /* Default to Latin-1 */
4488 if (mapping == NULL)
4489 return PyUnicode_EncodeLatin1(p, size, errors);
4490
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004491 /* allocate enough for a simple encoding without
4492 replacements, if we need more, we'll resize */
Walter Dörwald827b0552007-05-12 13:23:53 +00004493 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004494 if (res == NULL)
4495 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004496 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004497 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004498
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004499 while (inpos<size) {
4500 /* try to encode it */
Walter Dörwald827b0552007-05-12 13:23:53 +00004501 charmapencode_result x = charmapencode_output(p[inpos], mapping, res, &respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004502 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004503 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004504 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004505 if (charmap_encoding_error(p, size, &inpos, mapping,
4506 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004507 &known_errorHandler, &errorHandler, errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004508 res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004509 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004510 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004511 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004512 else
4513 /* done with this character => adjust input position */
4514 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004515 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004516
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004517 /* Resize if we allocated to much */
Walter Dörwald827b0552007-05-12 13:23:53 +00004518 if (respos<PyBytes_GET_SIZE(res)) {
4519 if (PyBytes_Resize(res, respos))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004520 goto onError;
4521 }
4522 Py_XDECREF(exc);
4523 Py_XDECREF(errorHandler);
4524 return res;
4525
4526 onError:
4527 Py_XDECREF(res);
4528 Py_XDECREF(exc);
4529 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004530 return NULL;
4531}
4532
4533PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4534 PyObject *mapping)
4535{
4536 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4537 PyErr_BadArgument();
4538 return NULL;
4539 }
4540 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4541 PyUnicode_GET_SIZE(unicode),
4542 mapping,
4543 NULL);
4544}
4545
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004546/* create or adjust a UnicodeTranslateError */
4547static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004548 const Py_UNICODE *unicode, Py_ssize_t size,
4549 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004550 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004551{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004552 if (*exceptionObject == NULL) {
4553 *exceptionObject = PyUnicodeTranslateError_Create(
4554 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004555 }
4556 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004557 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4558 goto onError;
4559 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4560 goto onError;
4561 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4562 goto onError;
4563 return;
4564 onError:
4565 Py_DECREF(*exceptionObject);
4566 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004567 }
4568}
4569
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004570/* raises a UnicodeTranslateError */
4571static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004572 const Py_UNICODE *unicode, Py_ssize_t size,
4573 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004574 const char *reason)
4575{
4576 make_translate_exception(exceptionObject,
4577 unicode, size, startpos, endpos, reason);
4578 if (*exceptionObject != NULL)
4579 PyCodec_StrictErrors(*exceptionObject);
4580}
4581
4582/* error handling callback helper:
4583 build arguments, call the callback and check the arguments,
4584 put the result into newpos and return the replacement string, which
4585 has to be freed by the caller */
4586static PyObject *unicode_translate_call_errorhandler(const char *errors,
4587 PyObject **errorHandler,
4588 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004589 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4590 Py_ssize_t startpos, Py_ssize_t endpos,
4591 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004592{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004593 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004594
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004595 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004596 PyObject *restuple;
4597 PyObject *resunicode;
4598
4599 if (*errorHandler == NULL) {
4600 *errorHandler = PyCodec_LookupError(errors);
4601 if (*errorHandler == NULL)
4602 return NULL;
4603 }
4604
4605 make_translate_exception(exceptionObject,
4606 unicode, size, startpos, endpos, reason);
4607 if (*exceptionObject == NULL)
4608 return NULL;
4609
4610 restuple = PyObject_CallFunctionObjArgs(
4611 *errorHandler, *exceptionObject, NULL);
4612 if (restuple == NULL)
4613 return NULL;
4614 if (!PyTuple_Check(restuple)) {
4615 PyErr_Format(PyExc_TypeError, &argparse[4]);
4616 Py_DECREF(restuple);
4617 return NULL;
4618 }
4619 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004620 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004621 Py_DECREF(restuple);
4622 return NULL;
4623 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004624 if (i_newpos<0)
4625 *newpos = size+i_newpos;
4626 else
4627 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004628 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004629 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004630 Py_DECREF(restuple);
4631 return NULL;
4632 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004633 Py_INCREF(resunicode);
4634 Py_DECREF(restuple);
4635 return resunicode;
4636}
4637
4638/* Lookup the character ch in the mapping and put the result in result,
4639 which must be decrefed by the caller.
4640 Return 0 on success, -1 on error */
4641static
4642int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4643{
4644 PyObject *w = PyInt_FromLong((long)c);
4645 PyObject *x;
4646
4647 if (w == NULL)
4648 return -1;
4649 x = PyObject_GetItem(mapping, w);
4650 Py_DECREF(w);
4651 if (x == NULL) {
4652 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4653 /* No mapping found means: use 1:1 mapping. */
4654 PyErr_Clear();
4655 *result = NULL;
4656 return 0;
4657 } else
4658 return -1;
4659 }
4660 else if (x == Py_None) {
4661 *result = x;
4662 return 0;
4663 }
4664 else if (PyInt_Check(x)) {
4665 long value = PyInt_AS_LONG(x);
4666 long max = PyUnicode_GetMax();
4667 if (value < 0 || value > max) {
4668 PyErr_Format(PyExc_TypeError,
4669 "character mapping must be in range(0x%lx)", max+1);
4670 Py_DECREF(x);
4671 return -1;
4672 }
4673 *result = x;
4674 return 0;
4675 }
4676 else if (PyUnicode_Check(x)) {
4677 *result = x;
4678 return 0;
4679 }
4680 else {
4681 /* wrong return value */
4682 PyErr_SetString(PyExc_TypeError,
4683 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004684 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004685 return -1;
4686 }
4687}
4688/* ensure that *outobj is at least requiredsize characters long,
4689if not reallocate and adjust various state variables.
4690Return 0 on success, -1 on error */
4691static
Walter Dörwald4894c302003-10-24 14:25:28 +00004692int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004693 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004694{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004695 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004696 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004697 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004698 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004699 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004700 if (requiredsize < 2 * oldsize)
4701 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004702 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004703 return -1;
4704 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004705 }
4706 return 0;
4707}
4708/* lookup the character, put the result in the output string and adjust
4709 various state variables. Return a new reference to the object that
4710 was put in the output buffer in *result, or Py_None, if the mapping was
4711 undefined (in which case no character was written).
4712 The called must decref result.
4713 Return 0 on success, -1 on error. */
4714static
Walter Dörwald4894c302003-10-24 14:25:28 +00004715int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004716 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004717 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004718{
Walter Dörwald4894c302003-10-24 14:25:28 +00004719 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004720 return -1;
4721 if (*res==NULL) {
4722 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004723 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004724 }
4725 else if (*res==Py_None)
4726 ;
4727 else if (PyInt_Check(*res)) {
4728 /* no overflow check, because we know that the space is enough */
4729 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4730 }
4731 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004732 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004733 if (repsize==1) {
4734 /* no overflow check, because we know that the space is enough */
4735 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4736 }
4737 else if (repsize!=0) {
4738 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004739 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004740 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004741 repsize - 1;
4742 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004743 return -1;
4744 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4745 *outp += repsize;
4746 }
4747 }
4748 else
4749 return -1;
4750 return 0;
4751}
4752
4753PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004754 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004755 PyObject *mapping,
4756 const char *errors)
4757{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004758 /* output object */
4759 PyObject *res = NULL;
4760 /* pointers to the beginning and end+1 of input */
4761 const Py_UNICODE *startp = p;
4762 const Py_UNICODE *endp = p + size;
4763 /* pointer into the output */
4764 Py_UNICODE *str;
4765 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004766 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004767 char *reason = "character maps to <undefined>";
4768 PyObject *errorHandler = NULL;
4769 PyObject *exc = NULL;
4770 /* the following variable is used for caching string comparisons
4771 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4772 * 3=ignore, 4=xmlcharrefreplace */
4773 int known_errorHandler = -1;
4774
Guido van Rossumd57fd912000-03-10 22:53:23 +00004775 if (mapping == NULL) {
4776 PyErr_BadArgument();
4777 return NULL;
4778 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004779
4780 /* allocate enough for a simple 1:1 translation without
4781 replacements, if we need more, we'll resize */
4782 res = PyUnicode_FromUnicode(NULL, size);
4783 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004784 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004785 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004786 return res;
4787 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004788
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004789 while (p<endp) {
4790 /* try to encode it */
4791 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004792 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004793 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004794 goto onError;
4795 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004796 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004797 if (x!=Py_None) /* it worked => adjust input pointer */
4798 ++p;
4799 else { /* untranslatable character */
4800 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004801 Py_ssize_t repsize;
4802 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004803 Py_UNICODE *uni2;
4804 /* startpos for collecting untranslatable chars */
4805 const Py_UNICODE *collstart = p;
4806 const Py_UNICODE *collend = p+1;
4807 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004808
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004809 /* find all untranslatable characters */
4810 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004811 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004812 goto onError;
4813 Py_XDECREF(x);
4814 if (x!=Py_None)
4815 break;
4816 ++collend;
4817 }
4818 /* cache callback name lookup
4819 * (if not done yet, i.e. it's the first error) */
4820 if (known_errorHandler==-1) {
4821 if ((errors==NULL) || (!strcmp(errors, "strict")))
4822 known_errorHandler = 1;
4823 else if (!strcmp(errors, "replace"))
4824 known_errorHandler = 2;
4825 else if (!strcmp(errors, "ignore"))
4826 known_errorHandler = 3;
4827 else if (!strcmp(errors, "xmlcharrefreplace"))
4828 known_errorHandler = 4;
4829 else
4830 known_errorHandler = 0;
4831 }
4832 switch (known_errorHandler) {
4833 case 1: /* strict */
4834 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4835 goto onError;
4836 case 2: /* replace */
4837 /* No need to check for space, this is a 1:1 replacement */
4838 for (coll = collstart; coll<collend; ++coll)
4839 *str++ = '?';
4840 /* fall through */
4841 case 3: /* ignore */
4842 p = collend;
4843 break;
4844 case 4: /* xmlcharrefreplace */
4845 /* generate replacement (temporarily (mis)uses p) */
4846 for (p = collstart; p < collend; ++p) {
4847 char buffer[2+29+1+1];
4848 char *cp;
4849 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004850 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004851 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4852 goto onError;
4853 for (cp = buffer; *cp; ++cp)
4854 *str++ = *cp;
4855 }
4856 p = collend;
4857 break;
4858 default:
4859 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4860 reason, startp, size, &exc,
4861 collstart-startp, collend-startp, &newpos);
4862 if (repunicode == NULL)
4863 goto onError;
4864 /* generate replacement */
4865 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004866 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004867 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4868 Py_DECREF(repunicode);
4869 goto onError;
4870 }
4871 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4872 *str++ = *uni2;
4873 p = startp + newpos;
4874 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004875 }
4876 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004877 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004878 /* Resize if we allocated to much */
4879 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004880 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004881 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004882 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004883 }
4884 Py_XDECREF(exc);
4885 Py_XDECREF(errorHandler);
4886 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004887
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004888 onError:
4889 Py_XDECREF(res);
4890 Py_XDECREF(exc);
4891 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004892 return NULL;
4893}
4894
4895PyObject *PyUnicode_Translate(PyObject *str,
4896 PyObject *mapping,
4897 const char *errors)
4898{
4899 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004900
Guido van Rossumd57fd912000-03-10 22:53:23 +00004901 str = PyUnicode_FromObject(str);
4902 if (str == NULL)
4903 goto onError;
4904 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4905 PyUnicode_GET_SIZE(str),
4906 mapping,
4907 errors);
4908 Py_DECREF(str);
4909 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004910
Guido van Rossumd57fd912000-03-10 22:53:23 +00004911 onError:
4912 Py_XDECREF(str);
4913 return NULL;
4914}
Tim Petersced69f82003-09-16 20:30:58 +00004915
Guido van Rossum9e896b32000-04-05 20:11:21 +00004916/* --- Decimal Encoder ---------------------------------------------------- */
4917
4918int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004919 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004920 char *output,
4921 const char *errors)
4922{
4923 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004924 PyObject *errorHandler = NULL;
4925 PyObject *exc = NULL;
4926 const char *encoding = "decimal";
4927 const char *reason = "invalid decimal Unicode string";
4928 /* the following variable is used for caching string comparisons
4929 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4930 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004931
4932 if (output == NULL) {
4933 PyErr_BadArgument();
4934 return -1;
4935 }
4936
4937 p = s;
4938 end = s + length;
4939 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004940 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004941 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004942 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004943 Py_ssize_t repsize;
4944 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004945 Py_UNICODE *uni2;
4946 Py_UNICODE *collstart;
4947 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004948
Guido van Rossum9e896b32000-04-05 20:11:21 +00004949 if (Py_UNICODE_ISSPACE(ch)) {
4950 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004951 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004952 continue;
4953 }
4954 decimal = Py_UNICODE_TODECIMAL(ch);
4955 if (decimal >= 0) {
4956 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004957 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004958 continue;
4959 }
Guido van Rossumba477042000-04-06 18:18:10 +00004960 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004961 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004962 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004963 continue;
4964 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004965 /* All other characters are considered unencodable */
4966 collstart = p;
4967 collend = p+1;
4968 while (collend < end) {
4969 if ((0 < *collend && *collend < 256) ||
4970 !Py_UNICODE_ISSPACE(*collend) ||
4971 Py_UNICODE_TODECIMAL(*collend))
4972 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004973 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004974 /* cache callback name lookup
4975 * (if not done yet, i.e. it's the first error) */
4976 if (known_errorHandler==-1) {
4977 if ((errors==NULL) || (!strcmp(errors, "strict")))
4978 known_errorHandler = 1;
4979 else if (!strcmp(errors, "replace"))
4980 known_errorHandler = 2;
4981 else if (!strcmp(errors, "ignore"))
4982 known_errorHandler = 3;
4983 else if (!strcmp(errors, "xmlcharrefreplace"))
4984 known_errorHandler = 4;
4985 else
4986 known_errorHandler = 0;
4987 }
4988 switch (known_errorHandler) {
4989 case 1: /* strict */
4990 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4991 goto onError;
4992 case 2: /* replace */
4993 for (p = collstart; p < collend; ++p)
4994 *output++ = '?';
4995 /* fall through */
4996 case 3: /* ignore */
4997 p = collend;
4998 break;
4999 case 4: /* xmlcharrefreplace */
5000 /* generate replacement (temporarily (mis)uses p) */
5001 for (p = collstart; p < collend; ++p)
5002 output += sprintf(output, "&#%d;", (int)*p);
5003 p = collend;
5004 break;
5005 default:
5006 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5007 encoding, reason, s, length, &exc,
5008 collstart-s, collend-s, &newpos);
5009 if (repunicode == NULL)
5010 goto onError;
5011 /* generate replacement */
5012 repsize = PyUnicode_GET_SIZE(repunicode);
5013 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5014 Py_UNICODE ch = *uni2;
5015 if (Py_UNICODE_ISSPACE(ch))
5016 *output++ = ' ';
5017 else {
5018 decimal = Py_UNICODE_TODECIMAL(ch);
5019 if (decimal >= 0)
5020 *output++ = '0' + decimal;
5021 else if (0 < ch && ch < 256)
5022 *output++ = (char)ch;
5023 else {
5024 Py_DECREF(repunicode);
5025 raise_encode_exception(&exc, encoding,
5026 s, length, collstart-s, collend-s, reason);
5027 goto onError;
5028 }
5029 }
5030 }
5031 p = s + newpos;
5032 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005033 }
5034 }
5035 /* 0-terminate the output string */
5036 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005037 Py_XDECREF(exc);
5038 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005039 return 0;
5040
5041 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005042 Py_XDECREF(exc);
5043 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005044 return -1;
5045}
5046
Guido van Rossumd57fd912000-03-10 22:53:23 +00005047/* --- Helpers ------------------------------------------------------------ */
5048
Eric Smith8c663262007-08-25 02:26:07 +00005049#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005050
5051#include "stringlib/fastsearch.h"
5052
5053#include "stringlib/count.h"
5054#include "stringlib/find.h"
5055#include "stringlib/partition.h"
5056
/* Helper macro to fix up slice values.  It operates on the variables
   `start` and `end` of the enclosing scope: a negative value is taken
   relative to (obj)->length and then clamped at 0, and `end` is capped
   at (obj)->length.
   Wrapped in do { } while (0) so that it expands to a single statement
   and is safe as the body of an unbraced if/else; invoke it with a
   trailing semicolon. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
5068 end = 0;
5069
Martin v. Löwis18e16552006-02-15 17:27:45 +00005070Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005071 PyObject *substr,
5072 Py_ssize_t start,
5073 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005074{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005075 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005076 PyUnicodeObject* str_obj;
5077 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005078
Thomas Wouters477c8d52006-05-27 19:21:47 +00005079 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5080 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005081 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005082 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5083 if (!sub_obj) {
5084 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005085 return -1;
5086 }
Tim Petersced69f82003-09-16 20:30:58 +00005087
Thomas Wouters477c8d52006-05-27 19:21:47 +00005088 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005089
Thomas Wouters477c8d52006-05-27 19:21:47 +00005090 result = stringlib_count(
5091 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5092 );
5093
5094 Py_DECREF(sub_obj);
5095 Py_DECREF(str_obj);
5096
Guido van Rossumd57fd912000-03-10 22:53:23 +00005097 return result;
5098}
5099
Martin v. Löwis18e16552006-02-15 17:27:45 +00005100Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005101 PyObject *sub,
5102 Py_ssize_t start,
5103 Py_ssize_t end,
5104 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005105{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005106 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005107
Guido van Rossumd57fd912000-03-10 22:53:23 +00005108 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005109 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005110 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005111 sub = PyUnicode_FromObject(sub);
5112 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00005113 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005114 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005115 }
Tim Petersced69f82003-09-16 20:30:58 +00005116
Thomas Wouters477c8d52006-05-27 19:21:47 +00005117 if (direction > 0)
5118 result = stringlib_find_slice(
5119 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5120 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5121 start, end
5122 );
5123 else
5124 result = stringlib_rfind_slice(
5125 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5126 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5127 start, end
5128 );
5129
Guido van Rossumd57fd912000-03-10 22:53:23 +00005130 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005131 Py_DECREF(sub);
5132
Guido van Rossumd57fd912000-03-10 22:53:23 +00005133 return result;
5134}
5135
Tim Petersced69f82003-09-16 20:30:58 +00005136static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005137int tailmatch(PyUnicodeObject *self,
5138 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005139 Py_ssize_t start,
5140 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005141 int direction)
5142{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005143 if (substring->length == 0)
5144 return 1;
5145
Thomas Wouters477c8d52006-05-27 19:21:47 +00005146 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005147
5148 end -= substring->length;
5149 if (end < start)
5150 return 0;
5151
5152 if (direction > 0) {
5153 if (Py_UNICODE_MATCH(self, end, substring))
5154 return 1;
5155 } else {
5156 if (Py_UNICODE_MATCH(self, start, substring))
5157 return 1;
5158 }
5159
5160 return 0;
5161}
5162
Martin v. Löwis18e16552006-02-15 17:27:45 +00005163Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005164 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005165 Py_ssize_t start,
5166 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005167 int direction)
5168{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005169 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005170
Guido van Rossumd57fd912000-03-10 22:53:23 +00005171 str = PyUnicode_FromObject(str);
5172 if (str == NULL)
5173 return -1;
5174 substr = PyUnicode_FromObject(substr);
5175 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005176 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005177 return -1;
5178 }
Tim Petersced69f82003-09-16 20:30:58 +00005179
Guido van Rossumd57fd912000-03-10 22:53:23 +00005180 result = tailmatch((PyUnicodeObject *)str,
5181 (PyUnicodeObject *)substr,
5182 start, end, direction);
5183 Py_DECREF(str);
5184 Py_DECREF(substr);
5185 return result;
5186}
5187
Guido van Rossumd57fd912000-03-10 22:53:23 +00005188/* Apply fixfct filter to the Unicode object self and return a
5189 reference to the modified object */
5190
Tim Petersced69f82003-09-16 20:30:58 +00005191static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005192PyObject *fixup(PyUnicodeObject *self,
5193 int (*fixfct)(PyUnicodeObject *s))
5194{
5195
5196 PyUnicodeObject *u;
5197
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005198 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005199 if (u == NULL)
5200 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005201
5202 Py_UNICODE_COPY(u->str, self->str, self->length);
5203
Tim Peters7a29bd52001-09-12 03:03:31 +00005204 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005205 /* fixfct should return TRUE if it modified the buffer. If
5206 FALSE, return a reference to the original buffer instead
5207 (to save space, not time) */
5208 Py_INCREF(self);
5209 Py_DECREF(u);
5210 return (PyObject*) self;
5211 }
5212 return (PyObject*) u;
5213}
5214
Tim Petersced69f82003-09-16 20:30:58 +00005215static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005216int fixupper(PyUnicodeObject *self)
5217{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005218 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005219 Py_UNICODE *s = self->str;
5220 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005221
Guido van Rossumd57fd912000-03-10 22:53:23 +00005222 while (len-- > 0) {
5223 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005224
Guido van Rossumd57fd912000-03-10 22:53:23 +00005225 ch = Py_UNICODE_TOUPPER(*s);
5226 if (ch != *s) {
5227 status = 1;
5228 *s = ch;
5229 }
5230 s++;
5231 }
5232
5233 return status;
5234}
5235
Tim Petersced69f82003-09-16 20:30:58 +00005236static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005237int fixlower(PyUnicodeObject *self)
5238{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005239 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005240 Py_UNICODE *s = self->str;
5241 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005242
Guido van Rossumd57fd912000-03-10 22:53:23 +00005243 while (len-- > 0) {
5244 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005245
Guido van Rossumd57fd912000-03-10 22:53:23 +00005246 ch = Py_UNICODE_TOLOWER(*s);
5247 if (ch != *s) {
5248 status = 1;
5249 *s = ch;
5250 }
5251 s++;
5252 }
5253
5254 return status;
5255}
5256
Tim Petersced69f82003-09-16 20:30:58 +00005257static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005258int fixswapcase(PyUnicodeObject *self)
5259{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005260 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005261 Py_UNICODE *s = self->str;
5262 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005263
Guido van Rossumd57fd912000-03-10 22:53:23 +00005264 while (len-- > 0) {
5265 if (Py_UNICODE_ISUPPER(*s)) {
5266 *s = Py_UNICODE_TOLOWER(*s);
5267 status = 1;
5268 } else if (Py_UNICODE_ISLOWER(*s)) {
5269 *s = Py_UNICODE_TOUPPER(*s);
5270 status = 1;
5271 }
5272 s++;
5273 }
5274
5275 return status;
5276}
5277
Tim Petersced69f82003-09-16 20:30:58 +00005278static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005279int fixcapitalize(PyUnicodeObject *self)
5280{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005281 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005282 Py_UNICODE *s = self->str;
5283 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005284
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005285 if (len == 0)
5286 return 0;
5287 if (Py_UNICODE_ISLOWER(*s)) {
5288 *s = Py_UNICODE_TOUPPER(*s);
5289 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005290 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005291 s++;
5292 while (--len > 0) {
5293 if (Py_UNICODE_ISUPPER(*s)) {
5294 *s = Py_UNICODE_TOLOWER(*s);
5295 status = 1;
5296 }
5297 s++;
5298 }
5299 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005300}
5301
5302static
5303int fixtitle(PyUnicodeObject *self)
5304{
5305 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5306 register Py_UNICODE *e;
5307 int previous_is_cased;
5308
5309 /* Shortcut for single character strings */
5310 if (PyUnicode_GET_SIZE(self) == 1) {
5311 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5312 if (*p != ch) {
5313 *p = ch;
5314 return 1;
5315 }
5316 else
5317 return 0;
5318 }
Tim Petersced69f82003-09-16 20:30:58 +00005319
Guido van Rossumd57fd912000-03-10 22:53:23 +00005320 e = p + PyUnicode_GET_SIZE(self);
5321 previous_is_cased = 0;
5322 for (; p < e; p++) {
5323 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005324
Guido van Rossumd57fd912000-03-10 22:53:23 +00005325 if (previous_is_cased)
5326 *p = Py_UNICODE_TOLOWER(ch);
5327 else
5328 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005329
5330 if (Py_UNICODE_ISLOWER(ch) ||
5331 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005332 Py_UNICODE_ISTITLE(ch))
5333 previous_is_cased = 1;
5334 else
5335 previous_is_cased = 0;
5336 }
5337 return 1;
5338}
5339
Tim Peters8ce9f162004-08-27 01:49:32 +00005340PyObject *
5341PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005342{
Tim Peters8ce9f162004-08-27 01:49:32 +00005343 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005344 const Py_UNICODE blank = ' ';
5345 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005346 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005347 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005348 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5349 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005350 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5351 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005352 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005353 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005354 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005355
Tim Peters05eba1f2004-08-27 21:32:02 +00005356 fseq = PySequence_Fast(seq, "");
5357 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005358 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005359 }
5360
Tim Peters91879ab2004-08-27 22:35:44 +00005361 /* Grrrr. A codec may be invoked to convert str objects to
5362 * Unicode, and so it's possible to call back into Python code
5363 * during PyUnicode_FromObject(), and so it's possible for a sick
5364 * codec to change the size of fseq (if seq is a list). Therefore
5365 * we have to keep refetching the size -- can't assume seqlen
5366 * is invariant.
5367 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005368 seqlen = PySequence_Fast_GET_SIZE(fseq);
5369 /* If empty sequence, return u"". */
5370 if (seqlen == 0) {
5371 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5372 goto Done;
5373 }
5374 /* If singleton sequence with an exact Unicode, return that. */
5375 if (seqlen == 1) {
5376 item = PySequence_Fast_GET_ITEM(fseq, 0);
5377 if (PyUnicode_CheckExact(item)) {
5378 Py_INCREF(item);
5379 res = (PyUnicodeObject *)item;
5380 goto Done;
5381 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005382 }
5383
Tim Peters05eba1f2004-08-27 21:32:02 +00005384 /* At least two items to join, or one that isn't exact Unicode. */
5385 if (seqlen > 1) {
5386 /* Set up sep and seplen -- they're needed. */
5387 if (separator == NULL) {
5388 sep = &blank;
5389 seplen = 1;
5390 }
5391 else {
5392 internal_separator = PyUnicode_FromObject(separator);
5393 if (internal_separator == NULL)
5394 goto onError;
5395 sep = PyUnicode_AS_UNICODE(internal_separator);
5396 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00005397 /* In case PyUnicode_FromObject() mutated seq. */
5398 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005399 }
5400 }
5401
5402 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005403 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005404 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005405 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005406 res_p = PyUnicode_AS_UNICODE(res);
5407 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005408
Tim Peters05eba1f2004-08-27 21:32:02 +00005409 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005410 Py_ssize_t itemlen;
5411 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005412
5413 item = PySequence_Fast_GET_ITEM(fseq, i);
5414 /* Convert item to Unicode. */
5415 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5416 PyErr_Format(PyExc_TypeError,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005417 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00005418 " %.80s found",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00005419 i, Py_Type(item)->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00005420 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005421 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005422 item = PyUnicode_FromObject(item);
5423 if (item == NULL)
5424 goto onError;
5425 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005426
Tim Peters91879ab2004-08-27 22:35:44 +00005427 /* In case PyUnicode_FromObject() mutated seq. */
5428 seqlen = PySequence_Fast_GET_SIZE(fseq);
5429
Tim Peters8ce9f162004-08-27 01:49:32 +00005430 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005431 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005432 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005433 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005434 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005435 if (i < seqlen - 1) {
5436 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005437 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00005438 goto Overflow;
5439 }
5440 if (new_res_used > res_alloc) {
5441 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00005442 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00005443 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005444 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005445 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005446 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005447 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005448 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005449 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005450 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005451 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005452 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005453
5454 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005455 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005456 res_p += itemlen;
5457 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005458 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005459 res_p += seplen;
5460 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005461 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005462 res_used = new_res_used;
5463 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005464
Tim Peters05eba1f2004-08-27 21:32:02 +00005465 /* Shrink res to match the used area; this probably can't fail,
5466 * but it's cheap to check.
5467 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005468 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005469 goto onError;
5470
5471 Done:
5472 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005473 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005474 return (PyObject *)res;
5475
Tim Peters8ce9f162004-08-27 01:49:32 +00005476 Overflow:
5477 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005478 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005479 Py_DECREF(item);
5480 /* fall through */
5481
Guido van Rossumd57fd912000-03-10 22:53:23 +00005482 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005483 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005484 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005485 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005486 return NULL;
5487}
5488
Tim Petersced69f82003-09-16 20:30:58 +00005489static
5490PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005491 Py_ssize_t left,
5492 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005493 Py_UNICODE fill)
5494{
5495 PyUnicodeObject *u;
5496
5497 if (left < 0)
5498 left = 0;
5499 if (right < 0)
5500 right = 0;
5501
Tim Peters7a29bd52001-09-12 03:03:31 +00005502 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005503 Py_INCREF(self);
5504 return self;
5505 }
5506
5507 u = _PyUnicode_New(left + self->length + right);
5508 if (u) {
5509 if (left)
5510 Py_UNICODE_FILL(u->str, fill, left);
5511 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5512 if (right)
5513 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5514 }
5515
5516 return u;
5517}
5518
/* Append the slice data[left:right] to `list` as a new unicode object.

   Relies on the enclosing function providing a `PyObject *str` scratch
   variable, a `list` variable and an `onError` label; control jumps to
   onError if creating the slice or appending it fails.  The temporary
   reference held in `str` is always released.

   The body is wrapped in do { ... } while (0) so the macro expands to
   exactly one statement and is safe inside an unbraced if/else. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
5529
5530static
5531PyObject *split_whitespace(PyUnicodeObject *self,
5532 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005533 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005534{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005535 register Py_ssize_t i;
5536 register Py_ssize_t j;
5537 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005538 PyObject *str;
5539
5540 for (i = j = 0; i < len; ) {
5541 /* find a token */
5542 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5543 i++;
5544 j = i;
5545 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
5546 i++;
5547 if (j < i) {
5548 if (maxcount-- <= 0)
5549 break;
5550 SPLIT_APPEND(self->str, j, i);
5551 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5552 i++;
5553 j = i;
5554 }
5555 }
5556 if (j < len) {
5557 SPLIT_APPEND(self->str, j, len);
5558 }
5559 return list;
5560
5561 onError:
5562 Py_DECREF(list);
5563 return NULL;
5564}
5565
5566PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005567 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005568{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005569 register Py_ssize_t i;
5570 register Py_ssize_t j;
5571 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005572 PyObject *list;
5573 PyObject *str;
5574 Py_UNICODE *data;
5575
5576 string = PyUnicode_FromObject(string);
5577 if (string == NULL)
5578 return NULL;
5579 data = PyUnicode_AS_UNICODE(string);
5580 len = PyUnicode_GET_SIZE(string);
5581
Guido van Rossumd57fd912000-03-10 22:53:23 +00005582 list = PyList_New(0);
5583 if (!list)
5584 goto onError;
5585
5586 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005587 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005588
Guido van Rossumd57fd912000-03-10 22:53:23 +00005589 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005590 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005591 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005592
5593 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005594 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005595 if (i < len) {
5596 if (data[i] == '\r' && i + 1 < len &&
5597 data[i+1] == '\n')
5598 i += 2;
5599 else
5600 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005601 if (keepends)
5602 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005603 }
Guido van Rossum86662912000-04-11 15:38:46 +00005604 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005605 j = i;
5606 }
5607 if (j < len) {
5608 SPLIT_APPEND(data, j, len);
5609 }
5610
5611 Py_DECREF(string);
5612 return list;
5613
5614 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005615 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005616 Py_DECREF(string);
5617 return NULL;
5618}
5619
Tim Petersced69f82003-09-16 20:30:58 +00005620static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005621PyObject *split_char(PyUnicodeObject *self,
5622 PyObject *list,
5623 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005624 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005625{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005626 register Py_ssize_t i;
5627 register Py_ssize_t j;
5628 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005629 PyObject *str;
5630
5631 for (i = j = 0; i < len; ) {
5632 if (self->str[i] == ch) {
5633 if (maxcount-- <= 0)
5634 break;
5635 SPLIT_APPEND(self->str, j, i);
5636 i = j = i + 1;
5637 } else
5638 i++;
5639 }
5640 if (j <= len) {
5641 SPLIT_APPEND(self->str, j, len);
5642 }
5643 return list;
5644
5645 onError:
5646 Py_DECREF(list);
5647 return NULL;
5648}
5649
Tim Petersced69f82003-09-16 20:30:58 +00005650static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005651PyObject *split_substring(PyUnicodeObject *self,
5652 PyObject *list,
5653 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005654 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005655{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005656 register Py_ssize_t i;
5657 register Py_ssize_t j;
5658 Py_ssize_t len = self->length;
5659 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005660 PyObject *str;
5661
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005662 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005663 if (Py_UNICODE_MATCH(self, i, substring)) {
5664 if (maxcount-- <= 0)
5665 break;
5666 SPLIT_APPEND(self->str, j, i);
5667 i = j = i + sublen;
5668 } else
5669 i++;
5670 }
5671 if (j <= len) {
5672 SPLIT_APPEND(self->str, j, len);
5673 }
5674 return list;
5675
5676 onError:
5677 Py_DECREF(list);
5678 return NULL;
5679}
5680
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005681static
5682PyObject *rsplit_whitespace(PyUnicodeObject *self,
5683 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005684 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005685{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005686 register Py_ssize_t i;
5687 register Py_ssize_t j;
5688 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005689 PyObject *str;
5690
5691 for (i = j = len - 1; i >= 0; ) {
5692 /* find a token */
5693 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5694 i--;
5695 j = i;
5696 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
5697 i--;
5698 if (j > i) {
5699 if (maxcount-- <= 0)
5700 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005701 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005702 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5703 i--;
5704 j = i;
5705 }
5706 }
5707 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005708 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005709 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005710 if (PyList_Reverse(list) < 0)
5711 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005712 return list;
5713
5714 onError:
5715 Py_DECREF(list);
5716 return NULL;
5717}
5718
5719static
5720PyObject *rsplit_char(PyUnicodeObject *self,
5721 PyObject *list,
5722 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005723 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005724{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005725 register Py_ssize_t i;
5726 register Py_ssize_t j;
5727 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005728 PyObject *str;
5729
5730 for (i = j = len - 1; i >= 0; ) {
5731 if (self->str[i] == ch) {
5732 if (maxcount-- <= 0)
5733 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005734 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005735 j = i = i - 1;
5736 } else
5737 i--;
5738 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005739 if (j >= -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005740 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005741 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005742 if (PyList_Reverse(list) < 0)
5743 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005744 return list;
5745
5746 onError:
5747 Py_DECREF(list);
5748 return NULL;
5749}
5750
5751static
5752PyObject *rsplit_substring(PyUnicodeObject *self,
5753 PyObject *list,
5754 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005755 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005756{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005757 register Py_ssize_t i;
5758 register Py_ssize_t j;
5759 Py_ssize_t len = self->length;
5760 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005761 PyObject *str;
5762
5763 for (i = len - sublen, j = len; i >= 0; ) {
5764 if (Py_UNICODE_MATCH(self, i, substring)) {
5765 if (maxcount-- <= 0)
5766 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005767 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005768 j = i;
5769 i -= sublen;
5770 } else
5771 i--;
5772 }
5773 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005774 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005775 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005776 if (PyList_Reverse(list) < 0)
5777 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005778 return list;
5779
5780 onError:
5781 Py_DECREF(list);
5782 return NULL;
5783}
5784
Guido van Rossumd57fd912000-03-10 22:53:23 +00005785#undef SPLIT_APPEND
5786
5787static
5788PyObject *split(PyUnicodeObject *self,
5789 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005790 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005791{
5792 PyObject *list;
5793
5794 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005795 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005796
5797 list = PyList_New(0);
5798 if (!list)
5799 return NULL;
5800
5801 if (substring == NULL)
5802 return split_whitespace(self,list,maxcount);
5803
5804 else if (substring->length == 1)
5805 return split_char(self,list,substring->str[0],maxcount);
5806
5807 else if (substring->length == 0) {
5808 Py_DECREF(list);
5809 PyErr_SetString(PyExc_ValueError, "empty separator");
5810 return NULL;
5811 }
5812 else
5813 return split_substring(self,list,substring,maxcount);
5814}
5815
Tim Petersced69f82003-09-16 20:30:58 +00005816static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005817PyObject *rsplit(PyUnicodeObject *self,
5818 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005819 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005820{
5821 PyObject *list;
5822
5823 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005824 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005825
5826 list = PyList_New(0);
5827 if (!list)
5828 return NULL;
5829
5830 if (substring == NULL)
5831 return rsplit_whitespace(self,list,maxcount);
5832
5833 else if (substring->length == 1)
5834 return rsplit_char(self,list,substring->str[0],maxcount);
5835
5836 else if (substring->length == 0) {
5837 Py_DECREF(list);
5838 PyErr_SetString(PyExc_ValueError, "empty separator");
5839 return NULL;
5840 }
5841 else
5842 return rsplit_substring(self,list,substring,maxcount);
5843}
5844
5845static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005846PyObject *replace(PyUnicodeObject *self,
5847 PyUnicodeObject *str1,
5848 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005849 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005850{
5851 PyUnicodeObject *u;
5852
5853 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005854 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005855
Thomas Wouters477c8d52006-05-27 19:21:47 +00005856 if (str1->length == str2->length) {
5857 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005858 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005859 if (str1->length == 1) {
5860 /* replace characters */
5861 Py_UNICODE u1, u2;
5862 if (!findchar(self->str, self->length, str1->str[0]))
5863 goto nothing;
5864 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5865 if (!u)
5866 return NULL;
5867 Py_UNICODE_COPY(u->str, self->str, self->length);
5868 u1 = str1->str[0];
5869 u2 = str2->str[0];
5870 for (i = 0; i < u->length; i++)
5871 if (u->str[i] == u1) {
5872 if (--maxcount < 0)
5873 break;
5874 u->str[i] = u2;
5875 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005876 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005877 i = fastsearch(
5878 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005879 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00005880 if (i < 0)
5881 goto nothing;
5882 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5883 if (!u)
5884 return NULL;
5885 Py_UNICODE_COPY(u->str, self->str, self->length);
5886 while (i <= self->length - str1->length)
5887 if (Py_UNICODE_MATCH(self, i, str1)) {
5888 if (--maxcount < 0)
5889 break;
5890 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5891 i += str1->length;
5892 } else
5893 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005894 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005895 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005896
5897 Py_ssize_t n, i, j, e;
5898 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005899 Py_UNICODE *p;
5900
5901 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005902 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005903 if (n > maxcount)
5904 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005905 if (n == 0)
5906 goto nothing;
5907 /* new_size = self->length + n * (str2->length - str1->length)); */
5908 delta = (str2->length - str1->length);
5909 if (delta == 0) {
5910 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005911 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005912 product = n * (str2->length - str1->length);
5913 if ((product / (str2->length - str1->length)) != n) {
5914 PyErr_SetString(PyExc_OverflowError,
5915 "replace string is too long");
5916 return NULL;
5917 }
5918 new_size = self->length + product;
5919 if (new_size < 0) {
5920 PyErr_SetString(PyExc_OverflowError,
5921 "replace string is too long");
5922 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005923 }
5924 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005925 u = _PyUnicode_New(new_size);
5926 if (!u)
5927 return NULL;
5928 i = 0;
5929 p = u->str;
5930 e = self->length - str1->length;
5931 if (str1->length > 0) {
5932 while (n-- > 0) {
5933 /* look for next match */
5934 j = i;
5935 while (j <= e) {
5936 if (Py_UNICODE_MATCH(self, j, str1))
5937 break;
5938 j++;
5939 }
5940 if (j > i) {
5941 if (j > e)
5942 break;
5943 /* copy unchanged part [i:j] */
5944 Py_UNICODE_COPY(p, self->str+i, j-i);
5945 p += j - i;
5946 }
5947 /* copy substitution string */
5948 if (str2->length > 0) {
5949 Py_UNICODE_COPY(p, str2->str, str2->length);
5950 p += str2->length;
5951 }
5952 i = j + str1->length;
5953 }
5954 if (i < self->length)
5955 /* copy tail [i:] */
5956 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5957 } else {
5958 /* interleave */
5959 while (n > 0) {
5960 Py_UNICODE_COPY(p, str2->str, str2->length);
5961 p += str2->length;
5962 if (--n <= 0)
5963 break;
5964 *p++ = self->str[i++];
5965 }
5966 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5967 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005968 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005969 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005970
5971nothing:
5972 /* nothing to replace; return original string (when possible) */
5973 if (PyUnicode_CheckExact(self)) {
5974 Py_INCREF(self);
5975 return (PyObject *) self;
5976 }
5977 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005978}
5979
5980/* --- Unicode Object Methods --------------------------------------------- */
5981
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005982PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005983"S.title() -> unicode\n\
5984\n\
5985Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005986characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005987
5988static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005989unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005990{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005991 return fixup(self, fixtitle);
5992}
5993
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005994PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005995"S.capitalize() -> unicode\n\
5996\n\
5997Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005998have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005999
6000static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006001unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006002{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006003 return fixup(self, fixcapitalize);
6004}
6005
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled code: split S into whitespace-separated words, replace each
   word in the list by its fixup(..., fixcapitalize) result, and join the
   list again with PyUnicode_Join(NULL, list). */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list in place */
    idx = 0;
    while (idx < PyList_GET_SIZE(words)) {
        result = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                       fixcapitalize);
        if (result == NULL)
            goto onError;
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, result);
        idx++;
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

onError:
    /* result is NULL here when capitalizing a word failed */
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
6043
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006044/* Argument converter. Coerces to a single unicode character */
6045
6046static int
6047convert_uc(PyObject *obj, void *addr)
6048{
6049 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6050 PyObject *uniobj;
6051 Py_UNICODE *unistr;
6052
6053 uniobj = PyUnicode_FromObject(obj);
6054 if (uniobj == NULL) {
6055 PyErr_SetString(PyExc_TypeError,
6056 "The fill character cannot be converted to Unicode");
6057 return 0;
6058 }
6059 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6060 PyErr_SetString(PyExc_TypeError,
6061 "The fill character must be exactly one character long");
6062 Py_DECREF(uniobj);
6063 return 0;
6064 }
6065 unistr = PyUnicode_AS_UNICODE(uniobj);
6066 *fillcharloc = unistr[0];
6067 Py_DECREF(uniobj);
6068 return 1;
6069}
6070
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006071PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006072"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006073\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006074Return S centered in a Unicode string of length width. Padding is\n\
6075done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006076
6077static PyObject *
6078unicode_center(PyUnicodeObject *self, PyObject *args)
6079{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006080 Py_ssize_t marg, left;
6081 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006082 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006083
Thomas Woutersde017742006-02-16 19:34:37 +00006084 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006085 return NULL;
6086
Tim Peters7a29bd52001-09-12 03:03:31 +00006087 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006088 Py_INCREF(self);
6089 return (PyObject*) self;
6090 }
6091
6092 marg = width - self->length;
6093 left = marg / 2 + (marg & width & 1);
6094
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006095 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006096}
6097
Marc-André Lemburge5034372000-08-08 08:04:29 +00006098#if 0
6099
6100/* This code should go into some future Unicode collation support
6101 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006102 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006103
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006104/* speedy UTF-16 code point order comparison */
6105/* gleaned from: */
6106/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6107
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006108static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006109{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006110 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006111 0, 0, 0, 0, 0, 0, 0, 0,
6112 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006113 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006114};
6115
Guido van Rossumd57fd912000-03-10 22:53:23 +00006116static int
6117unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6118{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006119 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006120
Guido van Rossumd57fd912000-03-10 22:53:23 +00006121 Py_UNICODE *s1 = str1->str;
6122 Py_UNICODE *s2 = str2->str;
6123
6124 len1 = str1->length;
6125 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006126
Guido van Rossumd57fd912000-03-10 22:53:23 +00006127 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006128 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006129
6130 c1 = *s1++;
6131 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006132
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006133 if (c1 > (1<<11) * 26)
6134 c1 += utf16Fixup[c1>>11];
6135 if (c2 > (1<<11) * 26)
6136 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006137 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006138
6139 if (c1 != c2)
6140 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006141
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006142 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006143 }
6144
6145 return (len1 < len2) ? -1 : (len1 != len2);
6146}
6147
Marc-André Lemburge5034372000-08-08 08:04:29 +00006148#else
6149
6150static int
6151unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6152{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006153 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006154
6155 Py_UNICODE *s1 = str1->str;
6156 Py_UNICODE *s2 = str2->str;
6157
6158 len1 = str1->length;
6159 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006160
Marc-André Lemburge5034372000-08-08 08:04:29 +00006161 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006162 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006163
Fredrik Lundh45714e92001-06-26 16:39:36 +00006164 c1 = *s1++;
6165 c2 = *s2++;
6166
6167 if (c1 != c2)
6168 return (c1 < c2) ? -1 : 1;
6169
Marc-André Lemburge5034372000-08-08 08:04:29 +00006170 len1--; len2--;
6171 }
6172
6173 return (len1 < len2) ? -1 : (len1 != len2);
6174}
6175
6176#endif
6177
Guido van Rossumd57fd912000-03-10 22:53:23 +00006178int PyUnicode_Compare(PyObject *left,
6179 PyObject *right)
6180{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006181 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6182 return unicode_compare((PyUnicodeObject *)left,
6183 (PyUnicodeObject *)right);
6184 if ((PyString_Check(left) && PyUnicode_Check(right)) ||
6185 (PyUnicode_Check(left) && PyString_Check(right))) {
6186 if (PyUnicode_Check(left))
6187 left = _PyUnicode_AsDefaultEncodedString(left, NULL);
6188 if (PyUnicode_Check(right))
6189 right = _PyUnicode_AsDefaultEncodedString(right, NULL);
6190 assert(PyString_Check(left));
6191 assert(PyString_Check(right));
6192 return PyObject_Compare(left, right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006193 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006194 PyErr_Format(PyExc_TypeError,
6195 "Can't compare %.100s and %.100s",
6196 left->ob_type->tp_name,
6197 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006198 return -1;
6199}
6200
Martin v. Löwis5b222132007-06-10 09:51:05 +00006201int
6202PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6203{
6204 int i;
6205 Py_UNICODE *id;
6206 assert(PyUnicode_Check(uni));
6207 id = PyUnicode_AS_UNICODE(uni);
6208 /* Compare Unicode string and source character set string */
6209 for (i = 0; id[i] && str[i]; i++)
6210 if (id[i] != str[i])
6211 return ((int)id[i] < (int)str[i]) ? -1 : 1;
6212 if (id[i])
6213 return 1; /* uni is longer */
6214 if (str[i])
6215 return -1; /* str is longer */
6216 return 0;
6217}
6218
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006219PyObject *PyUnicode_RichCompare(PyObject *left,
6220 PyObject *right,
6221 int op)
6222{
6223 int result;
6224
6225 result = PyUnicode_Compare(left, right);
6226 if (result == -1 && PyErr_Occurred())
6227 goto onError;
6228
6229 /* Convert the return value to a Boolean */
6230 switch (op) {
6231 case Py_EQ:
6232 result = (result == 0);
6233 break;
6234 case Py_NE:
6235 result = (result != 0);
6236 break;
6237 case Py_LE:
6238 result = (result <= 0);
6239 break;
6240 case Py_GE:
6241 result = (result >= 0);
6242 break;
6243 case Py_LT:
6244 result = (result == -1);
6245 break;
6246 case Py_GT:
6247 result = (result == 1);
6248 break;
6249 }
6250 return PyBool_FromLong(result);
6251
6252 onError:
6253
6254 /* Standard case
6255
6256 Type errors mean that PyUnicode_FromObject() could not convert
6257 one of the arguments (usually the right hand side) to Unicode,
6258 ie. we can't handle the comparison request. However, it is
6259 possible that the other object knows a comparison method, which
6260 is why we return Py_NotImplemented to give the other object a
6261 chance.
6262
6263 */
6264 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6265 PyErr_Clear();
6266 Py_INCREF(Py_NotImplemented);
6267 return Py_NotImplemented;
6268 }
6269 if (op != Py_EQ && op != Py_NE)
6270 return NULL;
6271
6272 /* Equality comparison.
6273
6274 This is a special case: we silence any PyExc_UnicodeDecodeError
6275 and instead turn it into a PyErr_UnicodeWarning.
6276
6277 */
6278 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6279 return NULL;
6280 PyErr_Clear();
Skip Montanaro46fc3372007-08-12 11:44:53 +00006281 if (PyErr_WarnEx(PyExc_UnicodeWarning,
6282 (op == Py_EQ) ?
6283 "Unicode equal comparison "
6284 "failed to convert both arguments to Unicode - "
6285 "interpreting them as being unequal"
6286 :
6287 "Unicode unequal comparison "
6288 "failed to convert both arguments to Unicode - "
6289 "interpreting them as being unequal",
6290 1) < 0)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006291 return NULL;
6292 result = (op == Py_NE);
6293 return PyBool_FromLong(result);
6294}
6295
Guido van Rossum403d68b2000-03-13 15:55:09 +00006296int PyUnicode_Contains(PyObject *container,
6297 PyObject *element)
6298{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006299 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006300 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006301
6302 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006303 sub = PyUnicode_FromObject(element);
6304 if (!sub) {
Walter Dörwald26e0f512007-06-12 16:51:31 +00006305 PyErr_Format(PyExc_TypeError,
6306 "'in <string>' requires string as left operand, not %s",
6307 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006308 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006309 }
6310
Thomas Wouters477c8d52006-05-27 19:21:47 +00006311 str = PyUnicode_FromObject(container);
6312 if (!str) {
6313 Py_DECREF(sub);
6314 return -1;
6315 }
6316
6317 result = stringlib_contains_obj(str, sub);
6318
6319 Py_DECREF(str);
6320 Py_DECREF(sub);
6321
Guido van Rossum403d68b2000-03-13 15:55:09 +00006322 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006323}
6324
Guido van Rossumd57fd912000-03-10 22:53:23 +00006325/* Concat to string or Unicode object giving a new Unicode object. */
6326
6327PyObject *PyUnicode_Concat(PyObject *left,
6328 PyObject *right)
6329{
6330 PyUnicodeObject *u = NULL, *v = NULL, *w;
6331
Guido van Rossum84d79dd2007-04-13 02:23:57 +00006332 if (PyBytes_Check(left) || PyBytes_Check(right))
6333 return PyBytes_Concat(left, right);
6334
Guido van Rossumd57fd912000-03-10 22:53:23 +00006335 /* Coerce the two arguments */
6336 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6337 if (u == NULL)
6338 goto onError;
6339 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6340 if (v == NULL)
6341 goto onError;
6342
6343 /* Shortcuts */
6344 if (v == unicode_empty) {
6345 Py_DECREF(v);
6346 return (PyObject *)u;
6347 }
6348 if (u == unicode_empty) {
6349 Py_DECREF(u);
6350 return (PyObject *)v;
6351 }
6352
6353 /* Concat the two Unicode strings */
6354 w = _PyUnicode_New(u->length + v->length);
6355 if (w == NULL)
6356 goto onError;
6357 Py_UNICODE_COPY(w->str, u->str, u->length);
6358 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6359
6360 Py_DECREF(u);
6361 Py_DECREF(v);
6362 return (PyObject *)w;
6363
6364onError:
6365 Py_XDECREF(u);
6366 Py_XDECREF(v);
6367 return NULL;
6368}
6369
Walter Dörwald1ab83302007-05-18 17:15:44 +00006370void
6371PyUnicode_Append(PyObject **pleft, PyObject *right)
6372{
6373 PyObject *new;
6374 if (*pleft == NULL)
6375 return;
6376 if (right == NULL || !PyUnicode_Check(*pleft)) {
6377 Py_DECREF(*pleft);
6378 *pleft = NULL;
6379 return;
6380 }
6381 new = PyUnicode_Concat(*pleft, right);
6382 Py_DECREF(*pleft);
6383 *pleft = new;
6384}
6385
6386void
6387PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6388{
6389 PyUnicode_Append(pleft, right);
6390 Py_XDECREF(right);
6391}
6392
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006393PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006394"S.count(sub[, start[, end]]) -> int\n\
6395\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006396Return the number of non-overlapping occurrences of substring sub in\n\
6397Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006398interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006399
6400static PyObject *
6401unicode_count(PyUnicodeObject *self, PyObject *args)
6402{
6403 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006404 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006405 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006406 PyObject *result;
6407
Guido van Rossumb8872e62000-05-09 14:14:27 +00006408 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6409 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006410 return NULL;
6411
6412 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006413 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006414 if (substring == NULL)
6415 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006416
Thomas Wouters477c8d52006-05-27 19:21:47 +00006417 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006418
Thomas Wouters477c8d52006-05-27 19:21:47 +00006419 result = PyInt_FromSsize_t(
6420 stringlib_count(self->str + start, end - start,
6421 substring->str, substring->length)
6422 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006423
6424 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006425
Guido van Rossumd57fd912000-03-10 22:53:23 +00006426 return result;
6427}
6428
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006429PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006430"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006431\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006432Encodes S using the codec registered for encoding. encoding defaults\n\
6433to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006434handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006435a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6436'xmlcharrefreplace' as well as any other name registered with\n\
6437codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006438
6439static PyObject *
6440unicode_encode(PyUnicodeObject *self, PyObject *args)
6441{
6442 char *encoding = NULL;
6443 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006444 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00006445
Guido van Rossumd57fd912000-03-10 22:53:23 +00006446 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6447 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006448 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006449 if (v == NULL)
6450 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006451 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006452 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006453 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006454 "(type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00006455 Py_Type(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006456 Py_DECREF(v);
6457 return NULL;
6458 }
6459 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006460
6461 onError:
6462 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006463}
6464
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006465PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006466"S.expandtabs([tabsize]) -> unicode\n\
6467\n\
6468Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006469If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006470
6471static PyObject*
6472unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6473{
6474 Py_UNICODE *e;
6475 Py_UNICODE *p;
6476 Py_UNICODE *q;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006477 Py_ssize_t i, j, old_j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006478 PyUnicodeObject *u;
6479 int tabsize = 8;
6480
6481 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6482 return NULL;
6483
Thomas Wouters7e474022000-07-16 12:04:32 +00006484 /* First pass: determine size of output string */
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006485 i = j = old_j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006486 e = self->str + self->length;
6487 for (p = self->str; p < e; p++)
6488 if (*p == '\t') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006489 if (tabsize > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006490 j += tabsize - (j % tabsize);
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006491 if (old_j > j) {
6492 PyErr_SetString(PyExc_OverflowError,
6493 "new string is too long");
6494 return NULL;
6495 }
6496 old_j = j;
6497 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006498 }
6499 else {
6500 j++;
6501 if (*p == '\n' || *p == '\r') {
6502 i += j;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006503 old_j = j = 0;
6504 if (i < 0) {
6505 PyErr_SetString(PyExc_OverflowError,
6506 "new string is too long");
6507 return NULL;
6508 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006509 }
6510 }
6511
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006512 if ((i + j) < 0) {
6513 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6514 return NULL;
6515 }
6516
Guido van Rossumd57fd912000-03-10 22:53:23 +00006517 /* Second pass: create output string and fill it */
6518 u = _PyUnicode_New(i + j);
6519 if (!u)
6520 return NULL;
6521
6522 j = 0;
6523 q = u->str;
6524
6525 for (p = self->str; p < e; p++)
6526 if (*p == '\t') {
6527 if (tabsize > 0) {
6528 i = tabsize - (j % tabsize);
6529 j += i;
6530 while (i--)
6531 *q++ = ' ';
6532 }
6533 }
6534 else {
6535 j++;
6536 *q++ = *p;
6537 if (*p == '\n' || *p == '\r')
6538 j = 0;
6539 }
6540
6541 return (PyObject*) u;
6542}
6543
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006544PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006545"S.find(sub [,start [,end]]) -> int\n\
6546\n\
6547Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00006548such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006549arguments start and end are interpreted as in slice notation.\n\
6550\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006551Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006552
6553static PyObject *
6554unicode_find(PyUnicodeObject *self, PyObject *args)
6555{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006556 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006557 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006558 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006559 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006560
Guido van Rossumb8872e62000-05-09 14:14:27 +00006561 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
6562 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006563 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006564 substring = PyUnicode_FromObject(substring);
6565 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006566 return NULL;
6567
Thomas Wouters477c8d52006-05-27 19:21:47 +00006568 result = stringlib_find_slice(
6569 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6570 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6571 start, end
6572 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006573
6574 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006575
6576 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006577}
6578
6579static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006580unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006581{
6582 if (index < 0 || index >= self->length) {
6583 PyErr_SetString(PyExc_IndexError, "string index out of range");
6584 return NULL;
6585 }
6586
6587 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6588}
6589
Guido van Rossumc2504932007-09-18 19:42:40 +00006590/* Believe it or not, this produces the same value for ASCII strings
6591 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006592static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00006593unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006594{
Guido van Rossumc2504932007-09-18 19:42:40 +00006595 Py_ssize_t len;
6596 Py_UNICODE *p;
6597 long x;
6598
6599 if (self->hash != -1)
6600 return self->hash;
6601 len = Py_Size(self);
6602 p = self->str;
6603 x = *p << 7;
6604 while (--len >= 0)
6605 x = (1000003*x) ^ *p++;
6606 x ^= Py_Size(self);
6607 if (x == -1)
6608 x = -2;
6609 self->hash = x;
6610 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006611}
6612
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006613PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006614"S.index(sub [,start [,end]]) -> int\n\
6615\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006616Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006617
6618static PyObject *
6619unicode_index(PyUnicodeObject *self, PyObject *args)
6620{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006621 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006622 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006623 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006624 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006625
Guido van Rossumb8872e62000-05-09 14:14:27 +00006626 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
6627 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006628 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006629 substring = PyUnicode_FromObject(substring);
6630 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006631 return NULL;
6632
Thomas Wouters477c8d52006-05-27 19:21:47 +00006633 result = stringlib_find_slice(
6634 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6635 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6636 start, end
6637 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006638
6639 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006640
Guido van Rossumd57fd912000-03-10 22:53:23 +00006641 if (result < 0) {
6642 PyErr_SetString(PyExc_ValueError, "substring not found");
6643 return NULL;
6644 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006645
Martin v. Löwis18e16552006-02-15 17:27:45 +00006646 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006647}
6648
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006649PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006650"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006651\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006652Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006653at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006654
6655static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006656unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006657{
6658 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6659 register const Py_UNICODE *e;
6660 int cased;
6661
Guido van Rossumd57fd912000-03-10 22:53:23 +00006662 /* Shortcut for single character strings */
6663 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006664 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006665
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006666 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006667 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006668 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006669
Guido van Rossumd57fd912000-03-10 22:53:23 +00006670 e = p + PyUnicode_GET_SIZE(self);
6671 cased = 0;
6672 for (; p < e; p++) {
6673 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006674
Guido van Rossumd57fd912000-03-10 22:53:23 +00006675 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006676 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006677 else if (!cased && Py_UNICODE_ISLOWER(ch))
6678 cased = 1;
6679 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006680 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006681}
6682
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006683PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006684"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006685\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006686Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006687at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006688
6689static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006690unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006691{
6692 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6693 register const Py_UNICODE *e;
6694 int cased;
6695
Guido van Rossumd57fd912000-03-10 22:53:23 +00006696 /* Shortcut for single character strings */
6697 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006698 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006699
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006700 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006701 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006702 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006703
Guido van Rossumd57fd912000-03-10 22:53:23 +00006704 e = p + PyUnicode_GET_SIZE(self);
6705 cased = 0;
6706 for (; p < e; p++) {
6707 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006708
Guido van Rossumd57fd912000-03-10 22:53:23 +00006709 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006710 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006711 else if (!cased && Py_UNICODE_ISUPPER(ch))
6712 cased = 1;
6713 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006714 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006715}
6716
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006717PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006718"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006719\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006720Return True if S is a titlecased string and there is at least one\n\
6721character in S, i.e. upper- and titlecase characters may only\n\
6722follow uncased characters and lowercase characters only cased ones.\n\
6723Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006724
6725static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006726unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006727{
6728 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6729 register const Py_UNICODE *e;
6730 int cased, previous_is_cased;
6731
Guido van Rossumd57fd912000-03-10 22:53:23 +00006732 /* Shortcut for single character strings */
6733 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006734 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6735 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006736
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006737 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006738 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006739 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006740
Guido van Rossumd57fd912000-03-10 22:53:23 +00006741 e = p + PyUnicode_GET_SIZE(self);
6742 cased = 0;
6743 previous_is_cased = 0;
6744 for (; p < e; p++) {
6745 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006746
Guido van Rossumd57fd912000-03-10 22:53:23 +00006747 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6748 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006749 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006750 previous_is_cased = 1;
6751 cased = 1;
6752 }
6753 else if (Py_UNICODE_ISLOWER(ch)) {
6754 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006755 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006756 previous_is_cased = 1;
6757 cased = 1;
6758 }
6759 else
6760 previous_is_cased = 0;
6761 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006762 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006763}
6764
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006765PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006766"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006767\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006768Return True if all characters in S are whitespace\n\
6769and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006770
6771static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006772unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006773{
6774 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6775 register const Py_UNICODE *e;
6776
Guido van Rossumd57fd912000-03-10 22:53:23 +00006777 /* Shortcut for single character strings */
6778 if (PyUnicode_GET_SIZE(self) == 1 &&
6779 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006780 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006781
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006782 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006783 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006784 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006785
Guido van Rossumd57fd912000-03-10 22:53:23 +00006786 e = p + PyUnicode_GET_SIZE(self);
6787 for (; p < e; p++) {
6788 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006789 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006790 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006791 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006792}
6793
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006794PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006795"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006796\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006797Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006798and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006799
6800static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006801unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006802{
6803 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6804 register const Py_UNICODE *e;
6805
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006806 /* Shortcut for single character strings */
6807 if (PyUnicode_GET_SIZE(self) == 1 &&
6808 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006809 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006810
6811 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006812 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006813 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006814
6815 e = p + PyUnicode_GET_SIZE(self);
6816 for (; p < e; p++) {
6817 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006818 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006819 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006820 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006821}
6822
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006823PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006824"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006825\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006826Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006827and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006828
6829static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006830unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006831{
6832 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6833 register const Py_UNICODE *e;
6834
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006835 /* Shortcut for single character strings */
6836 if (PyUnicode_GET_SIZE(self) == 1 &&
6837 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006838 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006839
6840 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006841 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006842 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006843
6844 e = p + PyUnicode_GET_SIZE(self);
6845 for (; p < e; p++) {
6846 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006847 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006848 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006849 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006850}
6851
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006852PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006853"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006854\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006855Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006856False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006857
6858static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006859unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006860{
6861 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6862 register const Py_UNICODE *e;
6863
Guido van Rossumd57fd912000-03-10 22:53:23 +00006864 /* Shortcut for single character strings */
6865 if (PyUnicode_GET_SIZE(self) == 1 &&
6866 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006867 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006868
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006869 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006870 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006871 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006872
Guido van Rossumd57fd912000-03-10 22:53:23 +00006873 e = p + PyUnicode_GET_SIZE(self);
6874 for (; p < e; p++) {
6875 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006876 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006877 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006878 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006879}
6880
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006881PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006882"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006883\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006884Return True if all characters in S are digits\n\
6885and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006886
6887static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006888unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006889{
6890 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6891 register const Py_UNICODE *e;
6892
Guido van Rossumd57fd912000-03-10 22:53:23 +00006893 /* Shortcut for single character strings */
6894 if (PyUnicode_GET_SIZE(self) == 1 &&
6895 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006896 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006897
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006898 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006899 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006900 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006901
Guido van Rossumd57fd912000-03-10 22:53:23 +00006902 e = p + PyUnicode_GET_SIZE(self);
6903 for (; p < e; p++) {
6904 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006905 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006906 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006907 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006908}
6909
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006910PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006911"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006912\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006913Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006914False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006915
6916static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006917unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006918{
6919 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6920 register const Py_UNICODE *e;
6921
Guido van Rossumd57fd912000-03-10 22:53:23 +00006922 /* Shortcut for single character strings */
6923 if (PyUnicode_GET_SIZE(self) == 1 &&
6924 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006925 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006926
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006927 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006928 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006929 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006930
Guido van Rossumd57fd912000-03-10 22:53:23 +00006931 e = p + PyUnicode_GET_SIZE(self);
6932 for (; p < e; p++) {
6933 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006934 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006935 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006936 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006937}
6938
Martin v. Löwis47383402007-08-15 07:32:56 +00006939int
6940PyUnicode_IsIdentifier(PyObject *self)
6941{
6942 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
6943 register const Py_UNICODE *e;
6944
6945 /* Special case for empty strings */
6946 if (PyUnicode_GET_SIZE(self) == 0)
6947 return 0;
6948
6949 /* PEP 3131 says that the first character must be in
6950 XID_Start and subsequent characters in XID_Continue,
6951 and for the ASCII range, the 2.x rules apply (i.e
6952 start with letters and underscore, continue with
6953 letters, digits, underscore). However, given the current
6954 definition of XID_Start and XID_Continue, it is sufficient
6955 to check just for these, except that _ must be allowed
6956 as starting an identifier. */
6957 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
6958 return 0;
6959
6960 e = p + PyUnicode_GET_SIZE(self);
6961 for (p++; p < e; p++) {
6962 if (!_PyUnicode_IsXidContinue(*p))
6963 return 0;
6964 }
6965 return 1;
6966}
6967
6968PyDoc_STRVAR(isidentifier__doc__,
6969"S.isidentifier() -> bool\n\
6970\n\
6971Return True if S is a valid identifier according\n\
6972to the language definition.");
6973
6974static PyObject*
6975unicode_isidentifier(PyObject *self)
6976{
6977 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
6978}
6979
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006980PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006981"S.join(sequence) -> unicode\n\
6982\n\
6983Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006984sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006985
6986static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006987unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006988{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006989 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006990}
6991
Martin v. Löwis18e16552006-02-15 17:27:45 +00006992static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006993unicode_length(PyUnicodeObject *self)
6994{
6995 return self->length;
6996}
6997
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006998PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006999"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007000\n\
7001Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007002done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007003
7004static PyObject *
7005unicode_ljust(PyUnicodeObject *self, PyObject *args)
7006{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007007 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007008 Py_UNICODE fillchar = ' ';
7009
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007010 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007011 return NULL;
7012
Tim Peters7a29bd52001-09-12 03:03:31 +00007013 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007014 Py_INCREF(self);
7015 return (PyObject*) self;
7016 }
7017
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007018 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007019}
7020
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007021PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007022"S.lower() -> unicode\n\
7023\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007024Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007025
7026static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007027unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007028{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007029 return fixup(self, fixlower);
7030}
7031
/* Selectors telling the strip helpers which end(s) of the string to trim. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple formats, indexed by the selectors above */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for selector i: the format string minus its 3-char "|O:"
   prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
7040
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007041/* externally visible for str.strip(unicode) */
7042PyObject *
7043_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7044{
7045 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007046 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007047 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007048 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7049 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007050
Thomas Wouters477c8d52006-05-27 19:21:47 +00007051 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7052
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007053 i = 0;
7054 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007055 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7056 i++;
7057 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007058 }
7059
7060 j = len;
7061 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007062 do {
7063 j--;
7064 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7065 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007066 }
7067
7068 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007069 Py_INCREF(self);
7070 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007071 }
7072 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00007073 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007074}
7075
Guido van Rossumd57fd912000-03-10 22:53:23 +00007076
7077static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007078do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007079{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007080 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007081 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007082
7083 i = 0;
7084 if (striptype != RIGHTSTRIP) {
7085 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7086 i++;
7087 }
7088 }
7089
7090 j = len;
7091 if (striptype != LEFTSTRIP) {
7092 do {
7093 j--;
7094 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7095 j++;
7096 }
7097
7098 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7099 Py_INCREF(self);
7100 return (PyObject*)self;
7101 }
7102 else
7103 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007104}
7105
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007106
7107static PyObject *
7108do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7109{
7110 PyObject *sep = NULL;
7111
7112 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7113 return NULL;
7114
7115 if (sep != NULL && sep != Py_None) {
7116 if (PyUnicode_Check(sep))
7117 return _PyUnicode_XStrip(self, striptype, sep);
7118 else if (PyString_Check(sep)) {
7119 PyObject *res;
7120 sep = PyUnicode_FromObject(sep);
7121 if (sep==NULL)
7122 return NULL;
7123 res = _PyUnicode_XStrip(self, striptype, sep);
7124 Py_DECREF(sep);
7125 return res;
7126 }
7127 else {
7128 PyErr_Format(PyExc_TypeError,
7129 "%s arg must be None, unicode or str",
7130 STRIPNAME(striptype));
7131 return NULL;
7132 }
7133 }
7134
7135 return do_strip(self, striptype);
7136}
7137
7138
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007139PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007140"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007141\n\
7142Return a copy of the string S with leading and trailing\n\
7143whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007144If chars is given and not None, remove characters in chars instead.\n\
7145If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007146
7147static PyObject *
7148unicode_strip(PyUnicodeObject *self, PyObject *args)
7149{
7150 if (PyTuple_GET_SIZE(args) == 0)
7151 return do_strip(self, BOTHSTRIP); /* Common case */
7152 else
7153 return do_argstrip(self, BOTHSTRIP, args);
7154}
7155
7156
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007157PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007158"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007159\n\
7160Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007161If chars is given and not None, remove characters in chars instead.\n\
7162If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007163
7164static PyObject *
7165unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7166{
7167 if (PyTuple_GET_SIZE(args) == 0)
7168 return do_strip(self, LEFTSTRIP); /* Common case */
7169 else
7170 return do_argstrip(self, LEFTSTRIP, args);
7171}
7172
7173
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007174PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007175"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007176\n\
7177Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007178If chars is given and not None, remove characters in chars instead.\n\
7179If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007180
7181static PyObject *
7182unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7183{
7184 if (PyTuple_GET_SIZE(args) == 0)
7185 return do_strip(self, RIGHTSTRIP); /* Common case */
7186 else
7187 return do_argstrip(self, RIGHTSTRIP, args);
7188}
7189
7190
Guido van Rossumd57fd912000-03-10 22:53:23 +00007191static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007192unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007193{
7194 PyUnicodeObject *u;
7195 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007196 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007197 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007198
7199 if (len < 0)
7200 len = 0;
7201
Tim Peters7a29bd52001-09-12 03:03:31 +00007202 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007203 /* no repeat, return original string */
7204 Py_INCREF(str);
7205 return (PyObject*) str;
7206 }
Tim Peters8f422462000-09-09 06:13:41 +00007207
7208 /* ensure # of chars needed doesn't overflow int and # of bytes
7209 * needed doesn't overflow size_t
7210 */
7211 nchars = len * str->length;
7212 if (len && nchars / len != str->length) {
7213 PyErr_SetString(PyExc_OverflowError,
7214 "repeated string is too long");
7215 return NULL;
7216 }
7217 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7218 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7219 PyErr_SetString(PyExc_OverflowError,
7220 "repeated string is too long");
7221 return NULL;
7222 }
7223 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007224 if (!u)
7225 return NULL;
7226
7227 p = u->str;
7228
Thomas Wouters477c8d52006-05-27 19:21:47 +00007229 if (str->length == 1 && len > 0) {
7230 Py_UNICODE_FILL(p, str->str[0], len);
7231 } else {
7232 Py_ssize_t done = 0; /* number of characters copied this far */
7233 if (done < nchars) {
7234 Py_UNICODE_COPY(p, str->str, str->length);
7235 done = str->length;
7236 }
7237 while (done < nchars) {
7238 int n = (done <= nchars-done) ? done : nchars-done;
7239 Py_UNICODE_COPY(p+done, p, n);
7240 done += n;
7241 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007242 }
7243
7244 return (PyObject*) u;
7245}
7246
7247PyObject *PyUnicode_Replace(PyObject *obj,
7248 PyObject *subobj,
7249 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007250 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007251{
7252 PyObject *self;
7253 PyObject *str1;
7254 PyObject *str2;
7255 PyObject *result;
7256
7257 self = PyUnicode_FromObject(obj);
7258 if (self == NULL)
7259 return NULL;
7260 str1 = PyUnicode_FromObject(subobj);
7261 if (str1 == NULL) {
7262 Py_DECREF(self);
7263 return NULL;
7264 }
7265 str2 = PyUnicode_FromObject(replobj);
7266 if (str2 == NULL) {
7267 Py_DECREF(self);
7268 Py_DECREF(str1);
7269 return NULL;
7270 }
Tim Petersced69f82003-09-16 20:30:58 +00007271 result = replace((PyUnicodeObject *)self,
7272 (PyUnicodeObject *)str1,
7273 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007274 maxcount);
7275 Py_DECREF(self);
7276 Py_DECREF(str1);
7277 Py_DECREF(str2);
7278 return result;
7279}
7280
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007281PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007282"S.replace (old, new[, maxsplit]) -> unicode\n\
7283\n\
7284Return a copy of S with all occurrences of substring\n\
7285old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007286given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007287
7288static PyObject*
7289unicode_replace(PyUnicodeObject *self, PyObject *args)
7290{
7291 PyUnicodeObject *str1;
7292 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007293 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007294 PyObject *result;
7295
Martin v. Löwis18e16552006-02-15 17:27:45 +00007296 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007297 return NULL;
7298 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7299 if (str1 == NULL)
7300 return NULL;
7301 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007302 if (str2 == NULL) {
7303 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007304 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007305 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007306
7307 result = replace(self, str1, str2, maxcount);
7308
7309 Py_DECREF(str1);
7310 Py_DECREF(str2);
7311 return result;
7312}
7313
7314static
7315PyObject *unicode_repr(PyObject *unicode)
7316{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007317 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007318 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007319 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7320 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7321
7322 /* XXX(nnorwitz): rather than over-allocating, it would be
7323 better to choose a different scheme. Perhaps scan the
7324 first N-chars of the string and allocate based on that size.
7325 */
7326 /* Initial allocation is based on the longest-possible unichr
7327 escape.
7328
7329 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7330 unichr, so in this case it's the longest unichr escape. In
7331 narrow (UTF-16) builds this is five chars per source unichr
7332 since there are two unichrs in the surrogate pair, so in narrow
7333 (UTF-16) builds it's not the longest unichr escape.
7334
7335 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7336 so in the narrow (UTF-16) build case it's the longest unichr
7337 escape.
7338 */
7339
Walter Dörwald1ab83302007-05-18 17:15:44 +00007340 repr = PyUnicode_FromUnicode(NULL,
Walter Dörwald79e913e2007-05-12 11:08:06 +00007341 2 /* quotes */
7342#ifdef Py_UNICODE_WIDE
7343 + 10*size
7344#else
7345 + 6*size
7346#endif
7347 + 1);
7348 if (repr == NULL)
7349 return NULL;
7350
Walter Dörwald1ab83302007-05-18 17:15:44 +00007351 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007352
7353 /* Add quote */
7354 *p++ = (findchar(s, size, '\'') &&
7355 !findchar(s, size, '"')) ? '"' : '\'';
7356 while (size-- > 0) {
7357 Py_UNICODE ch = *s++;
7358
7359 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007360 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007361 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007362 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007363 continue;
7364 }
7365
7366#ifdef Py_UNICODE_WIDE
7367 /* Map 21-bit characters to '\U00xxxxxx' */
7368 else if (ch >= 0x10000) {
7369 *p++ = '\\';
7370 *p++ = 'U';
7371 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
7372 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
7373 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
7374 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
7375 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
7376 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
7377 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
7378 *p++ = hexdigits[ch & 0x0000000F];
7379 continue;
7380 }
7381#else
7382 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
7383 else if (ch >= 0xD800 && ch < 0xDC00) {
7384 Py_UNICODE ch2;
7385 Py_UCS4 ucs;
7386
7387 ch2 = *s++;
7388 size--;
7389 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
7390 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
7391 *p++ = '\\';
7392 *p++ = 'U';
7393 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7394 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7395 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7396 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7397 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7398 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7399 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7400 *p++ = hexdigits[ucs & 0x0000000F];
7401 continue;
7402 }
7403 /* Fall through: isolated surrogates are copied as-is */
7404 s--;
7405 size++;
7406 }
7407#endif
7408
7409 /* Map 16-bit characters to '\uxxxx' */
7410 if (ch >= 256) {
7411 *p++ = '\\';
7412 *p++ = 'u';
7413 *p++ = hexdigits[(ch >> 12) & 0x000F];
7414 *p++ = hexdigits[(ch >> 8) & 0x000F];
7415 *p++ = hexdigits[(ch >> 4) & 0x000F];
7416 *p++ = hexdigits[ch & 0x000F];
7417 }
7418
7419 /* Map special whitespace to '\t', \n', '\r' */
7420 else if (ch == '\t') {
7421 *p++ = '\\';
7422 *p++ = 't';
7423 }
7424 else if (ch == '\n') {
7425 *p++ = '\\';
7426 *p++ = 'n';
7427 }
7428 else if (ch == '\r') {
7429 *p++ = '\\';
7430 *p++ = 'r';
7431 }
7432
7433 /* Map non-printable US ASCII to '\xhh' */
7434 else if (ch < ' ' || ch >= 0x7F) {
7435 *p++ = '\\';
7436 *p++ = 'x';
7437 *p++ = hexdigits[(ch >> 4) & 0x000F];
7438 *p++ = hexdigits[ch & 0x000F];
7439 }
7440
7441 /* Copy everything else as-is */
7442 else
7443 *p++ = (char) ch;
7444 }
7445 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007446 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00007447
7448 *p = '\0';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007449 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00007450 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007451}
7452
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007453PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007454"S.rfind(sub [,start [,end]]) -> int\n\
7455\n\
7456Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007457such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007458arguments start and end are interpreted as in slice notation.\n\
7459\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007460Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007461
7462static PyObject *
7463unicode_rfind(PyUnicodeObject *self, PyObject *args)
7464{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007465 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007466 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007467 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007468 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007469
Guido van Rossumb8872e62000-05-09 14:14:27 +00007470 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
7471 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007472 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007473 substring = PyUnicode_FromObject(substring);
7474 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007475 return NULL;
7476
Thomas Wouters477c8d52006-05-27 19:21:47 +00007477 result = stringlib_rfind_slice(
7478 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7479 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7480 start, end
7481 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007482
7483 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007484
7485 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007486}
7487
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007488PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007489"S.rindex(sub [,start [,end]]) -> int\n\
7490\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007491Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007492
7493static PyObject *
7494unicode_rindex(PyUnicodeObject *self, PyObject *args)
7495{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007496 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007497 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007498 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007499 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007500
Guido van Rossumb8872e62000-05-09 14:14:27 +00007501 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
7502 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007503 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007504 substring = PyUnicode_FromObject(substring);
7505 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007506 return NULL;
7507
Thomas Wouters477c8d52006-05-27 19:21:47 +00007508 result = stringlib_rfind_slice(
7509 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7510 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7511 start, end
7512 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007513
7514 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007515
Guido van Rossumd57fd912000-03-10 22:53:23 +00007516 if (result < 0) {
7517 PyErr_SetString(PyExc_ValueError, "substring not found");
7518 return NULL;
7519 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007520 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007521}
7522
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007523PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007524"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007525\n\
7526Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007527done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007528
7529static PyObject *
7530unicode_rjust(PyUnicodeObject *self, PyObject *args)
7531{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007532 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007533 Py_UNICODE fillchar = ' ';
7534
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007535 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007536 return NULL;
7537
Tim Peters7a29bd52001-09-12 03:03:31 +00007538 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007539 Py_INCREF(self);
7540 return (PyObject*) self;
7541 }
7542
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007543 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007544}
7545
Guido van Rossumd57fd912000-03-10 22:53:23 +00007546PyObject *PyUnicode_Split(PyObject *s,
7547 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007548 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007549{
7550 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007551
Guido van Rossumd57fd912000-03-10 22:53:23 +00007552 s = PyUnicode_FromObject(s);
7553 if (s == NULL)
7554 return NULL;
7555 if (sep != NULL) {
7556 sep = PyUnicode_FromObject(sep);
7557 if (sep == NULL) {
7558 Py_DECREF(s);
7559 return NULL;
7560 }
7561 }
7562
7563 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7564
7565 Py_DECREF(s);
7566 Py_XDECREF(sep);
7567 return result;
7568}
7569
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007570PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007571"S.split([sep [,maxsplit]]) -> list of strings\n\
7572\n\
7573Return a list of the words in S, using sep as the\n\
7574delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00007575splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00007576any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007577
7578static PyObject*
7579unicode_split(PyUnicodeObject *self, PyObject *args)
7580{
7581 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007582 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007583
Martin v. Löwis18e16552006-02-15 17:27:45 +00007584 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007585 return NULL;
7586
7587 if (substring == Py_None)
7588 return split(self, NULL, maxcount);
7589 else if (PyUnicode_Check(substring))
7590 return split(self, (PyUnicodeObject *)substring, maxcount);
7591 else
7592 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7593}
7594
Thomas Wouters477c8d52006-05-27 19:21:47 +00007595PyObject *
7596PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7597{
7598 PyObject* str_obj;
7599 PyObject* sep_obj;
7600 PyObject* out;
7601
7602 str_obj = PyUnicode_FromObject(str_in);
7603 if (!str_obj)
7604 return NULL;
7605 sep_obj = PyUnicode_FromObject(sep_in);
7606 if (!sep_obj) {
7607 Py_DECREF(str_obj);
7608 return NULL;
7609 }
7610
7611 out = stringlib_partition(
7612 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7613 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7614 );
7615
7616 Py_DECREF(sep_obj);
7617 Py_DECREF(str_obj);
7618
7619 return out;
7620}
7621
7622
7623PyObject *
7624PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7625{
7626 PyObject* str_obj;
7627 PyObject* sep_obj;
7628 PyObject* out;
7629
7630 str_obj = PyUnicode_FromObject(str_in);
7631 if (!str_obj)
7632 return NULL;
7633 sep_obj = PyUnicode_FromObject(sep_in);
7634 if (!sep_obj) {
7635 Py_DECREF(str_obj);
7636 return NULL;
7637 }
7638
7639 out = stringlib_rpartition(
7640 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7641 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7642 );
7643
7644 Py_DECREF(sep_obj);
7645 Py_DECREF(str_obj);
7646
7647 return out;
7648}
7649
7650PyDoc_STRVAR(partition__doc__,
7651"S.partition(sep) -> (head, sep, tail)\n\
7652\n\
7653Searches for the separator sep in S, and returns the part before it,\n\
7654the separator itself, and the part after it. If the separator is not\n\
7655found, returns S and two empty strings.");
7656
7657static PyObject*
7658unicode_partition(PyUnicodeObject *self, PyObject *separator)
7659{
7660 return PyUnicode_Partition((PyObject *)self, separator);
7661}
7662
7663PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007664"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007665\n\
7666Searches for the separator sep in S, starting at the end of S, and returns\n\
7667the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00007668separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007669
7670static PyObject*
7671unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7672{
7673 return PyUnicode_RPartition((PyObject *)self, separator);
7674}
7675
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007676PyObject *PyUnicode_RSplit(PyObject *s,
7677 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007678 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007679{
7680 PyObject *result;
7681
7682 s = PyUnicode_FromObject(s);
7683 if (s == NULL)
7684 return NULL;
7685 if (sep != NULL) {
7686 sep = PyUnicode_FromObject(sep);
7687 if (sep == NULL) {
7688 Py_DECREF(s);
7689 return NULL;
7690 }
7691 }
7692
7693 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7694
7695 Py_DECREF(s);
7696 Py_XDECREF(sep);
7697 return result;
7698}
7699
7700PyDoc_STRVAR(rsplit__doc__,
7701"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7702\n\
7703Return a list of the words in S, using sep as the\n\
7704delimiter string, starting at the end of the string and\n\
7705working to the front. If maxsplit is given, at most maxsplit\n\
7706splits are done. If sep is not specified, any whitespace string\n\
7707is a separator.");
7708
7709static PyObject*
7710unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7711{
7712 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007713 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007714
Martin v. Löwis18e16552006-02-15 17:27:45 +00007715 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007716 return NULL;
7717
7718 if (substring == Py_None)
7719 return rsplit(self, NULL, maxcount);
7720 else if (PyUnicode_Check(substring))
7721 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7722 else
7723 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7724}
7725
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007726PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007727"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007728\n\
7729Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007730Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007731is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007732
7733static PyObject*
7734unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7735{
Guido van Rossum86662912000-04-11 15:38:46 +00007736 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007737
Guido van Rossum86662912000-04-11 15:38:46 +00007738 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007739 return NULL;
7740
Guido van Rossum86662912000-04-11 15:38:46 +00007741 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007742}
7743
7744static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007745PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007746{
Walter Dörwald346737f2007-05-31 10:44:43 +00007747 if (PyUnicode_CheckExact(self)) {
7748 Py_INCREF(self);
7749 return self;
7750 } else
7751 /* Subtype -- return genuine unicode string with the same value. */
7752 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
7753 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007754}
7755
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007756PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007757"S.swapcase() -> unicode\n\
7758\n\
7759Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007760and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007761
7762static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007763unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007764{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007765 return fixup(self, fixswapcase);
7766}
7767
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007768PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007769"S.translate(table) -> unicode\n\
7770\n\
7771Return a copy of the string S, where all characters have been mapped\n\
7772through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007773Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7774Unmapped characters are left untouched. Characters mapped to None\n\
7775are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007776
7777static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007778unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007779{
Tim Petersced69f82003-09-16 20:30:58 +00007780 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007781 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00007782 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007783 "ignore");
7784}
7785
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007786PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007787"S.upper() -> unicode\n\
7788\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007789Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007790
7791static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007792unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007793{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007794 return fixup(self, fixupper);
7795}
7796
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007797PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007798"S.zfill(width) -> unicode\n\
7799\n\
7800Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007801of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007802
7803static PyObject *
7804unicode_zfill(PyUnicodeObject *self, PyObject *args)
7805{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007806 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007807 PyUnicodeObject *u;
7808
Martin v. Löwis18e16552006-02-15 17:27:45 +00007809 Py_ssize_t width;
7810 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007811 return NULL;
7812
7813 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007814 if (PyUnicode_CheckExact(self)) {
7815 Py_INCREF(self);
7816 return (PyObject*) self;
7817 }
7818 else
7819 return PyUnicode_FromUnicode(
7820 PyUnicode_AS_UNICODE(self),
7821 PyUnicode_GET_SIZE(self)
7822 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007823 }
7824
7825 fill = width - self->length;
7826
7827 u = pad(self, fill, 0, '0');
7828
Walter Dörwald068325e2002-04-15 13:36:47 +00007829 if (u == NULL)
7830 return NULL;
7831
Guido van Rossumd57fd912000-03-10 22:53:23 +00007832 if (u->str[fill] == '+' || u->str[fill] == '-') {
7833 /* move sign to beginning of string */
7834 u->str[0] = u->str[fill];
7835 u->str[fill] = '0';
7836 }
7837
7838 return (PyObject*) u;
7839}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007840
#if 0
/* Debugging aid (compiled out): report unicode_freelist_size as a
   Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;
    return PyInt_FromLong(size);
}
#endif
7848
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007849PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007850"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007851\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007852Return True if S starts with the specified prefix, False otherwise.\n\
7853With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007854With optional end, stop comparing S at that position.\n\
7855prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007856
7857static PyObject *
7858unicode_startswith(PyUnicodeObject *self,
7859 PyObject *args)
7860{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007861 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007862 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007863 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007864 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007865 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007866
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007867 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007868 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007869 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007870 if (PyTuple_Check(subobj)) {
7871 Py_ssize_t i;
7872 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7873 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7874 PyTuple_GET_ITEM(subobj, i));
7875 if (substring == NULL)
7876 return NULL;
7877 result = tailmatch(self, substring, start, end, -1);
7878 Py_DECREF(substring);
7879 if (result) {
7880 Py_RETURN_TRUE;
7881 }
7882 }
7883 /* nothing matched */
7884 Py_RETURN_FALSE;
7885 }
7886 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007887 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007888 return NULL;
7889 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007890 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007891 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007892}
7893
7894
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007895PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007896"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007897\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007898Return True if S ends with the specified suffix, False otherwise.\n\
7899With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007900With optional end, stop comparing S at that position.\n\
7901suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007902
7903static PyObject *
7904unicode_endswith(PyUnicodeObject *self,
7905 PyObject *args)
7906{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007907 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007908 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007909 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007910 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007911 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007912
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007913 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7914 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007915 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007916 if (PyTuple_Check(subobj)) {
7917 Py_ssize_t i;
7918 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7919 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7920 PyTuple_GET_ITEM(subobj, i));
7921 if (substring == NULL)
7922 return NULL;
7923 result = tailmatch(self, substring, start, end, +1);
7924 Py_DECREF(substring);
7925 if (result) {
7926 Py_RETURN_TRUE;
7927 }
7928 }
7929 Py_RETURN_FALSE;
7930 }
7931 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007932 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007933 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007934
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007935 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007936 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007937 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007938}
7939
Eric Smith8c663262007-08-25 02:26:07 +00007940#include "stringlib/string_format.h"
7941
7942PyDoc_STRVAR(format__doc__,
7943"S.format(*args, **kwargs) -> unicode\n\
7944\n\
7945");
7946
Eric Smith8c663262007-08-25 02:26:07 +00007947PyDoc_STRVAR(p_format__doc__,
7948"S.__format__(format_spec) -> unicode\n\
7949\n\
7950");
7951
7952static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007953unicode_getnewargs(PyUnicodeObject *v)
7954{
7955 return Py_BuildValue("(u#)", v->str, v->length);
7956}
7957
7958
Guido van Rossumd57fd912000-03-10 22:53:23 +00007959static PyMethodDef unicode_methods[] = {
7960
7961 /* Order is according to common usage: often used methods should
7962 appear first, since lookup is done sequentially. */
7963
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007964 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
7965 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7966 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007967 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007968 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7969 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7970 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7971 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7972 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7973 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7974 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007975 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007976 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7977 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7978 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007979 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007980 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7981 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7982 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007983 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007984 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007985 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007986 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007987 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7988 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7989 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7990 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7991 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7992 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7993 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7994 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7995 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7996 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7997 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7998 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7999 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8000 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008001 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008002 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008003 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
8004 {"__format__", (PyCFunction) unicode_unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008005 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8006 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Walter Dörwald068325e2002-04-15 13:36:47 +00008007#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008008 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008009#endif
8010
8011#if 0
8012 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008013 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008014#endif
8015
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008016 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008017 {NULL, NULL}
8018};
8019
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008020static PyObject *
8021unicode_mod(PyObject *v, PyObject *w)
8022{
8023 if (!PyUnicode_Check(v)) {
8024 Py_INCREF(Py_NotImplemented);
8025 return Py_NotImplemented;
8026 }
8027 return PyUnicode_Format(v, w);
8028}
8029
8030static PyNumberMethods unicode_as_number = {
8031 0, /*nb_add*/
8032 0, /*nb_subtract*/
8033 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008034 unicode_mod, /*nb_remainder*/
8035};
8036
Guido van Rossumd57fd912000-03-10 22:53:23 +00008037static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008038 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008039 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008040 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8041 (ssizeargfunc) unicode_getitem, /* sq_item */
Thomas Woutersd2cf20e2007-08-30 22:57:53 +00008042 0, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008043 0, /* sq_ass_item */
8044 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008045 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008046};
8047
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008048static PyObject*
8049unicode_subscript(PyUnicodeObject* self, PyObject* item)
8050{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008051 if (PyIndex_Check(item)) {
8052 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008053 if (i == -1 && PyErr_Occurred())
8054 return NULL;
8055 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008056 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008057 return unicode_getitem(self, i);
8058 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008059 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008060 Py_UNICODE* source_buf;
8061 Py_UNICODE* result_buf;
8062 PyObject* result;
8063
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008064 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008065 &start, &stop, &step, &slicelength) < 0) {
8066 return NULL;
8067 }
8068
8069 if (slicelength <= 0) {
8070 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008071 } else if (start == 0 && step == 1 && slicelength == self->length &&
8072 PyUnicode_CheckExact(self)) {
8073 Py_INCREF(self);
8074 return (PyObject *)self;
8075 } else if (step == 1) {
8076 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008077 } else {
8078 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008079 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
8080 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008081
8082 if (result_buf == NULL)
8083 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008084
8085 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8086 result_buf[i] = source_buf[cur];
8087 }
Tim Petersced69f82003-09-16 20:30:58 +00008088
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008089 result = PyUnicode_FromUnicode(result_buf, slicelength);
8090 PyMem_FREE(result_buf);
8091 return result;
8092 }
8093 } else {
8094 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8095 return NULL;
8096 }
8097}
8098
8099static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008100 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008101 (binaryfunc)unicode_subscript, /* mp_subscript */
8102 (objobjargproc)0, /* mp_ass_subscript */
8103};
8104
Guido van Rossumd57fd912000-03-10 22:53:23 +00008105
8106static int
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00008107unicode_buffer_getbuffer(PyUnicodeObject *self, PyBuffer *view, int flags)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008108{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008109
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00008110 if (flags & PyBUF_CHARACTER) {
Guido van Rossuma74184e2007-08-29 04:05:57 +00008111 PyErr_SetString(PyExc_SystemError, "can't use str as char buffer");
8112 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008113 }
Guido van Rossuma74184e2007-08-29 04:05:57 +00008114 return PyBuffer_FillInfo(view, (void *)self->str,
8115 PyUnicode_GET_DATA_SIZE(self), 1, flags);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008116}
8117
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00008118
Guido van Rossumd57fd912000-03-10 22:53:23 +00008119/* Helpers for PyUnicode_Format() */
8120
8121static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008122getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008123{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008124 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008125 if (argidx < arglen) {
8126 (*p_argidx)++;
8127 if (arglen < 0)
8128 return args;
8129 else
8130 return PyTuple_GetItem(args, argidx);
8131 }
8132 PyErr_SetString(PyExc_TypeError,
8133 "not enough arguments for format string");
8134 return NULL;
8135}
8136
/* Bit flags describing a format specifier.  F_ALT selects the '#'
   (alternate) form in formatfloat()/formatint(); the others are
   presumably the '-', '+', ' ' and '0' flags -- confirm in
   PyUnicode_Format(). */
#define F_LJUST (1 << 0)
#define F_SIGN  (1 << 1)
#define F_BLANK (1 << 2)
#define F_ALT   (1 << 3)
#define F_ZERO  (1 << 4)
8142
Martin v. Löwis18e16552006-02-15 17:27:45 +00008143static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008144strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008145{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008146 register Py_ssize_t i;
8147 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008148 for (i = len - 1; i >= 0; i--)
8149 buffer[i] = (Py_UNICODE) charbuffer[i];
8150
Guido van Rossumd57fd912000-03-10 22:53:23 +00008151 return len;
8152}
8153
Neal Norwitzfc76d632006-01-10 06:03:13 +00008154static int
8155doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8156{
Tim Peters15231542006-02-16 01:08:01 +00008157 Py_ssize_t result;
8158
Neal Norwitzfc76d632006-01-10 06:03:13 +00008159 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008160 result = strtounicode(buffer, (char *)buffer);
8161 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008162}
8163
8164static int
8165longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8166{
Tim Peters15231542006-02-16 01:08:01 +00008167 Py_ssize_t result;
8168
Neal Norwitzfc76d632006-01-10 06:03:13 +00008169 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008170 result = strtounicode(buffer, (char *)buffer);
8171 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008172}
8173
Guido van Rossum078151d2002-08-11 04:24:12 +00008174/* XXX To save some code duplication, formatfloat/long/int could have been
8175 shared with stringobject.c, converting from 8-bit to Unicode after the
8176 formatting is done. */
8177
Guido van Rossumd57fd912000-03-10 22:53:23 +00008178static int
8179formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008180 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008181 int flags,
8182 int prec,
8183 int type,
8184 PyObject *v)
8185{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008186 /* fmt = '%#.' + `prec` + `type`
8187 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00008188 char fmt[20];
8189 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008190
Guido van Rossumd57fd912000-03-10 22:53:23 +00008191 x = PyFloat_AsDouble(v);
8192 if (x == -1.0 && PyErr_Occurred())
8193 return -1;
8194 if (prec < 0)
8195 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008196 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8197 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008198 /* Worst case length calc to ensure no buffer overrun:
8199
8200 'g' formats:
8201 fmt = %#.<prec>g
8202 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8203 for any double rep.)
8204 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8205
8206 'f' formats:
8207 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8208 len = 1 + 50 + 1 + prec = 52 + prec
8209
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008210 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008211 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008212
8213 */
Guido van Rossumb5a755e2007-07-18 18:15:48 +00008214 if (((type == 'g' || type == 'G') &&
8215 buflen <= (size_t)10 + (size_t)prec) ||
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008216 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008217 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008218 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008219 return -1;
8220 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008221 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8222 (flags&F_ALT) ? "#" : "",
8223 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008224 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008225}
8226
Tim Peters38fd5b62000-09-21 05:43:11 +00008227static PyObject*
8228formatlong(PyObject *val, int flags, int prec, int type)
8229{
8230 char *buf;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008231 int len;
Tim Peters38fd5b62000-09-21 05:43:11 +00008232 PyObject *str; /* temporary string object. */
Walter Dörwald63a28be2007-06-20 15:11:12 +00008233 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008234
8235 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8236 if (!str)
8237 return NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008238 result = PyUnicode_FromStringAndSize(buf, len);
Tim Peters38fd5b62000-09-21 05:43:11 +00008239 Py_DECREF(str);
Walter Dörwald63a28be2007-06-20 15:11:12 +00008240 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008241}
8242
Guido van Rossumd57fd912000-03-10 22:53:23 +00008243static int
8244formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008245 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008246 int flags,
8247 int prec,
8248 int type,
8249 PyObject *v)
8250{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008251 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008252 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8253 * + 1 + 1
8254 * = 24
8255 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008256 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008257 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008258 long x;
8259
8260 x = PyInt_AsLong(v);
8261 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008262 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008263 if (x < 0 && type == 'u') {
8264 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008265 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008266 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8267 sign = "-";
8268 else
8269 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008270 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008271 prec = 1;
8272
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008273 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8274 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008275 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008276 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008277 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008278 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008279 return -1;
8280 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008281
8282 if ((flags & F_ALT) &&
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008283 (type == 'x' || type == 'X' || type == 'o')) {
8284 /* When converting under %#o, %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008285 * of issues that cause pain:
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008286 * - for %#o, we want a different base marker than C
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008287 * - when 0 is being converted, the C standard leaves off
8288 * the '0x' or '0X', which is inconsistent with other
8289 * %#x/%#X conversions and inconsistent with Python's
8290 * hex() function
8291 * - there are platforms that violate the standard and
8292 * convert 0 with the '0x' or '0X'
8293 * (Metrowerks, Compaq Tru64)
8294 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008295 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008296 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008297 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008298 * We can achieve the desired consistency by inserting our
8299 * own '0x' or '0X' prefix, and substituting %x/%X in place
8300 * of %#x/%#X.
8301 *
8302 * Note that this is the same approach as used in
8303 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008304 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008305 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8306 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008307 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008308 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008309 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8310 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008311 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008312 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008313 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008314 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008315 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008316 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008317}
8318
8319static int
8320formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008321 size_t buflen,
8322 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008323{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008324 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008325 if (PyUnicode_Check(v)) {
8326 if (PyUnicode_GET_SIZE(v) != 1)
8327 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008328 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008329 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008330
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008331 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00008332 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008333 goto onError;
8334 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
8335 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008336
8337 else {
8338 /* Integer input truncated to a character */
8339 long x;
8340 x = PyInt_AsLong(v);
8341 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008342 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008343#ifdef Py_UNICODE_WIDE
8344 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008345 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008346 "%c arg not in range(0x110000) "
8347 "(wide Python build)");
8348 return -1;
8349 }
8350#else
8351 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008352 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008353 "%c arg not in range(0x10000) "
8354 "(narrow Python build)");
8355 return -1;
8356 }
8357#endif
8358 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008359 }
8360 buf[1] = '\0';
8361 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008362
8363 onError:
8364 PyErr_SetString(PyExc_TypeError,
8365 "%c requires int or char");
8366 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008367}
8368
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so the macro behaves as a
   single operand wherever it is used.
*/
#define FORMATBUFLEN ((size_t)120)
8378
Guido van Rossumd57fd912000-03-10 22:53:23 +00008379PyObject *PyUnicode_Format(PyObject *format,
8380 PyObject *args)
8381{
8382 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008383 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008384 int args_owned = 0;
8385 PyUnicodeObject *result = NULL;
8386 PyObject *dict = NULL;
8387 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008388
Guido van Rossumd57fd912000-03-10 22:53:23 +00008389 if (format == NULL || args == NULL) {
8390 PyErr_BadInternalCall();
8391 return NULL;
8392 }
8393 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008394 if (uformat == NULL)
8395 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008396 fmt = PyUnicode_AS_UNICODE(uformat);
8397 fmtcnt = PyUnicode_GET_SIZE(uformat);
8398
8399 reslen = rescnt = fmtcnt + 100;
8400 result = _PyUnicode_New(reslen);
8401 if (result == NULL)
8402 goto onError;
8403 res = PyUnicode_AS_UNICODE(result);
8404
8405 if (PyTuple_Check(args)) {
8406 arglen = PyTuple_Size(args);
8407 argidx = 0;
8408 }
8409 else {
8410 arglen = -1;
8411 argidx = -2;
8412 }
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00008413 if (Py_Type(args)->tp_as_mapping && !PyTuple_Check(args) &&
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008414 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008415 dict = args;
8416
8417 while (--fmtcnt >= 0) {
8418 if (*fmt != '%') {
8419 if (--rescnt < 0) {
8420 rescnt = fmtcnt + 100;
8421 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008422 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008423 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008424 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8425 --rescnt;
8426 }
8427 *res++ = *fmt++;
8428 }
8429 else {
8430 /* Got a format specifier */
8431 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008432 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008433 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008434 Py_UNICODE c = '\0';
8435 Py_UNICODE fill;
8436 PyObject *v = NULL;
8437 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008438 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008439 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008440 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008441 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008442
8443 fmt++;
8444 if (*fmt == '(') {
8445 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008446 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008447 PyObject *key;
8448 int pcount = 1;
8449
8450 if (dict == NULL) {
8451 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008452 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008453 goto onError;
8454 }
8455 ++fmt;
8456 --fmtcnt;
8457 keystart = fmt;
8458 /* Skip over balanced parentheses */
8459 while (pcount > 0 && --fmtcnt >= 0) {
8460 if (*fmt == ')')
8461 --pcount;
8462 else if (*fmt == '(')
8463 ++pcount;
8464 fmt++;
8465 }
8466 keylen = fmt - keystart - 1;
8467 if (fmtcnt < 0 || pcount > 0) {
8468 PyErr_SetString(PyExc_ValueError,
8469 "incomplete format key");
8470 goto onError;
8471 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008472#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008473 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008474 then looked up since Python uses strings to hold
8475 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008476 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008477 key = PyUnicode_EncodeUTF8(keystart,
8478 keylen,
8479 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008480#else
8481 key = PyUnicode_FromUnicode(keystart, keylen);
8482#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008483 if (key == NULL)
8484 goto onError;
8485 if (args_owned) {
8486 Py_DECREF(args);
8487 args_owned = 0;
8488 }
8489 args = PyObject_GetItem(dict, key);
8490 Py_DECREF(key);
8491 if (args == NULL) {
8492 goto onError;
8493 }
8494 args_owned = 1;
8495 arglen = -1;
8496 argidx = -2;
8497 }
8498 while (--fmtcnt >= 0) {
8499 switch (c = *fmt++) {
8500 case '-': flags |= F_LJUST; continue;
8501 case '+': flags |= F_SIGN; continue;
8502 case ' ': flags |= F_BLANK; continue;
8503 case '#': flags |= F_ALT; continue;
8504 case '0': flags |= F_ZERO; continue;
8505 }
8506 break;
8507 }
8508 if (c == '*') {
8509 v = getnextarg(args, arglen, &argidx);
8510 if (v == NULL)
8511 goto onError;
8512 if (!PyInt_Check(v)) {
8513 PyErr_SetString(PyExc_TypeError,
8514 "* wants int");
8515 goto onError;
8516 }
8517 width = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008518 if (width == -1 && PyErr_Occurred())
8519 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008520 if (width < 0) {
8521 flags |= F_LJUST;
8522 width = -width;
8523 }
8524 if (--fmtcnt >= 0)
8525 c = *fmt++;
8526 }
8527 else if (c >= '0' && c <= '9') {
8528 width = c - '0';
8529 while (--fmtcnt >= 0) {
8530 c = *fmt++;
8531 if (c < '0' || c > '9')
8532 break;
8533 if ((width*10) / 10 != width) {
8534 PyErr_SetString(PyExc_ValueError,
8535 "width too big");
8536 goto onError;
8537 }
8538 width = width*10 + (c - '0');
8539 }
8540 }
8541 if (c == '.') {
8542 prec = 0;
8543 if (--fmtcnt >= 0)
8544 c = *fmt++;
8545 if (c == '*') {
8546 v = getnextarg(args, arglen, &argidx);
8547 if (v == NULL)
8548 goto onError;
8549 if (!PyInt_Check(v)) {
8550 PyErr_SetString(PyExc_TypeError,
8551 "* wants int");
8552 goto onError;
8553 }
8554 prec = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008555 if (prec == -1 && PyErr_Occurred())
8556 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008557 if (prec < 0)
8558 prec = 0;
8559 if (--fmtcnt >= 0)
8560 c = *fmt++;
8561 }
8562 else if (c >= '0' && c <= '9') {
8563 prec = c - '0';
8564 while (--fmtcnt >= 0) {
8565 c = Py_CHARMASK(*fmt++);
8566 if (c < '0' || c > '9')
8567 break;
8568 if ((prec*10) / 10 != prec) {
8569 PyErr_SetString(PyExc_ValueError,
8570 "prec too big");
8571 goto onError;
8572 }
8573 prec = prec*10 + (c - '0');
8574 }
8575 }
8576 } /* prec */
8577 if (fmtcnt >= 0) {
8578 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008579 if (--fmtcnt >= 0)
8580 c = *fmt++;
8581 }
8582 }
8583 if (fmtcnt < 0) {
8584 PyErr_SetString(PyExc_ValueError,
8585 "incomplete format");
8586 goto onError;
8587 }
8588 if (c != '%') {
8589 v = getnextarg(args, arglen, &argidx);
8590 if (v == NULL)
8591 goto onError;
8592 }
8593 sign = 0;
8594 fill = ' ';
8595 switch (c) {
8596
8597 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008598 pbuf = formatbuf;
8599 /* presume that buffer length is at least 1 */
8600 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008601 len = 1;
8602 break;
8603
8604 case 's':
8605 case 'r':
8606 if (PyUnicode_Check(v) && c == 's') {
8607 temp = v;
8608 Py_INCREF(temp);
8609 }
8610 else {
8611 PyObject *unicode;
8612 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008613 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008614 else
8615 temp = PyObject_Repr(v);
8616 if (temp == NULL)
8617 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008618 if (PyUnicode_Check(temp))
8619 /* nothing to do */;
8620 else if (PyString_Check(temp)) {
8621 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00008622 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00008623 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00008624 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008625 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008626 Py_DECREF(temp);
8627 temp = unicode;
8628 if (temp == NULL)
8629 goto onError;
8630 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008631 else {
8632 Py_DECREF(temp);
8633 PyErr_SetString(PyExc_TypeError,
8634 "%s argument has non-string str()");
8635 goto onError;
8636 }
8637 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008638 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008639 len = PyUnicode_GET_SIZE(temp);
8640 if (prec >= 0 && len > prec)
8641 len = prec;
8642 break;
8643
8644 case 'i':
8645 case 'd':
8646 case 'u':
8647 case 'o':
8648 case 'x':
8649 case 'X':
8650 if (c == 'i')
8651 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00008652 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008653 temp = formatlong(v, flags, prec, c);
8654 if (!temp)
8655 goto onError;
8656 pbuf = PyUnicode_AS_UNICODE(temp);
8657 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00008658 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008659 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008660 else {
8661 pbuf = formatbuf;
8662 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8663 flags, prec, c, v);
8664 if (len < 0)
8665 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008666 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008667 }
8668 if (flags & F_ZERO)
8669 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008670 break;
8671
8672 case 'e':
8673 case 'E':
8674 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008675 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008676 case 'g':
8677 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008678 if (c == 'F')
8679 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008680 pbuf = formatbuf;
8681 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8682 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008683 if (len < 0)
8684 goto onError;
8685 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008686 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008687 fill = '0';
8688 break;
8689
8690 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008691 pbuf = formatbuf;
8692 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008693 if (len < 0)
8694 goto onError;
8695 break;
8696
8697 default:
8698 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008699 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00008700 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008701 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008702 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00008703 (Py_ssize_t)(fmt - 1 -
8704 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008705 goto onError;
8706 }
8707 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008708 if (*pbuf == '-' || *pbuf == '+') {
8709 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008710 len--;
8711 }
8712 else if (flags & F_SIGN)
8713 sign = '+';
8714 else if (flags & F_BLANK)
8715 sign = ' ';
8716 else
8717 sign = 0;
8718 }
8719 if (width < len)
8720 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008721 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008722 reslen -= rescnt;
8723 rescnt = width + fmtcnt + 100;
8724 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008725 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008726 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008727 PyErr_NoMemory();
8728 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008729 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008730 if (_PyUnicode_Resize(&result, reslen) < 0) {
8731 Py_XDECREF(temp);
8732 goto onError;
8733 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008734 res = PyUnicode_AS_UNICODE(result)
8735 + reslen - rescnt;
8736 }
8737 if (sign) {
8738 if (fill != ' ')
8739 *res++ = sign;
8740 rescnt--;
8741 if (width > len)
8742 width--;
8743 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008744 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008745 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008746 assert(pbuf[1] == c);
8747 if (fill != ' ') {
8748 *res++ = *pbuf++;
8749 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008750 }
Tim Petersfff53252001-04-12 18:38:48 +00008751 rescnt -= 2;
8752 width -= 2;
8753 if (width < 0)
8754 width = 0;
8755 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008756 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008757 if (width > len && !(flags & F_LJUST)) {
8758 do {
8759 --rescnt;
8760 *res++ = fill;
8761 } while (--width > len);
8762 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008763 if (fill == ' ') {
8764 if (sign)
8765 *res++ = sign;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008766 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008767 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008768 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008769 *res++ = *pbuf++;
8770 *res++ = *pbuf++;
8771 }
8772 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008773 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008774 res += len;
8775 rescnt -= len;
8776 while (--width >= len) {
8777 --rescnt;
8778 *res++ = ' ';
8779 }
8780 if (dict && (argidx < arglen) && c != '%') {
8781 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008782 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008783 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008784 goto onError;
8785 }
8786 Py_XDECREF(temp);
8787 } /* '%' */
8788 } /* until end */
8789 if (argidx < arglen && !dict) {
8790 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008791 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008792 goto onError;
8793 }
8794
Thomas Woutersa96affe2006-03-12 00:29:36 +00008795 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8796 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008797 if (args_owned) {
8798 Py_DECREF(args);
8799 }
8800 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008801 return (PyObject *)result;
8802
8803 onError:
8804 Py_XDECREF(result);
8805 Py_DECREF(uformat);
8806 if (args_owned) {
8807 Py_DECREF(args);
8808 }
8809 return NULL;
8810}
8811
8812static PyBufferProcs unicode_as_buffer = {
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00008813 (getbufferproc) unicode_buffer_getbuffer,
8814 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008815};
8816
Jeremy Hylton938ace62002-07-17 16:30:39 +00008817static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008818unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8819
Tim Peters6d6c1a32001-08-02 04:15:00 +00008820static PyObject *
8821unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8822{
8823 PyObject *x = NULL;
Guido van Rossum55b4a7b2007-07-11 09:28:11 +00008824 static char *kwlist[] = {"object", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008825 char *encoding = NULL;
8826 char *errors = NULL;
8827
Guido van Rossume023fe02001-08-30 03:12:59 +00008828 if (type != &PyUnicode_Type)
8829 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008830 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8831 kwlist, &x, &encoding, &errors))
8832 return NULL;
8833 if (x == NULL)
8834 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008835 if (encoding == NULL && errors == NULL)
8836 return PyObject_Unicode(x);
8837 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008838 return PyUnicode_FromEncodedObject(x, encoding, errors);
8839}
8840
Guido van Rossume023fe02001-08-30 03:12:59 +00008841static PyObject *
8842unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8843{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008844 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008845 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008846
8847 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8848 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8849 if (tmp == NULL)
8850 return NULL;
8851 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008852 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008853 if (pnew == NULL) {
8854 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008855 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008856 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008857 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
8858 if (pnew->str == NULL) {
8859 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008860 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008861 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008862 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008863 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008864 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8865 pnew->length = n;
8866 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008867 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008868 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008869}
8870
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008871PyDoc_STRVAR(unicode_doc,
Collin Winterd474ce82007-08-07 19:42:11 +00008872"str(string [, encoding[, errors]]) -> object\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00008873\n\
Collin Winterd474ce82007-08-07 19:42:11 +00008874Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008875encoding defaults to the current default string encoding.\n\
8876errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008877
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008878static PyObject *unicode_iter(PyObject *seq);
8879
Guido van Rossumd57fd912000-03-10 22:53:23 +00008880PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00008881 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossum84fc66d2007-05-03 17:18:26 +00008882 "str", /* tp_name */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008883 sizeof(PyUnicodeObject), /* tp_size */
8884 0, /* tp_itemsize */
8885 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00008886 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008887 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008888 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008889 0, /* tp_setattr */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008890 0, /* tp_compare */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008891 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008892 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008893 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008894 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008895 (hashfunc) unicode_hash, /* tp_hash*/
8896 0, /* tp_call*/
8897 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008898 PyObject_GenericGetAttr, /* tp_getattro */
8899 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008900 &unicode_as_buffer, /* tp_as_buffer */
Thomas Wouters27d517b2007-02-25 20:39:11 +00008901 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
8902 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008903 unicode_doc, /* tp_doc */
8904 0, /* tp_traverse */
8905 0, /* tp_clear */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008906 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008907 0, /* tp_weaklistoffset */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008908 unicode_iter, /* tp_iter */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008909 0, /* tp_iternext */
8910 unicode_methods, /* tp_methods */
8911 0, /* tp_members */
8912 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00008913 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008914 0, /* tp_dict */
8915 0, /* tp_descr_get */
8916 0, /* tp_descr_set */
8917 0, /* tp_dictoffset */
8918 0, /* tp_init */
8919 0, /* tp_alloc */
8920 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008921 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008922};
8923
8924/* Initialize the Unicode implementation */
8925
Thomas Wouters78890102000-07-22 19:25:51 +00008926void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008927{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008928 int i;
8929
Thomas Wouters477c8d52006-05-27 19:21:47 +00008930 /* XXX - move this array to unicodectype.c ? */
8931 Py_UNICODE linebreak[] = {
8932 0x000A, /* LINE FEED */
8933 0x000D, /* CARRIAGE RETURN */
8934 0x001C, /* FILE SEPARATOR */
8935 0x001D, /* GROUP SEPARATOR */
8936 0x001E, /* RECORD SEPARATOR */
8937 0x0085, /* NEXT LINE */
8938 0x2028, /* LINE SEPARATOR */
8939 0x2029, /* PARAGRAPH SEPARATOR */
8940 };
8941
Fred Drakee4315f52000-05-09 19:53:39 +00008942 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008943 unicode_freelist = NULL;
8944 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008945 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008946 if (!unicode_empty)
8947 return;
8948
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008949 for (i = 0; i < 256; i++)
8950 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008951 if (PyType_Ready(&PyUnicode_Type) < 0)
8952 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008953
8954 /* initialize the linebreak bloom filter */
8955 bloom_linebreak = make_bloom_mask(
8956 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8957 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008958
8959 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008960}
8961
8962/* Finalize the Unicode implementation */
8963
8964void
Thomas Wouters78890102000-07-22 19:25:51 +00008965_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008966{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008967 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008968 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008969
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008970 Py_XDECREF(unicode_empty);
8971 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008972
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008973 for (i = 0; i < 256; i++) {
8974 if (unicode_latin1[i]) {
8975 Py_DECREF(unicode_latin1[i]);
8976 unicode_latin1[i] = NULL;
8977 }
8978 }
8979
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008980 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008981 PyUnicodeObject *v = u;
8982 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00008983 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00008984 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008985 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008986 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008987 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008988 unicode_freelist = NULL;
8989 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008990}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008991
Walter Dörwald16807132007-05-25 13:52:07 +00008992void
8993PyUnicode_InternInPlace(PyObject **p)
8994{
8995 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
8996 PyObject *t;
8997 if (s == NULL || !PyUnicode_Check(s))
8998 Py_FatalError(
8999 "PyUnicode_InternInPlace: unicode strings only please!");
9000 /* If it's a subclass, we don't really know what putting
9001 it in the interned dict might do. */
9002 if (!PyUnicode_CheckExact(s))
9003 return;
9004 if (PyUnicode_CHECK_INTERNED(s))
9005 return;
9006 if (interned == NULL) {
9007 interned = PyDict_New();
9008 if (interned == NULL) {
9009 PyErr_Clear(); /* Don't leave an exception */
9010 return;
9011 }
9012 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009013 /* It might be that the GetItem call fails even
9014 though the key is present in the dictionary,
9015 namely when this happens during a stack overflow. */
9016 Py_ALLOW_RECURSION
Walter Dörwald16807132007-05-25 13:52:07 +00009017 t = PyDict_GetItem(interned, (PyObject *)s);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009018 Py_END_ALLOW_RECURSION
9019
Walter Dörwald16807132007-05-25 13:52:07 +00009020 if (t) {
9021 Py_INCREF(t);
9022 Py_DECREF(*p);
9023 *p = t;
9024 return;
9025 }
9026
Martin v. Löwis5b222132007-06-10 09:51:05 +00009027 PyThreadState_GET()->recursion_critical = 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009028 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9029 PyErr_Clear();
Martin v. Löwis5b222132007-06-10 09:51:05 +00009030 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009031 return;
9032 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009033 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009034 /* The two references in interned are not counted by refcnt.
9035 The deallocator will take care of this */
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00009036 Py_Refcnt(s) -= 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009037 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
9038}
9039
9040void
9041PyUnicode_InternImmortal(PyObject **p)
9042{
9043 PyUnicode_InternInPlace(p);
9044 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9045 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9046 Py_INCREF(*p);
9047 }
9048}
9049
9050PyObject *
9051PyUnicode_InternFromString(const char *cp)
9052{
9053 PyObject *s = PyUnicode_FromString(cp);
9054 if (s == NULL)
9055 return NULL;
9056 PyUnicode_InternInPlace(&s);
9057 return s;
9058}
9059
9060void _Py_ReleaseInternedUnicodeStrings(void)
9061{
9062 PyObject *keys;
9063 PyUnicodeObject *s;
9064 Py_ssize_t i, n;
9065 Py_ssize_t immortal_size = 0, mortal_size = 0;
9066
9067 if (interned == NULL || !PyDict_Check(interned))
9068 return;
9069 keys = PyDict_Keys(interned);
9070 if (keys == NULL || !PyList_Check(keys)) {
9071 PyErr_Clear();
9072 return;
9073 }
9074
9075 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9076 detector, interned unicode strings are not forcibly deallocated;
9077 rather, we give them their stolen references back, and then clear
9078 and DECREF the interned dict. */
9079
9080 n = PyList_GET_SIZE(keys);
9081 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
9082 n);
9083 for (i = 0; i < n; i++) {
9084 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9085 switch (s->state) {
9086 case SSTATE_NOT_INTERNED:
9087 /* XXX Shouldn't happen */
9088 break;
9089 case SSTATE_INTERNED_IMMORTAL:
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00009090 Py_Refcnt(s) += 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009091 immortal_size += s->length;
9092 break;
9093 case SSTATE_INTERNED_MORTAL:
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00009094 Py_Refcnt(s) += 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009095 mortal_size += s->length;
9096 break;
9097 default:
9098 Py_FatalError("Inconsistent interned string state.");
9099 }
9100 s->state = SSTATE_NOT_INTERNED;
9101 }
9102 fprintf(stderr, "total size of all interned strings: "
9103 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9104 "mortal/immortal\n", mortal_size, immortal_size);
9105 Py_DECREF(keys);
9106 PyDict_Clear(interned);
9107 Py_DECREF(interned);
9108 interned = NULL;
9109}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009110
9111
9112/********************* Unicode Iterator **************************/
9113
9114typedef struct {
9115 PyObject_HEAD
Guido van Rossum49d6b072006-08-17 21:11:47 +00009116 Py_ssize_t it_index;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009117 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
9118} unicodeiterobject;
9119
9120static void
9121unicodeiter_dealloc(unicodeiterobject *it)
9122{
9123 _PyObject_GC_UNTRACK(it);
9124 Py_XDECREF(it->it_seq);
9125 PyObject_GC_Del(it);
9126}
9127
9128static int
9129unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9130{
9131 Py_VISIT(it->it_seq);
9132 return 0;
9133}
9134
9135static PyObject *
9136unicodeiter_next(unicodeiterobject *it)
9137{
9138 PyUnicodeObject *seq;
9139 PyObject *item;
9140
9141 assert(it != NULL);
9142 seq = it->it_seq;
9143 if (seq == NULL)
9144 return NULL;
9145 assert(PyUnicode_Check(seq));
9146
9147 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009148 item = PyUnicode_FromUnicode(
9149 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009150 if (item != NULL)
9151 ++it->it_index;
9152 return item;
9153 }
9154
9155 Py_DECREF(seq);
9156 it->it_seq = NULL;
9157 return NULL;
9158}
9159
9160static PyObject *
9161unicodeiter_len(unicodeiterobject *it)
9162{
9163 Py_ssize_t len = 0;
9164 if (it->it_seq)
9165 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9166 return PyInt_FromSsize_t(len);
9167}
9168
9169PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9170
9171static PyMethodDef unicodeiter_methods[] = {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009172 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
9173 length_hint_doc},
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009174 {NULL, NULL} /* sentinel */
9175};
9176
9177PyTypeObject PyUnicodeIter_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009178 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009179 "unicodeiterator", /* tp_name */
9180 sizeof(unicodeiterobject), /* tp_basicsize */
9181 0, /* tp_itemsize */
9182 /* methods */
Guido van Rossum49d6b072006-08-17 21:11:47 +00009183 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009184 0, /* tp_print */
9185 0, /* tp_getattr */
9186 0, /* tp_setattr */
9187 0, /* tp_compare */
9188 0, /* tp_repr */
9189 0, /* tp_as_number */
9190 0, /* tp_as_sequence */
9191 0, /* tp_as_mapping */
9192 0, /* tp_hash */
9193 0, /* tp_call */
9194 0, /* tp_str */
9195 PyObject_GenericGetAttr, /* tp_getattro */
9196 0, /* tp_setattro */
9197 0, /* tp_as_buffer */
9198 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9199 0, /* tp_doc */
9200 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9201 0, /* tp_clear */
9202 0, /* tp_richcompare */
9203 0, /* tp_weaklistoffset */
9204 PyObject_SelfIter, /* tp_iter */
9205 (iternextfunc)unicodeiter_next, /* tp_iternext */
9206 unicodeiter_methods, /* tp_methods */
9207 0,
9208};
9209
9210static PyObject *
9211unicode_iter(PyObject *seq)
9212{
9213 unicodeiterobject *it;
9214
9215 if (!PyUnicode_Check(seq)) {
9216 PyErr_BadInternalCall();
9217 return NULL;
9218 }
9219 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9220 if (it == NULL)
9221 return NULL;
9222 it->it_index = 0;
9223 Py_INCREF(seq);
9224 it->it_seq = (PyUnicodeObject *)seq;
9225 _PyObject_GC_TRACK(it);
9226 return (PyObject *)it;
9227}
9228
Martin v. Löwis5b222132007-06-10 09:51:05 +00009229size_t
9230Py_UNICODE_strlen(const Py_UNICODE *u)
9231{
9232 int res = 0;
9233 while(*u++)
9234 res++;
9235 return res;
9236}
9237
9238Py_UNICODE*
9239Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9240{
9241 Py_UNICODE *u = s1;
9242 while ((*u++ = *s2++));
9243 return s1;
9244}
9245
9246Py_UNICODE*
9247Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9248{
9249 Py_UNICODE *u = s1;
9250 while ((*u++ = *s2++))
9251 if (n-- == 0)
9252 break;
9253 return s1;
9254}
9255
9256int
9257Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9258{
9259 while (*s1 && *s2 && *s1 == *s2)
9260 s1++, s2++;
9261 if (*s1 && *s2)
9262 return (*s1 < *s2) ? -1 : +1;
9263 if (*s1)
9264 return 1;
9265 if (*s2)
9266 return -1;
9267 return 0;
9268}
9269
9270Py_UNICODE*
9271Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9272{
9273 const Py_UNICODE *p;
9274 for (p = s; *p; p++)
9275 if (*p == c)
9276 return (Py_UNICODE*)p;
9277 return NULL;
9278}
9279
9280
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009281#ifdef __cplusplus
9282}
9283#endif
9284
9285
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009286/*
9287Local variables:
9288c-basic-offset: 4
9289indent-tabs-mode: nil
9290End:
9291*/