blob: def9011d3baa784ee1a9eae64990e6aff5bd31ed [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Eric Smith8c663262007-08-25 02:26:07 +000048#include "formatter_unicode.h"
49
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000050#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000051#include <windows.h>
52#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000053
Guido van Rossumd57fd912000-03-10 22:53:23 +000054/* Limit for the Unicode object free list */
55
56#define MAX_UNICODE_FREELIST_SIZE 1024
57
58/* Limit for the Unicode object free list stay alive optimization.
59
60 The implementation will keep allocated Unicode memory intact for
61 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000062 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000063
Barry Warsaw51ac5802000-03-20 16:36:48 +000064 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000065 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000066 malloc()-overhead) bytes of unused garbage.
67
68 Setting the limit to 0 effectively turns the feature off.
69
Guido van Rossumfd4b9572000-04-10 13:51:10 +000070 Note: This is an experimental feature ! If you get core dumps when
71 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000072
73*/
74
Guido van Rossumfd4b9572000-04-10 13:51:10 +000075#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000076
77/* Endianness switches; defaults to little endian */
78
79#ifdef WORDS_BIGENDIAN
80# define BYTEORDER_IS_BIG_ENDIAN
81#else
82# define BYTEORDER_IS_LITTLE_ENDIAN
83#endif
84
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000085/* --- Globals ------------------------------------------------------------
86
87 The globals are initialized by the _PyUnicode_Init() API and should
88 not be used before calling that API.
89
90*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000091
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000092
93#ifdef __cplusplus
94extern "C" {
95#endif
96
Walter Dörwald16807132007-05-25 13:52:07 +000097/* This dictionary holds all interned unicode strings. Note that references
98 to strings in this dictionary are *not* counted in the string's ob_refcnt.
99 When the interned string reaches a refcnt of 0 the string deallocation
100 function will delete the reference from this dictionary.
101
   Another way to look at this is to say that the actual reference
103 count of a string is: s->ob_refcnt + (s->ob_sstate?2:0)
104*/
105static PyObject *interned;
106
Guido van Rossumd57fd912000-03-10 22:53:23 +0000107/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000108static PyUnicodeObject *unicode_freelist;
109static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000110
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000111/* The empty Unicode object is shared to improve performance. */
112static PyUnicodeObject *unicode_empty;
113
114/* Single character Unicode strings in the Latin-1 range are being
115 shared as well. */
116static PyUnicodeObject *unicode_latin1[256];
117
Fred Drakee4315f52000-05-09 19:53:39 +0000118/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000119 parameter; it is fixed to "utf-8". Always use the
120 PyUnicode_GetDefaultEncoding() API to access this global. */
121static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000122
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000123Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000124PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000125{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000126#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000127 return 0x10FFFF;
128#else
129 /* This is actually an illegal character, so it should
130 not be passed to unichr. */
131 return 0xFFFF;
132#endif
133}
134
Thomas Wouters477c8d52006-05-27 19:21:47 +0000135/* --- Bloom Filters ----------------------------------------------------- */
136
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the 5 least
   significant bits of each Unicode character as the bit index. */

/* The linebreak mask is set up by _PyUnicode_Init() below. */
142
/* Type used for a bloom bitmask.  Only bit positions 0..31 are ever used
   (the index is masked with 0x1F), which unsigned long always provides. */
#define BLOOM_MASK unsigned long

/* Bitmask used by BLOOM_LINEBREAK() as a cheap pre-filter. */
static BLOOM_MASK bloom_linebreak;

/* Evaluates to non-zero when the bit selected by the low 5 bits of `ch`
   is set in `mask`.  The constant is 1UL (not 1) so that shifting by 31
   never shifts into the sign bit of an int, and `mask` is parenthesized so
   that compound expressions can be passed safely. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* True when `ch` passes the bloom pre-filter and Py_UNICODE_ISLINEBREAK()
   confirms it.  Note that `ch` is evaluated more than once. */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
151
152Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
153{
154 /* calculate simple bloom-style bitmask for a given unicode string */
155
156 long mask;
157 Py_ssize_t i;
158
159 mask = 0;
160 for (i = 0; i < len; i++)
161 mask |= (1 << (ptr[i] & 0x1F));
162
163 return mask;
164}
165
166Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
167{
168 Py_ssize_t i;
169
170 for (i = 0; i < setlen; i++)
171 if (set[i] == chr)
172 return 1;
173
174 return 0;
175}
176
/* True when chr passes the bloom pre-filter for mask and is then confirmed
   by a scan of set[0..setlen).  The whole expansion is parenthesized so
   the macro can be negated or combined with other operators safely.
   Note that chr is evaluated more than once. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
179
Guido van Rossumd57fd912000-03-10 22:53:23 +0000180/* --- Unicode Object ----------------------------------------------------- */
181
182static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000183int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000184 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000185{
186 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000187
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000188 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000190 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000191
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000192 /* Resizing shared object (unicode_empty or single character
193 objects) in-place is not allowed. Use PyUnicode_Resize()
194 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000195
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000196 if (unicode == unicode_empty ||
197 (unicode->length == 1 &&
198 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000200 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000201 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 return -1;
203 }
204
Thomas Wouters477c8d52006-05-27 19:21:47 +0000205 /* We allocate one more byte to make sure the string is Ux0000 terminated.
206 The overallocation is also used by fastsearch, which assumes that it's
207 safe to look at str[length] (without making any assumptions about what
208 it contains). */
209
Guido van Rossumd57fd912000-03-10 22:53:23 +0000210 oldstr = unicode->str;
211 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
212 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000213 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000214 PyErr_NoMemory();
215 return -1;
216 }
217 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000218 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000219
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000220 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000221 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000222 if (unicode->defenc) {
223 Py_DECREF(unicode->defenc);
224 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000225 }
226 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000227
Guido van Rossumd57fd912000-03-10 22:53:23 +0000228 return 0;
229}
230
/* We allocate one more Py_UNICODE element than requested to make sure the
   string is Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000234
235 XXX This allocator could further be enhanced by assuring that the
236 free list never reduces its size below 1.
237
238*/
239
240static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000241PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242{
243 register PyUnicodeObject *unicode;
244
Thomas Wouters477c8d52006-05-27 19:21:47 +0000245 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000246 if (length == 0 && unicode_empty != NULL) {
247 Py_INCREF(unicode_empty);
248 return unicode_empty;
249 }
250
251 /* Unicode freelist & memory allocation */
252 if (unicode_freelist) {
253 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000254 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000255 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000256 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000257 /* Keep-Alive optimization: we only upsize the buffer,
258 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000259 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000260 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000261 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000262 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 }
264 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000265 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000267 }
268 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000269 }
270 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000271 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000272 if (unicode == NULL)
273 return NULL;
274 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
275 }
276
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000277 if (!unicode->str) {
278 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000279 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000280 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000281 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000282 * the caller fails before initializing str -- unicode_resize()
283 * reads str[0], and the Keep-Alive optimization can keep memory
284 * allocated for str alive across a call to unicode_dealloc(unicode).
285 * We don't want unicode_resize to read uninitialized memory in
286 * that case.
287 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000288 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000289 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000290 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000292 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000293 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000294 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000295
296 onError:
297 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000298 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000299 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000300}
301
302static
Guido van Rossum9475a232001-10-05 20:51:39 +0000303void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000304{
Walter Dörwald16807132007-05-25 13:52:07 +0000305 switch (PyUnicode_CHECK_INTERNED(unicode)) {
306 case SSTATE_NOT_INTERNED:
307 break;
308
309 case SSTATE_INTERNED_MORTAL:
310 /* revive dead object temporarily for DelItem */
Martin v. Löwis5d7428b2007-07-21 18:47:48 +0000311 Py_Refcnt(unicode) = 3;
Walter Dörwald16807132007-05-25 13:52:07 +0000312 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
313 Py_FatalError(
314 "deletion of interned unicode string failed");
315 break;
316
317 case SSTATE_INTERNED_IMMORTAL:
318 Py_FatalError("Immortal interned unicode string died.");
319
320 default:
321 Py_FatalError("Inconsistent interned unicode string state.");
322 }
323
Guido van Rossum604ddf82001-12-06 20:03:56 +0000324 if (PyUnicode_CheckExact(unicode) &&
325 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000326 /* Keep-Alive optimization */
327 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000328 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000329 unicode->str = NULL;
330 unicode->length = 0;
331 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000332 if (unicode->defenc) {
333 Py_DECREF(unicode->defenc);
334 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000335 }
336 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000337 *(PyUnicodeObject **)unicode = unicode_freelist;
338 unicode_freelist = unicode;
339 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000340 }
341 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000342 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000343 Py_XDECREF(unicode->defenc);
Martin v. Löwis9f2e3462007-07-21 17:22:18 +0000344 Py_Type(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000345 }
346}
347
Martin v. Löwis18e16552006-02-15 17:27:45 +0000348int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000349{
350 register PyUnicodeObject *v;
351
352 /* Argument checks */
353 if (unicode == NULL) {
354 PyErr_BadInternalCall();
355 return -1;
356 }
357 v = (PyUnicodeObject *)*unicode;
Martin v. Löwis9f2e3462007-07-21 17:22:18 +0000358 if (v == NULL || !PyUnicode_Check(v) || Py_Refcnt(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000359 PyErr_BadInternalCall();
360 return -1;
361 }
362
363 /* Resizing unicode_empty and single character objects is not
364 possible since these are being shared. We simply return a fresh
365 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000366 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000367 (v == unicode_empty || v->length == 1)) {
368 PyUnicodeObject *w = _PyUnicode_New(length);
369 if (w == NULL)
370 return -1;
371 Py_UNICODE_COPY(w->str, v->str,
372 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000373 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000374 *unicode = (PyObject *)w;
375 return 0;
376 }
377
378 /* Note that we don't have to modify *unicode for unshared Unicode
379 objects, since we can modify them in-place. */
380 return unicode_resize(v, length);
381}
382
/* Internal API for use in unicodeobject.c only!
   Calls PyUnicode_Resize() after casting the first argument to
   PyObject **. */
#define _PyUnicode_Resize(unicodevar, length) \
    PyUnicode_Resize((PyObject **)(unicodevar), (length))
386
Guido van Rossumd57fd912000-03-10 22:53:23 +0000387PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000388 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389{
390 PyUnicodeObject *unicode;
391
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000392 /* If the Unicode data is known at construction time, we can apply
393 some optimizations which share commonly used objects. */
394 if (u != NULL) {
395
396 /* Optimization for empty strings */
397 if (size == 0 && unicode_empty != NULL) {
398 Py_INCREF(unicode_empty);
399 return (PyObject *)unicode_empty;
400 }
401
402 /* Single character Unicode objects in the Latin-1 range are
403 shared when using this constructor */
404 if (size == 1 && *u < 256) {
405 unicode = unicode_latin1[*u];
406 if (!unicode) {
407 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000408 if (!unicode)
409 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000410 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000411 unicode_latin1[*u] = unicode;
412 }
413 Py_INCREF(unicode);
414 return (PyObject *)unicode;
415 }
416 }
Tim Petersced69f82003-09-16 20:30:58 +0000417
Guido van Rossumd57fd912000-03-10 22:53:23 +0000418 unicode = _PyUnicode_New(size);
419 if (!unicode)
420 return NULL;
421
422 /* Copy the Unicode data into the new object */
423 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000424 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000425
426 return (PyObject *)unicode;
427}
428
Walter Dörwaldd2034312007-05-18 16:29:38 +0000429PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000430{
431 PyUnicodeObject *unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000432 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000433 some optimizations which share commonly used objects.
434 Also, this means the input must be UTF-8, so fall back to the
435 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000436 if (u != NULL) {
437
438 /* Optimization for empty strings */
439 if (size == 0 && unicode_empty != NULL) {
440 Py_INCREF(unicode_empty);
441 return (PyObject *)unicode_empty;
442 }
443
Martin v. Löwis9c121062007-08-05 20:26:11 +0000444 /* Single characters are shared when using this constructor.
445 Restrict to ASCII, since the input must be UTF-8. */
446 if (size == 1 && Py_CHARMASK(*u) < 128) {
Guido van Rossum00058aa2007-07-19 18:21:28 +0000447 unicode = unicode_latin1[Py_CHARMASK(*u)];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000448 if (!unicode) {
449 unicode = _PyUnicode_New(1);
450 if (!unicode)
451 return NULL;
Guido van Rossum00058aa2007-07-19 18:21:28 +0000452 unicode->str[0] = Py_CHARMASK(*u);
453 unicode_latin1[Py_CHARMASK(*u)] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000454 }
455 Py_INCREF(unicode);
456 return (PyObject *)unicode;
457 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000458
459 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000460 }
461
Walter Dörwald55507312007-05-18 13:12:10 +0000462 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000463 if (!unicode)
464 return NULL;
465
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000466 return (PyObject *)unicode;
467}
468
Walter Dörwaldd2034312007-05-18 16:29:38 +0000469PyObject *PyUnicode_FromString(const char *u)
470{
471 size_t size = strlen(u);
472 if (size > PY_SSIZE_T_MAX) {
473 PyErr_SetString(PyExc_OverflowError, "input too long");
474 return NULL;
475 }
476
477 return PyUnicode_FromStringAndSize(u, size);
478}
479
Guido van Rossumd57fd912000-03-10 22:53:23 +0000480#ifdef HAVE_WCHAR_H
481
482PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000483 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000484{
485 PyUnicodeObject *unicode;
486
487 if (w == NULL) {
488 PyErr_BadInternalCall();
489 return NULL;
490 }
491
492 unicode = _PyUnicode_New(size);
493 if (!unicode)
494 return NULL;
495
496 /* Copy the wchar_t data into the new object */
497#ifdef HAVE_USABLE_WCHAR_T
498 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000499#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000500 {
501 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000502 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000503 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000504 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000505 *u++ = *w++;
506 }
507#endif
508
509 return (PyObject *)unicode;
510}
511
Walter Dörwald346737f2007-05-31 10:44:43 +0000512static void
513makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
514{
515 *fmt++ = '%';
516 if (width) {
517 if (zeropad)
518 *fmt++ = '0';
519 fmt += sprintf(fmt, "%d", width);
520 }
521 if (precision)
522 fmt += sprintf(fmt, ".%d", precision);
523 if (longflag)
524 *fmt++ = 'l';
525 else if (size_tflag) {
526 char *f = PY_FORMAT_SIZE_T;
527 while (*f)
528 *fmt++ = *f++;
529 }
530 *fmt++ = c;
531 *fmt = '\0';
532}
533
/* Append the NUL-terminated char string `string` to the output, one element
   at a time.  Relies on two variables at the point of use: `copy` (scratch
   const char pointer) and `s` (output cursor, advanced past the copied
   data). */
#define appendstring(string) \
    {for (copy = (string); *copy; ) *s++ = *copy++;}
535
536PyObject *
537PyUnicode_FromFormatV(const char *format, va_list vargs)
538{
539 va_list count;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000540 Py_ssize_t callcount = 0;
541 PyObject **callresults = NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +0000542 PyObject **callresult = NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000543 Py_ssize_t n = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000544 int width = 0;
545 int precision = 0;
546 int zeropad;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000547 const char* f;
548 Py_UNICODE *s;
549 PyObject *string;
550 /* used by sprintf */
551 char buffer[21];
Walter Dörwald346737f2007-05-31 10:44:43 +0000552 /* use abuffer instead of buffer, if we need more space
553 * (which can happen if there's a format specifier with width). */
554 char *abuffer = NULL;
555 char *realbuffer;
556 Py_ssize_t abuffersize = 0;
557 char fmt[60]; /* should be enough for %0width.precisionld */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000558 const char *copy;
559
560#ifdef VA_LIST_IS_ARRAY
561 Py_MEMCPY(count, vargs, sizeof(va_list));
562#else
563#ifdef __va_copy
564 __va_copy(count, vargs);
565#else
566 count = vargs;
567#endif
568#endif
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000569 /* step 1: count the number of %S/%R format specifications
570 * (we call PyObject_Unicode()/PyObject_Repr() for these objects
571 * once during step 3 and put the result in an array) */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000572 for (f = format; *f; f++) {
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000573 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000574 ++callcount;
575 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000576 /* step 2: allocate memory for the results of
577 * PyObject_Unicode()/PyObject_Repr() calls */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000578 if (callcount) {
579 callresults = PyMem_Malloc(sizeof(PyObject *)*callcount);
580 if (!callresults) {
581 PyErr_NoMemory();
582 return NULL;
583 }
584 callresult = callresults;
585 }
586 /* step 3: figure out how large a buffer we need */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000587 for (f = format; *f; f++) {
588 if (*f == '%') {
589 const char* p = f;
Walter Dörwald346737f2007-05-31 10:44:43 +0000590 width = 0;
591 while (isdigit(Py_CHARMASK(*f)))
592 width = (width*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000593 while (*++f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
594 ;
595
596 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
597 * they don't affect the amount of space we reserve.
598 */
599 if ((*f == 'l' || *f == 'z') &&
600 (f[1] == 'd' || f[1] == 'u'))
Eric Smithddd25822007-08-27 11:33:42 +0000601 ++f;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000602
603 switch (*f) {
604 case 'c':
605 (void)va_arg(count, int);
606 /* fall through... */
607 case '%':
608 n++;
609 break;
610 case 'd': case 'u': case 'i': case 'x':
611 (void) va_arg(count, int);
612 /* 20 bytes is enough to hold a 64-bit
613 integer. Decimal takes the most space.
Walter Dörwald346737f2007-05-31 10:44:43 +0000614 This isn't enough for octal.
615 If a width is specified we need more
616 (which we allocate later). */
617 if (width < 20)
618 width = 20;
619 n += width;
620 if (abuffersize < width)
621 abuffersize = width;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000622 break;
623 case 's':
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000624 {
625 /* UTF-8 */
626 unsigned char*s;
627 s = va_arg(count, unsigned char*);
628 while (*s) {
629 if (*s < 128) {
630 n++; s++;
631 } else if (*s < 0xc0) {
632 /* invalid UTF-8 */
633 n++; s++;
634 } else if (*s < 0xc0) {
635 n++;
636 s++; if(!*s)break;
637 s++;
638 } else if (*s < 0xe0) {
639 n++;
640 s++; if(!*s)break;
641 s++; if(!*s)break;
642 s++;
643 } else {
644 #ifdef Py_UNICODE_WIDE
645 n++;
646 #else
647 n+=2;
648 #endif
649 s++; if(!*s)break;
650 s++; if(!*s)break;
651 s++; if(!*s)break;
652 s++;
653 }
654 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000655 break;
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000656 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000657 case 'U':
658 {
659 PyObject *obj = va_arg(count, PyObject *);
660 assert(obj && PyUnicode_Check(obj));
661 n += PyUnicode_GET_SIZE(obj);
662 break;
663 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000664 case 'V':
665 {
666 PyObject *obj = va_arg(count, PyObject *);
667 const char *str = va_arg(count, const char *);
668 assert(obj || str);
669 assert(!obj || PyUnicode_Check(obj));
670 if (obj)
671 n += PyUnicode_GET_SIZE(obj);
672 else
673 n += strlen(str);
674 break;
675 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000676 case 'S':
677 {
678 PyObject *obj = va_arg(count, PyObject *);
679 PyObject *str;
680 assert(obj);
681 str = PyObject_Unicode(obj);
682 if (!str)
683 goto fail;
684 n += PyUnicode_GET_SIZE(str);
685 /* Remember the str and switch to the next slot */
686 *callresult++ = str;
687 break;
688 }
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000689 case 'R':
690 {
691 PyObject *obj = va_arg(count, PyObject *);
692 PyObject *repr;
693 assert(obj);
694 repr = PyObject_Repr(obj);
695 if (!repr)
696 goto fail;
697 n += PyUnicode_GET_SIZE(repr);
698 /* Remember the repr and switch to the next slot */
699 *callresult++ = repr;
700 break;
701 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000702 case 'p':
703 (void) va_arg(count, int);
704 /* maximum 64-bit pointer representation:
705 * 0xffffffffffffffff
706 * so 19 characters is enough.
707 * XXX I count 18 -- what's the extra for?
708 */
709 n += 19;
710 break;
711 default:
712 /* if we stumble upon an unknown
713 formatting code, copy the rest of
714 the format string to the output
715 string. (we cannot just skip the
716 code, since there's no way to know
717 what's in the argument list) */
718 n += strlen(p);
719 goto expand;
720 }
721 } else
722 n++;
723 }
724 expand:
Walter Dörwald346737f2007-05-31 10:44:43 +0000725 if (abuffersize > 20) {
726 abuffer = PyMem_Malloc(abuffersize);
727 if (!abuffer) {
728 PyErr_NoMemory();
729 goto fail;
730 }
731 realbuffer = abuffer;
732 }
733 else
734 realbuffer = buffer;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000735 /* step 4: fill the buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000736 /* Since we've analyzed how much space we need for the worst case,
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000737 we don't have to resize the string.
738 There can be no errors beyond this point. */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000739 string = PyUnicode_FromUnicode(NULL, n);
740 if (!string)
Walter Dörwald346737f2007-05-31 10:44:43 +0000741 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000742
743 s = PyUnicode_AS_UNICODE(string);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000744 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000745
746 for (f = format; *f; f++) {
747 if (*f == '%') {
748 const char* p = f++;
749 int longflag = 0;
750 int size_tflag = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000751 zeropad = (*f == '0');
752 /* parse the width.precision part */
753 width = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000754 while (isdigit(Py_CHARMASK(*f)))
Walter Dörwald346737f2007-05-31 10:44:43 +0000755 width = (width*10) + *f++ - '0';
756 precision = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000757 if (*f == '.') {
758 f++;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000759 while (isdigit(Py_CHARMASK(*f)))
Walter Dörwald346737f2007-05-31 10:44:43 +0000760 precision = (precision*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000761 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000762 /* handle the long flag, but only for %ld and %lu.
763 others can be added when necessary. */
764 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
765 longflag = 1;
766 ++f;
767 }
768 /* handle the size_t flag. */
769 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
770 size_tflag = 1;
771 ++f;
772 }
773
774 switch (*f) {
775 case 'c':
776 *s++ = va_arg(vargs, int);
777 break;
778 case 'd':
Walter Dörwald346737f2007-05-31 10:44:43 +0000779 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000780 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000781 sprintf(realbuffer, fmt, va_arg(vargs, long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000782 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000783 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000784 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000785 sprintf(realbuffer, fmt, va_arg(vargs, int));
786 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000787 break;
788 case 'u':
Walter Dörwald346737f2007-05-31 10:44:43 +0000789 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000790 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000791 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000792 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000793 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000794 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000795 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
796 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000797 break;
798 case 'i':
Walter Dörwald346737f2007-05-31 10:44:43 +0000799 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
800 sprintf(realbuffer, fmt, va_arg(vargs, int));
801 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000802 break;
803 case 'x':
Walter Dörwald346737f2007-05-31 10:44:43 +0000804 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
805 sprintf(realbuffer, fmt, va_arg(vargs, int));
806 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000807 break;
808 case 's':
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000809 {
810 /* Parameter must be UTF-8 encoded.
811 In case of encoding errors, use
812 the replacement character. */
813 PyObject *u;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000814 p = va_arg(vargs, char*);
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000815 u = PyUnicode_DecodeUTF8(p, strlen(p),
816 "replace");
817 if (!u)
818 goto fail;
819 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
820 PyUnicode_GET_SIZE(u));
821 s += PyUnicode_GET_SIZE(u);
822 Py_DECREF(u);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000823 break;
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000824 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000825 case 'U':
826 {
827 PyObject *obj = va_arg(vargs, PyObject *);
Walter Dörwald5c2fab62007-05-24 19:51:02 +0000828 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
829 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
830 s += size;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000831 break;
832 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000833 case 'V':
834 {
835 PyObject *obj = va_arg(vargs, PyObject *);
836 const char *str = va_arg(vargs, const char *);
837 if (obj) {
838 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
839 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
840 s += size;
841 } else {
842 appendstring(str);
843 }
844 break;
845 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000846 case 'S':
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000847 case 'R':
848 {
Guido van Rossum755114a2007-06-13 01:04:27 +0000849 Py_UNICODE *ucopy;
850 Py_ssize_t usize;
851 Py_ssize_t upos;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000852 /* unused, since we already have the result */
853 (void) va_arg(vargs, PyObject *);
Guido van Rossum755114a2007-06-13 01:04:27 +0000854 ucopy = PyUnicode_AS_UNICODE(*callresult);
855 usize = PyUnicode_GET_SIZE(*callresult);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000856 for (upos = 0; upos<usize;)
857 *s++ = ucopy[upos++];
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000858 /* We're done with the unicode()/repr() => forget it */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000859 Py_DECREF(*callresult);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000860 /* switch to next unicode()/repr() result */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000861 ++callresult;
862 break;
863 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000864 case 'p':
865 sprintf(buffer, "%p", va_arg(vargs, void*));
866 /* %p is ill-defined: ensure leading 0x. */
867 if (buffer[1] == 'X')
868 buffer[1] = 'x';
869 else if (buffer[1] != 'x') {
870 memmove(buffer+2, buffer, strlen(buffer)+1);
871 buffer[0] = '0';
872 buffer[1] = 'x';
873 }
874 appendstring(buffer);
875 break;
876 case '%':
877 *s++ = '%';
878 break;
879 default:
880 appendstring(p);
881 goto end;
882 }
883 } else
884 *s++ = *f;
885 }
886
887 end:
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000888 if (callresults)
889 PyMem_Free(callresults);
Walter Dörwald346737f2007-05-31 10:44:43 +0000890 if (abuffer)
891 PyMem_Free(abuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000892 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
893 return string;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000894 fail:
895 if (callresults) {
896 PyObject **callresult2 = callresults;
Guido van Rossum307fa8c2007-07-16 20:46:27 +0000897 while (callresult2 < callresult) {
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000898 Py_DECREF(*callresult2);
899 ++callresult2;
900 }
901 PyMem_Free(callresults);
902 }
Walter Dörwald346737f2007-05-31 10:44:43 +0000903 if (abuffer)
904 PyMem_Free(abuffer);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000905 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000906}
907
908#undef appendstring
909
910PyObject *
911PyUnicode_FromFormat(const char *format, ...)
912{
913 PyObject* ret;
914 va_list vargs;
915
916#ifdef HAVE_STDARG_PROTOTYPES
917 va_start(vargs, format);
918#else
919 va_start(vargs);
920#endif
921 ret = PyUnicode_FromFormatV(format, vargs);
922 va_end(vargs);
923 return ret;
924}
925
Martin v. Löwis18e16552006-02-15 17:27:45 +0000926Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
927 wchar_t *w,
928 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000929{
930 if (unicode == NULL) {
931 PyErr_BadInternalCall();
932 return -1;
933 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000934
935 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000936 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000937 size = PyUnicode_GET_SIZE(unicode) + 1;
938
Guido van Rossumd57fd912000-03-10 22:53:23 +0000939#ifdef HAVE_USABLE_WCHAR_T
940 memcpy(w, unicode->str, size * sizeof(wchar_t));
941#else
942 {
943 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000944 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000945 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000946 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000947 *w++ = *u++;
948 }
949#endif
950
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000951 if (size > PyUnicode_GET_SIZE(unicode))
952 return PyUnicode_GET_SIZE(unicode);
953 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000954 return size;
955}
956
957#endif
958
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000959PyObject *PyUnicode_FromOrdinal(int ordinal)
960{
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000961 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000962
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000963 if (ordinal < 0 || ordinal > 0x10ffff) {
964 PyErr_SetString(PyExc_ValueError,
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000965 "chr() arg not in range(0x110000)");
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000966 return NULL;
967 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000968
969#ifndef Py_UNICODE_WIDE
970 if (ordinal > 0xffff) {
971 ordinal -= 0x10000;
972 s[0] = 0xD800 | (ordinal >> 10);
973 s[1] = 0xDC00 | (ordinal & 0x3FF);
974 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000975 }
976#endif
977
Hye-Shik Chang40574832004-04-06 07:24:51 +0000978 s[0] = (Py_UNICODE)ordinal;
979 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000980}
981
Guido van Rossumd57fd912000-03-10 22:53:23 +0000982PyObject *PyUnicode_FromObject(register PyObject *obj)
983{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000984 /* XXX Perhaps we should make this API an alias of
985 PyObject_Unicode() instead ?! */
986 if (PyUnicode_CheckExact(obj)) {
987 Py_INCREF(obj);
988 return obj;
989 }
990 if (PyUnicode_Check(obj)) {
991 /* For a Unicode subtype that's not a Unicode object,
992 return a true Unicode object with the same data. */
993 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
994 PyUnicode_GET_SIZE(obj));
995 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000996 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
997}
998
999PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1000 const char *encoding,
1001 const char *errors)
1002{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001003 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001004 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001005 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001006
Guido van Rossumd57fd912000-03-10 22:53:23 +00001007 if (obj == NULL) {
1008 PyErr_BadInternalCall();
1009 return NULL;
1010 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001011
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001012 if (PyUnicode_Check(obj)) {
1013 PyErr_SetString(PyExc_TypeError,
1014 "decoding Unicode is not supported");
1015 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001016 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001017
1018 /* Coerce object */
1019 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001020 s = PyString_AS_STRING(obj);
1021 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001022 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001023 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1024 /* Overwrite the error message with something more useful in
1025 case of a TypeError. */
1026 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001027 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001028 "coercing to Unicode: need string or buffer, "
1029 "%.80s found",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00001030 Py_Type(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001031 goto onError;
1032 }
Tim Petersced69f82003-09-16 20:30:58 +00001033
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001034 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001035 if (len == 0) {
1036 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001037 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001038 }
Tim Petersced69f82003-09-16 20:30:58 +00001039 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001040 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001041
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001042 return v;
1043
1044 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001045 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001046}
1047
1048PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001049 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001050 const char *encoding,
1051 const char *errors)
1052{
1053 PyObject *buffer = NULL, *unicode;
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001054 Py_buffer info;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001055
1056 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001057 encoding = PyUnicode_GetDefaultEncoding();
1058
1059 /* Shortcuts for common default encodings */
1060 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001061 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001062 else if (strcmp(encoding, "latin-1") == 0)
1063 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001064#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1065 else if (strcmp(encoding, "mbcs") == 0)
1066 return PyUnicode_DecodeMBCS(s, size, errors);
1067#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001068 else if (strcmp(encoding, "ascii") == 0)
1069 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001070
1071 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001072 buffer = NULL;
1073 if (PyBuffer_FillInfo(&info, (void *)s, size, 1, PyBUF_SIMPLE) < 0)
1074 goto onError;
1075 buffer = PyMemoryView_FromMemory(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001076 if (buffer == NULL)
1077 goto onError;
1078 unicode = PyCodec_Decode(buffer, encoding, errors);
1079 if (unicode == NULL)
1080 goto onError;
1081 if (!PyUnicode_Check(unicode)) {
1082 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001083 "decoder did not return an unicode object (type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00001084 Py_Type(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001085 Py_DECREF(unicode);
1086 goto onError;
1087 }
1088 Py_DECREF(buffer);
1089 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001090
Guido van Rossumd57fd912000-03-10 22:53:23 +00001091 onError:
1092 Py_XDECREF(buffer);
1093 return NULL;
1094}
1095
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001096PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1097 const char *encoding,
1098 const char *errors)
1099{
1100 PyObject *v;
1101
1102 if (!PyUnicode_Check(unicode)) {
1103 PyErr_BadArgument();
1104 goto onError;
1105 }
1106
1107 if (encoding == NULL)
1108 encoding = PyUnicode_GetDefaultEncoding();
1109
1110 /* Decode via the codec registry */
1111 v = PyCodec_Decode(unicode, encoding, errors);
1112 if (v == NULL)
1113 goto onError;
1114 return v;
1115
1116 onError:
1117 return NULL;
1118}
1119
Guido van Rossumd57fd912000-03-10 22:53:23 +00001120PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001121 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001122 const char *encoding,
1123 const char *errors)
1124{
1125 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001126
Guido van Rossumd57fd912000-03-10 22:53:23 +00001127 unicode = PyUnicode_FromUnicode(s, size);
1128 if (unicode == NULL)
1129 return NULL;
1130 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1131 Py_DECREF(unicode);
1132 return v;
1133}
1134
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001135PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1136 const char *encoding,
1137 const char *errors)
1138{
1139 PyObject *v;
1140
1141 if (!PyUnicode_Check(unicode)) {
1142 PyErr_BadArgument();
1143 goto onError;
1144 }
1145
1146 if (encoding == NULL)
1147 encoding = PyUnicode_GetDefaultEncoding();
1148
1149 /* Encode via the codec registry */
1150 v = PyCodec_Encode(unicode, encoding, errors);
1151 if (v == NULL)
1152 goto onError;
1153 return v;
1154
1155 onError:
1156 return NULL;
1157}
1158
Guido van Rossumd57fd912000-03-10 22:53:23 +00001159PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1160 const char *encoding,
1161 const char *errors)
1162{
1163 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001164
Guido van Rossumd57fd912000-03-10 22:53:23 +00001165 if (!PyUnicode_Check(unicode)) {
1166 PyErr_BadArgument();
1167 goto onError;
1168 }
Fred Drakee4315f52000-05-09 19:53:39 +00001169
Tim Petersced69f82003-09-16 20:30:58 +00001170 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001171 encoding = PyUnicode_GetDefaultEncoding();
1172
1173 /* Shortcuts for common default encodings */
1174 if (errors == NULL) {
1175 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001176 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001177 else if (strcmp(encoding, "latin-1") == 0)
1178 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001179#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1180 else if (strcmp(encoding, "mbcs") == 0)
1181 return PyUnicode_AsMBCSString(unicode);
1182#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001183 else if (strcmp(encoding, "ascii") == 0)
1184 return PyUnicode_AsASCIIString(unicode);
1185 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001186
1187 /* Encode via the codec registry */
1188 v = PyCodec_Encode(unicode, encoding, errors);
1189 if (v == NULL)
1190 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001191 if (!PyBytes_Check(v)) {
1192 if (PyString_Check(v)) {
1193 /* Old codec, turn it into bytes */
1194 PyObject *b = PyBytes_FromObject(v);
1195 Py_DECREF(v);
1196 return b;
1197 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001198 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001199 "encoder did not return a bytes object "
1200 "(type=%.400s, encoding=%.20s, errors=%.20s)",
1201 v->ob_type->tp_name,
1202 encoding ? encoding : "NULL",
1203 errors ? errors : "NULL");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001204 Py_DECREF(v);
1205 goto onError;
1206 }
1207 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001208
Guido van Rossumd57fd912000-03-10 22:53:23 +00001209 onError:
1210 return NULL;
1211}
1212
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001213PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1214 const char *errors)
1215{
1216 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001217 PyObject *b;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001218 if (v)
1219 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001220 if (errors != NULL)
1221 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum06610092007-08-16 21:02:22 +00001222 b = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1223 PyUnicode_GET_SIZE(unicode),
1224 NULL);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001225 if (!b)
1226 return NULL;
1227 v = PyString_FromStringAndSize(PyBytes_AsString(b),
1228 PyBytes_Size(b));
1229 Py_DECREF(b);
Guido van Rossume7a0d392007-07-12 07:53:00 +00001230 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001231 return v;
1232}
1233
Martin v. Löwis5b222132007-06-10 09:51:05 +00001234char*
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001235PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001236{
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001237 PyObject *str8;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001238 if (!PyUnicode_Check(unicode)) {
1239 PyErr_BadArgument();
1240 return NULL;
1241 }
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001242 str8 = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1243 if (str8 == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001244 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001245 if (psize != NULL)
1246 *psize = PyString_GET_SIZE(str8);
1247 return PyString_AS_STRING(str8);
1248}
1249
1250char*
1251PyUnicode_AsString(PyObject *unicode)
1252{
1253 return PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001254}
1255
Guido van Rossumd57fd912000-03-10 22:53:23 +00001256Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1257{
1258 if (!PyUnicode_Check(unicode)) {
1259 PyErr_BadArgument();
1260 goto onError;
1261 }
1262 return PyUnicode_AS_UNICODE(unicode);
1263
1264 onError:
1265 return NULL;
1266}
1267
Martin v. Löwis18e16552006-02-15 17:27:45 +00001268Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001269{
1270 if (!PyUnicode_Check(unicode)) {
1271 PyErr_BadArgument();
1272 goto onError;
1273 }
1274 return PyUnicode_GET_SIZE(unicode);
1275
1276 onError:
1277 return -1;
1278}
1279
Thomas Wouters78890102000-07-22 19:25:51 +00001280const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001281{
1282 return unicode_default_encoding;
1283}
1284
1285int PyUnicode_SetDefaultEncoding(const char *encoding)
1286{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001287 if (strcmp(encoding, unicode_default_encoding) != 0) {
1288 PyErr_Format(PyExc_ValueError,
1289 "Can only set default encoding to %s",
1290 unicode_default_encoding);
1291 return -1;
1292 }
Fred Drakee4315f52000-05-09 19:53:39 +00001293 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001294}
1295
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001296/* error handling callback helper:
1297 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001298 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001299 and adjust various state variables.
1300 return 0 on success, -1 on error
1301*/
1302
1303static
1304int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1305 const char *encoding, const char *reason,
Walter Dörwalda651d3d2007-08-30 15:29:21 +00001306 const char **input, const char **inend, Py_ssize_t *startinpos,
1307 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001308 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001309{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001310 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001311
1312 PyObject *restuple = NULL;
1313 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001314 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001315 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001316 Py_ssize_t requiredsize;
1317 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001318 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001319 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001320 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001321 int res = -1;
1322
1323 if (*errorHandler == NULL) {
1324 *errorHandler = PyCodec_LookupError(errors);
1325 if (*errorHandler == NULL)
1326 goto onError;
1327 }
1328
1329 if (*exceptionObject == NULL) {
1330 *exceptionObject = PyUnicodeDecodeError_Create(
Walter Dörwalde78178e2007-07-30 13:31:40 +00001331 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001332 if (*exceptionObject == NULL)
1333 goto onError;
1334 }
1335 else {
1336 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1337 goto onError;
1338 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1339 goto onError;
1340 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1341 goto onError;
1342 }
1343
1344 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1345 if (restuple == NULL)
1346 goto onError;
1347 if (!PyTuple_Check(restuple)) {
1348 PyErr_Format(PyExc_TypeError, &argparse[4]);
1349 goto onError;
1350 }
1351 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1352 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001353
1354 /* Copy back the bytes variables, which might have been modified by the
1355 callback */
1356 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1357 if (!inputobj)
1358 goto onError;
1359 if (!PyBytes_Check(inputobj)) {
1360 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
1361 }
1362 *input = PyBytes_AS_STRING(inputobj);
1363 insize = PyBytes_GET_SIZE(inputobj);
1364 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001365 /* we can DECREF safely, as the exception has another reference,
1366 so the object won't go away. */
1367 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001368
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001369 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001370 newpos = insize+newpos;
1371 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001372 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001373 goto onError;
1374 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001375
1376 /* need more space? (at least enough for what we
1377 have+the replacement+the rest of the string (starting
1378 at the new input position), so we won't have to check space
1379 when there are no errors in the rest of the string) */
1380 repptr = PyUnicode_AS_UNICODE(repunicode);
1381 repsize = PyUnicode_GET_SIZE(repunicode);
1382 requiredsize = *outpos + repsize + insize-newpos;
1383 if (requiredsize > outsize) {
1384 if (requiredsize<2*outsize)
1385 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001386 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001387 goto onError;
1388 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1389 }
1390 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001391 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001392 Py_UNICODE_COPY(*outptr, repptr, repsize);
1393 *outptr += repsize;
1394 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001395
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001396 /* we made it! */
1397 res = 0;
1398
1399 onError:
1400 Py_XDECREF(restuple);
1401 return res;
1402}
1403
/* --- UTF-7 Codec -------------------------------------------------------- */

/* see RFC2152 for details */

static
char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
           0 - not special
           1 - special
           2 - whitespace (optional)
           3 - RFC2152 Set O (optional) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};

/* SPECIAL(c, encodeO, encodeWS) is true when character c cannot be
   written directly: anything outside 1..127, anything the table marks
   as 1, whitespace when encodeWS is set, and Set O characters when
   encodeO is set.

   Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* B64(n): base64 digit for the low 6 bits of n. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true if c is one of the 64 base64 digits.
   The test uses explicit ASCII ranges rather than isalnum(): the argument
   may lie outside the range isalnum() is defined for, and isalnum() is
   locale dependent, while the base64 alphabet is not. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* UB64(c): value (0..63) of base64 digit c; only meaningful when
   B64CHAR(c) is true. */
#define UB64(c)        \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE: emit base64 digits to out for as long as at least 6 of the
   `bits` pending bits in ch remain. */
#define ENCODE(out, ch, bits)                   \
    while (bits >= 6) {                         \
        *out++ = B64(ch >> (bits-6));           \
        bits -= 6;                              \
    }

/* DECODE: emit 16-bit units to out for as long as at least 16 of the
   `bits` pending bits in ch remain.  Relies on `errmsg` and the label
   `utf7Error` being available at the point of use. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0;                                              \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */                              \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *out++ = outCh;                                             \
        }                                                               \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001469
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001470PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001471 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001472 const char *errors)
1473{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001474 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001475 Py_ssize_t startinpos;
1476 Py_ssize_t endinpos;
1477 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001478 const char *e;
1479 PyUnicodeObject *unicode;
1480 Py_UNICODE *p;
1481 const char *errmsg = "";
1482 int inShift = 0;
1483 unsigned int bitsleft = 0;
1484 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001485 int surrogate = 0;
1486 PyObject *errorHandler = NULL;
1487 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001488
1489 unicode = _PyUnicode_New(size);
1490 if (!unicode)
1491 return NULL;
1492 if (size == 0)
1493 return (PyObject *)unicode;
1494
1495 p = unicode->str;
1496 e = s + size;
1497
1498 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001499 Py_UNICODE ch;
1500 restart:
1501 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001502
1503 if (inShift) {
1504 if ((ch == '-') || !B64CHAR(ch)) {
1505 inShift = 0;
1506 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001507
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001508 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1509 if (bitsleft >= 6) {
1510 /* The shift sequence has a partial character in it. If
1511 bitsleft < 6 then we could just classify it as padding
1512 but that is not the case here */
1513
1514 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001515 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001516 }
1517 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001518 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001519 here so indicate the potential of a misencoded character. */
1520
1521 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1522 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1523 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001524 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001525 }
1526
1527 if (ch == '-') {
1528 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001529 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001530 inShift = 1;
1531 }
1532 } else if (SPECIAL(ch,0,0)) {
1533 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001534 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001535 } else {
1536 *p++ = ch;
1537 }
1538 } else {
1539 charsleft = (charsleft << 6) | UB64(ch);
1540 bitsleft += 6;
1541 s++;
1542 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1543 }
1544 }
1545 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001546 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001547 s++;
1548 if (s < e && *s == '-') {
1549 s++;
1550 *p++ = '+';
1551 } else
1552 {
1553 inShift = 1;
1554 bitsleft = 0;
1555 }
1556 }
1557 else if (SPECIAL(ch,0,0)) {
Walter Dörwald2b65c752007-08-30 15:35:26 +00001558 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001559 errmsg = "unexpected special character";
1560 s++;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001561 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001562 }
1563 else {
1564 *p++ = ch;
1565 s++;
1566 }
1567 continue;
1568 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001569 outpos = p-PyUnicode_AS_UNICODE(unicode);
1570 endinpos = s-starts;
1571 if (unicode_decode_call_errorhandler(
1572 errors, &errorHandler,
1573 "utf7", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001574 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001575 (PyObject **)&unicode, &outpos, &p))
1576 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001577 }
1578
1579 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001580 outpos = p-PyUnicode_AS_UNICODE(unicode);
1581 endinpos = size;
1582 if (unicode_decode_call_errorhandler(
1583 errors, &errorHandler,
1584 "utf7", "unterminated shift sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00001585 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001586 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001587 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001588 if (s < e)
1589 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001590 }
1591
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001592 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001593 goto onError;
1594
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001595 Py_XDECREF(errorHandler);
1596 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001597 return (PyObject *)unicode;
1598
1599onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001600 Py_XDECREF(errorHandler);
1601 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001602 Py_DECREF(unicode);
1603 return NULL;
1604}
1605
1606
1607PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001608 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001609 int encodeSetO,
1610 int encodeWhiteSpace,
1611 const char *errors)
1612{
1613 PyObject *v;
1614 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001615 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001616 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001617 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001618 unsigned int bitsleft = 0;
1619 unsigned long charsleft = 0;
1620 char * out;
1621 char * start;
1622
1623 if (size == 0)
Walter Dörwald51ab4142007-05-05 14:43:36 +00001624 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001625
Walter Dörwald51ab4142007-05-05 14:43:36 +00001626 v = PyBytes_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001627 if (v == NULL)
1628 return NULL;
1629
Walter Dörwald51ab4142007-05-05 14:43:36 +00001630 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001631 for (;i < size; ++i) {
1632 Py_UNICODE ch = s[i];
1633
1634 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001635 if (ch == '+') {
1636 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001637 *out++ = '-';
1638 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1639 charsleft = ch;
1640 bitsleft = 16;
1641 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001642 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001643 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001644 } else {
1645 *out++ = (char) ch;
1646 }
1647 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001648 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1649 *out++ = B64(charsleft << (6-bitsleft));
1650 charsleft = 0;
1651 bitsleft = 0;
1652 /* Characters not in the BASE64 set implicitly unshift the sequence
1653 so no '-' is required, except if the character is itself a '-' */
1654 if (B64CHAR(ch) || ch == '-') {
1655 *out++ = '-';
1656 }
1657 inShift = 0;
1658 *out++ = (char) ch;
1659 } else {
1660 bitsleft += 16;
1661 charsleft = (charsleft << 16) | ch;
1662 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1663
1664 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001665 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001666 or '-' then the shift sequence will be terminated implicitly and we
1667 don't have to insert a '-'. */
1668
1669 if (bitsleft == 0) {
1670 if (i + 1 < size) {
1671 Py_UNICODE ch2 = s[i+1];
1672
1673 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001674
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001675 } else if (B64CHAR(ch2) || ch2 == '-') {
1676 *out++ = '-';
1677 inShift = 0;
1678 } else {
1679 inShift = 0;
1680 }
1681
1682 }
1683 else {
1684 *out++ = '-';
1685 inShift = 0;
1686 }
1687 }
Tim Petersced69f82003-09-16 20:30:58 +00001688 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001689 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001690 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001691 if (bitsleft) {
1692 *out++= B64(charsleft << (6-bitsleft) );
1693 *out++ = '-';
1694 }
1695
Walter Dörwald51ab4142007-05-05 14:43:36 +00001696 if (PyBytes_Resize(v, out - start)) {
1697 Py_DECREF(v);
1698 return NULL;
1699 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001700 return v;
1701}
1702
1703#undef SPECIAL
1704#undef B64
1705#undef B64CHAR
1706#undef UB64
1707#undef ENCODE
1708#undef DECODE
1709
Guido van Rossumd57fd912000-03-10 22:53:23 +00001710/* --- UTF-8 Codec -------------------------------------------------------- */
1711
/* Lookup table indexed by the first byte of a UTF-8 sequence.  Each
   entry is the total length in bytes of the sequence that byte starts;
   zero marks a byte that is not a legal prefix.  See RFC 2279 for
   details. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x00 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x10 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x20 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x30 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x40 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x50 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x60 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x70 */

    /* 0x80 - 0xBF: not a legal prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x80 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x90 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0xA0 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0xB0 */

    /* 0xC0 - 0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,   /* 0xC0 */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,   /* 0xD0 */

    /* 0xE0 - 0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,   /* 0xE0 */

    /* 0xF0 - 0xFF: four, five and six byte sequences; 0xFE and 0xFF
       are not legal prefixes */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0    /* 0xF0 */
};
1733
Guido van Rossumd57fd912000-03-10 22:53:23 +00001734PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001735 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001736 const char *errors)
1737{
Walter Dörwald69652032004-09-07 20:24:22 +00001738 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1739}
1740
1741PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001742 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001743 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001744 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001745{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001746 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001747 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001748 Py_ssize_t startinpos;
1749 Py_ssize_t endinpos;
1750 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001751 const char *e;
1752 PyUnicodeObject *unicode;
1753 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001754 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001755 PyObject *errorHandler = NULL;
1756 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001757
1758 /* Note: size will always be longer than the resulting Unicode
1759 character count */
1760 unicode = _PyUnicode_New(size);
1761 if (!unicode)
1762 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001763 if (size == 0) {
1764 if (consumed)
1765 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001766 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001767 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001768
1769 /* Unpack UTF-8 encoded data */
1770 p = unicode->str;
1771 e = s + size;
1772
1773 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001774 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001775
1776 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001777 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001778 s++;
1779 continue;
1780 }
1781
1782 n = utf8_code_length[ch];
1783
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001784 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001785 if (consumed)
1786 break;
1787 else {
1788 errmsg = "unexpected end of data";
1789 startinpos = s-starts;
1790 endinpos = size;
1791 goto utf8Error;
1792 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001793 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001794
1795 switch (n) {
1796
1797 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001798 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001799 startinpos = s-starts;
1800 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001801 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001802
1803 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001804 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001805 startinpos = s-starts;
1806 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001807 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001808
1809 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001810 if ((s[1] & 0xc0) != 0x80) {
1811 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001812 startinpos = s-starts;
1813 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001814 goto utf8Error;
1815 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001816 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001817 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001818 startinpos = s-starts;
1819 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001820 errmsg = "illegal encoding";
1821 goto utf8Error;
1822 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001823 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001824 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001825 break;
1826
1827 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001828 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001829 (s[2] & 0xc0) != 0x80) {
1830 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001831 startinpos = s-starts;
1832 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001833 goto utf8Error;
1834 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001835 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001836 if (ch < 0x0800) {
1837 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001838 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001839
1840 XXX For wide builds (UCS-4) we should probably try
1841 to recombine the surrogates into a single code
1842 unit.
1843 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001844 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001845 startinpos = s-starts;
1846 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001847 goto utf8Error;
1848 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001849 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001850 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001851 break;
1852
1853 case 4:
1854 if ((s[1] & 0xc0) != 0x80 ||
1855 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001856 (s[3] & 0xc0) != 0x80) {
1857 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001858 startinpos = s-starts;
1859 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001860 goto utf8Error;
1861 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001862 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1863 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1864 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001865 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001866 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001867 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001868 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001869 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001870 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001871 startinpos = s-starts;
1872 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001873 goto utf8Error;
1874 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001875#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001876 *p++ = (Py_UNICODE)ch;
1877#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001878 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001879
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001880 /* translate from 10000..10FFFF to 0..FFFF */
1881 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001882
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001883 /* high surrogate = top 10 bits added to D800 */
1884 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001885
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001886 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001887 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001888#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001889 break;
1890
1891 default:
1892 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001893 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001894 startinpos = s-starts;
1895 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001896 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001897 }
1898 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001899 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001900
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001901 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001902 outpos = p-PyUnicode_AS_UNICODE(unicode);
1903 if (unicode_decode_call_errorhandler(
1904 errors, &errorHandler,
1905 "utf8", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001906 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001907 (PyObject **)&unicode, &outpos, &p))
1908 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001909 }
Walter Dörwald69652032004-09-07 20:24:22 +00001910 if (consumed)
1911 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001912
1913 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001914 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001915 goto onError;
1916
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001917 Py_XDECREF(errorHandler);
1918 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001919 return (PyObject *)unicode;
1920
1921onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001922 Py_XDECREF(errorHandler);
1923 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001924 Py_DECREF(unicode);
1925 return NULL;
1926}
1927
Tim Peters602f7402002-04-27 18:03:26 +00001928/* Allocation strategy: if the string is short, convert into a stack buffer
1929 and allocate exactly as much space needed at the end. Else allocate the
1930 maximum possible needed (4 result bytes per Unicode character), and return
1931 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001932*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001933PyObject *
1934PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001935 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001936 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001937{
Tim Peters602f7402002-04-27 18:03:26 +00001938#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001939
Martin v. Löwis18e16552006-02-15 17:27:45 +00001940 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001941 PyObject *v; /* result string object */
1942 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001943 Py_ssize_t nallocated; /* number of result bytes allocated */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001944 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001945 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001946
Tim Peters602f7402002-04-27 18:03:26 +00001947 assert(s != NULL);
1948 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001949
Tim Peters602f7402002-04-27 18:03:26 +00001950 if (size <= MAX_SHORT_UNICHARS) {
1951 /* Write into the stack buffer; nallocated can't overflow.
1952 * At the end, we'll allocate exactly as much heap space as it
1953 * turns out we need.
1954 */
1955 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1956 v = NULL; /* will allocate after we're done */
1957 p = stackbuf;
1958 }
1959 else {
1960 /* Overallocate on the heap, and give the excess back at the end. */
1961 nallocated = size * 4;
1962 if (nallocated / 4 != size) /* overflow! */
1963 return PyErr_NoMemory();
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001964 v = PyBytes_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00001965 if (v == NULL)
1966 return NULL;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001967 p = PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001968 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001969
Tim Peters602f7402002-04-27 18:03:26 +00001970 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001971 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001972
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001973 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001974 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001975 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001976
Guido van Rossumd57fd912000-03-10 22:53:23 +00001977 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001978 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001979 *p++ = (char)(0xc0 | (ch >> 6));
1980 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001981 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001982 else {
Tim Peters602f7402002-04-27 18:03:26 +00001983 /* Encode UCS2 Unicode ordinals */
1984 if (ch < 0x10000) {
1985 /* Special case: check for high surrogate */
1986 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1987 Py_UCS4 ch2 = s[i];
1988 /* Check for low surrogate and combine the two to
1989 form a UCS4 value */
1990 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001991 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001992 i++;
1993 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001994 }
Tim Peters602f7402002-04-27 18:03:26 +00001995 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001996 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001997 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001998 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1999 *p++ = (char)(0x80 | (ch & 0x3f));
2000 continue;
2001 }
2002encodeUCS4:
2003 /* Encode UCS4 Unicode ordinals */
2004 *p++ = (char)(0xf0 | (ch >> 18));
2005 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2006 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2007 *p++ = (char)(0x80 | (ch & 0x3f));
2008 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002009 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002010
Tim Peters602f7402002-04-27 18:03:26 +00002011 if (v == NULL) {
2012 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002013 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002014 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002015 v = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002016 }
2017 else {
2018 /* Cut back to size actually needed. */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002019 nneeded = p - PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002020 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002021 PyBytes_Resize(v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002022 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002023 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002024
Tim Peters602f7402002-04-27 18:03:26 +00002025#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002026}
2027
Guido van Rossumd57fd912000-03-10 22:53:23 +00002028PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2029{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002030 if (!PyUnicode_Check(unicode)) {
2031 PyErr_BadArgument();
2032 return NULL;
2033 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002034 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2035 PyUnicode_GET_SIZE(unicode),
2036 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002037}
2038
Walter Dörwald41980ca2007-08-16 21:55:45 +00002039/* --- UTF-32 Codec ------------------------------------------------------- */
2040
2041PyObject *
2042PyUnicode_DecodeUTF32(const char *s,
2043 Py_ssize_t size,
2044 const char *errors,
2045 int *byteorder)
2046{
2047 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2048}
2049
2050PyObject *
2051PyUnicode_DecodeUTF32Stateful(const char *s,
2052 Py_ssize_t size,
2053 const char *errors,
2054 int *byteorder,
2055 Py_ssize_t *consumed)
2056{
2057 const char *starts = s;
2058 Py_ssize_t startinpos;
2059 Py_ssize_t endinpos;
2060 Py_ssize_t outpos;
2061 PyUnicodeObject *unicode;
2062 Py_UNICODE *p;
2063#ifndef Py_UNICODE_WIDE
2064 int i, pairs;
2065#else
2066 const int pairs = 0;
2067#endif
2068 const unsigned char *q, *e;
2069 int bo = 0; /* assume native ordering by default */
2070 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002071 /* Offsets from q for retrieving bytes in the right order. */
2072#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2073 int iorder[] = {0, 1, 2, 3};
2074#else
2075 int iorder[] = {3, 2, 1, 0};
2076#endif
2077 PyObject *errorHandler = NULL;
2078 PyObject *exc = NULL;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002079 /* On narrow builds we split characters outside the BMP into two
2080 codepoints => count how much extra space we need. */
2081#ifndef Py_UNICODE_WIDE
2082 for (i = pairs = 0; i < size/4; i++)
2083 if (((Py_UCS4 *)s)[i] >= 0x10000)
2084 pairs++;
2085#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00002086
2087 /* This might be one to much, because of a BOM */
2088 unicode = _PyUnicode_New((size+3)/4+pairs);
2089 if (!unicode)
2090 return NULL;
2091 if (size == 0)
2092 return (PyObject *)unicode;
2093
2094 /* Unpack UTF-32 encoded data */
2095 p = unicode->str;
2096 q = (unsigned char *)s;
2097 e = q + size;
2098
2099 if (byteorder)
2100 bo = *byteorder;
2101
2102 /* Check for BOM marks (U+FEFF) in the input and adjust current
2103 byte order setting accordingly. In native mode, the leading BOM
2104 mark is skipped, in all other modes, it is copied to the output
2105 stream as-is (giving a ZWNBSP character). */
2106 if (bo == 0) {
2107 if (size >= 4) {
2108 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2109 (q[iorder[1]] << 8) | q[iorder[0]];
2110#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2111 if (bom == 0x0000FEFF) {
2112 q += 4;
2113 bo = -1;
2114 }
2115 else if (bom == 0xFFFE0000) {
2116 q += 4;
2117 bo = 1;
2118 }
2119#else
2120 if (bom == 0x0000FEFF) {
2121 q += 4;
2122 bo = 1;
2123 }
2124 else if (bom == 0xFFFE0000) {
2125 q += 4;
2126 bo = -1;
2127 }
2128#endif
2129 }
2130 }
2131
2132 if (bo == -1) {
2133 /* force LE */
2134 iorder[0] = 0;
2135 iorder[1] = 1;
2136 iorder[2] = 2;
2137 iorder[3] = 3;
2138 }
2139 else if (bo == 1) {
2140 /* force BE */
2141 iorder[0] = 3;
2142 iorder[1] = 2;
2143 iorder[2] = 1;
2144 iorder[3] = 0;
2145 }
2146
2147 while (q < e) {
2148 Py_UCS4 ch;
2149 /* remaining bytes at the end? (size should be divisible by 4) */
2150 if (e-q<4) {
2151 if (consumed)
2152 break;
2153 errmsg = "truncated data";
2154 startinpos = ((const char *)q)-starts;
2155 endinpos = ((const char *)e)-starts;
2156 goto utf32Error;
2157 /* The remaining input chars are ignored if the callback
2158 chooses to skip the input */
2159 }
2160 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2161 (q[iorder[1]] << 8) | q[iorder[0]];
2162
2163 if (ch >= 0x110000)
2164 {
2165 errmsg = "codepoint not in range(0x110000)";
2166 startinpos = ((const char *)q)-starts;
2167 endinpos = startinpos+4;
2168 goto utf32Error;
2169 }
2170#ifndef Py_UNICODE_WIDE
2171 if (ch >= 0x10000)
2172 {
2173 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2174 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2175 }
2176 else
2177#endif
2178 *p++ = ch;
2179 q += 4;
2180 continue;
2181 utf32Error:
2182 outpos = p-PyUnicode_AS_UNICODE(unicode);
2183 if (unicode_decode_call_errorhandler(
2184 errors, &errorHandler,
2185 "utf32", errmsg,
2186 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2187 (PyObject **)&unicode, &outpos, &p))
2188 goto onError;
2189 }
2190
2191 if (byteorder)
2192 *byteorder = bo;
2193
2194 if (consumed)
2195 *consumed = (const char *)q-starts;
2196
2197 /* Adjust length */
2198 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2199 goto onError;
2200
2201 Py_XDECREF(errorHandler);
2202 Py_XDECREF(exc);
2203 return (PyObject *)unicode;
2204
2205onError:
2206 Py_DECREF(unicode);
2207 Py_XDECREF(errorHandler);
2208 Py_XDECREF(exc);
2209 return NULL;
2210}
2211
2212PyObject *
2213PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2214 Py_ssize_t size,
2215 const char *errors,
2216 int byteorder)
2217{
2218 PyObject *v;
2219 unsigned char *p;
2220#ifndef Py_UNICODE_WIDE
2221 int i, pairs;
2222#else
2223 const int pairs = 0;
2224#endif
2225 /* Offsets from p for storing byte pairs in the right order. */
2226#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2227 int iorder[] = {0, 1, 2, 3};
2228#else
2229 int iorder[] = {3, 2, 1, 0};
2230#endif
2231
2232#define STORECHAR(CH) \
2233 do { \
2234 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2235 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2236 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2237 p[iorder[0]] = (CH) & 0xff; \
2238 p += 4; \
2239 } while(0)
2240
2241 /* In narrow builds we can output surrogate pairs as one codepoint,
2242 so we need less space. */
2243#ifndef Py_UNICODE_WIDE
2244 for (i = pairs = 0; i < size-1; i++)
2245 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2246 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2247 pairs++;
2248#endif
2249 v = PyBytes_FromStringAndSize(NULL,
2250 4 * (size - pairs + (byteorder == 0)));
2251 if (v == NULL)
2252 return NULL;
2253
2254 p = (unsigned char *)PyBytes_AS_STRING(v);
2255 if (byteorder == 0)
2256 STORECHAR(0xFEFF);
2257 if (size == 0)
2258 return v;
2259
2260 if (byteorder == -1) {
2261 /* force LE */
2262 iorder[0] = 0;
2263 iorder[1] = 1;
2264 iorder[2] = 2;
2265 iorder[3] = 3;
2266 }
2267 else if (byteorder == 1) {
2268 /* force BE */
2269 iorder[0] = 3;
2270 iorder[1] = 2;
2271 iorder[2] = 1;
2272 iorder[3] = 0;
2273 }
2274
2275 while (size-- > 0) {
2276 Py_UCS4 ch = *s++;
2277#ifndef Py_UNICODE_WIDE
2278 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2279 Py_UCS4 ch2 = *s;
2280 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2281 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2282 s++;
2283 size--;
2284 }
2285 }
2286#endif
2287 STORECHAR(ch);
2288 }
2289 return v;
2290#undef STORECHAR
2291}
2292
2293PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2294{
2295 if (!PyUnicode_Check(unicode)) {
2296 PyErr_BadArgument();
2297 return NULL;
2298 }
2299 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2300 PyUnicode_GET_SIZE(unicode),
2301 NULL,
2302 0);
2303}
2304
Guido van Rossumd57fd912000-03-10 22:53:23 +00002305/* --- UTF-16 Codec ------------------------------------------------------- */
2306
Tim Peters772747b2001-08-09 22:21:55 +00002307PyObject *
2308PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002309 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002310 const char *errors,
2311 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002312{
Walter Dörwald69652032004-09-07 20:24:22 +00002313 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2314}
2315
2316PyObject *
2317PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002318 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002319 const char *errors,
2320 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002321 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002322{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002323 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002324 Py_ssize_t startinpos;
2325 Py_ssize_t endinpos;
2326 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002327 PyUnicodeObject *unicode;
2328 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002329 const unsigned char *q, *e;
2330 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002331 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002332 /* Offsets from q for retrieving byte pairs in the right order. */
2333#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2334 int ihi = 1, ilo = 0;
2335#else
2336 int ihi = 0, ilo = 1;
2337#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002338 PyObject *errorHandler = NULL;
2339 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002340
2341 /* Note: size will always be longer than the resulting Unicode
2342 character count */
2343 unicode = _PyUnicode_New(size);
2344 if (!unicode)
2345 return NULL;
2346 if (size == 0)
2347 return (PyObject *)unicode;
2348
2349 /* Unpack UTF-16 encoded data */
2350 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002351 q = (unsigned char *)s;
2352 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002353
2354 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002355 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002356
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002357 /* Check for BOM marks (U+FEFF) in the input and adjust current
2358 byte order setting accordingly. In native mode, the leading BOM
2359 mark is skipped, in all other modes, it is copied to the output
2360 stream as-is (giving a ZWNBSP character). */
2361 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002362 if (size >= 2) {
2363 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002364#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002365 if (bom == 0xFEFF) {
2366 q += 2;
2367 bo = -1;
2368 }
2369 else if (bom == 0xFFFE) {
2370 q += 2;
2371 bo = 1;
2372 }
Tim Petersced69f82003-09-16 20:30:58 +00002373#else
Walter Dörwald69652032004-09-07 20:24:22 +00002374 if (bom == 0xFEFF) {
2375 q += 2;
2376 bo = 1;
2377 }
2378 else if (bom == 0xFFFE) {
2379 q += 2;
2380 bo = -1;
2381 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002382#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002383 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002384 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002385
Tim Peters772747b2001-08-09 22:21:55 +00002386 if (bo == -1) {
2387 /* force LE */
2388 ihi = 1;
2389 ilo = 0;
2390 }
2391 else if (bo == 1) {
2392 /* force BE */
2393 ihi = 0;
2394 ilo = 1;
2395 }
2396
2397 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002398 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002399 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002400 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002401 if (consumed)
2402 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002403 errmsg = "truncated data";
2404 startinpos = ((const char *)q)-starts;
2405 endinpos = ((const char *)e)-starts;
2406 goto utf16Error;
2407 /* The remaining input chars are ignored if the callback
2408 chooses to skip the input */
2409 }
2410 ch = (q[ihi] << 8) | q[ilo];
2411
Tim Peters772747b2001-08-09 22:21:55 +00002412 q += 2;
2413
Guido van Rossumd57fd912000-03-10 22:53:23 +00002414 if (ch < 0xD800 || ch > 0xDFFF) {
2415 *p++ = ch;
2416 continue;
2417 }
2418
2419 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002420 if (q >= e) {
2421 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002422 startinpos = (((const char *)q)-2)-starts;
2423 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002424 goto utf16Error;
2425 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002426 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002427 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2428 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002429 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002430#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002431 *p++ = ch;
2432 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002433#else
2434 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002435#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002436 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002437 }
2438 else {
2439 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002440 startinpos = (((const char *)q)-4)-starts;
2441 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002442 goto utf16Error;
2443 }
2444
Guido van Rossumd57fd912000-03-10 22:53:23 +00002445 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002446 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002447 startinpos = (((const char *)q)-2)-starts;
2448 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002449 /* Fall through to report the error */
2450
2451 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002452 outpos = p-PyUnicode_AS_UNICODE(unicode);
2453 if (unicode_decode_call_errorhandler(
2454 errors, &errorHandler,
2455 "utf16", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002456 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002457 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002458 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002459 }
2460
2461 if (byteorder)
2462 *byteorder = bo;
2463
Walter Dörwald69652032004-09-07 20:24:22 +00002464 if (consumed)
2465 *consumed = (const char *)q-starts;
2466
Guido van Rossumd57fd912000-03-10 22:53:23 +00002467 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002468 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002469 goto onError;
2470
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002471 Py_XDECREF(errorHandler);
2472 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002473 return (PyObject *)unicode;
2474
2475onError:
2476 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002477 Py_XDECREF(errorHandler);
2478 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002479 return NULL;
2480}
2481
Tim Peters772747b2001-08-09 22:21:55 +00002482PyObject *
2483PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002484 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002485 const char *errors,
2486 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002487{
2488 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002489 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002490#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00002491 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002492#else
2493 const int pairs = 0;
2494#endif
Tim Peters772747b2001-08-09 22:21:55 +00002495 /* Offsets from p for storing byte pairs in the right order. */
2496#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2497 int ihi = 1, ilo = 0;
2498#else
2499 int ihi = 0, ilo = 1;
2500#endif
2501
2502#define STORECHAR(CH) \
2503 do { \
2504 p[ihi] = ((CH) >> 8) & 0xff; \
2505 p[ilo] = (CH) & 0xff; \
2506 p += 2; \
2507 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002508
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002509#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002510 for (i = pairs = 0; i < size; i++)
2511 if (s[i] >= 0x10000)
2512 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002513#endif
Walter Dörwald3cc34522007-05-04 10:48:27 +00002514 v = PyBytes_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002515 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002516 if (v == NULL)
2517 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002518
Walter Dörwald3cc34522007-05-04 10:48:27 +00002519 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002520 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002521 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002522 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002523 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002524
2525 if (byteorder == -1) {
2526 /* force LE */
2527 ihi = 1;
2528 ilo = 0;
2529 }
2530 else if (byteorder == 1) {
2531 /* force BE */
2532 ihi = 0;
2533 ilo = 1;
2534 }
2535
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002536 while (size-- > 0) {
2537 Py_UNICODE ch = *s++;
2538 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002539#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002540 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002541 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2542 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002543 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002544#endif
Tim Peters772747b2001-08-09 22:21:55 +00002545 STORECHAR(ch);
2546 if (ch2)
2547 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002548 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002549 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002550#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002551}
2552
2553PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2554{
2555 if (!PyUnicode_Check(unicode)) {
2556 PyErr_BadArgument();
2557 return NULL;
2558 }
2559 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2560 PyUnicode_GET_SIZE(unicode),
2561 NULL,
2562 0);
2563}
2564
2565/* --- Unicode Escape Codec ----------------------------------------------- */
2566
Fredrik Lundh06d12682001-01-24 07:59:11 +00002567static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002568
Guido van Rossumd57fd912000-03-10 22:53:23 +00002569PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002570 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002571 const char *errors)
2572{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002573 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002574 Py_ssize_t startinpos;
2575 Py_ssize_t endinpos;
2576 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002577 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002578 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002579 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002580 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002581 char* message;
2582 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002583 PyObject *errorHandler = NULL;
2584 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002585
Guido van Rossumd57fd912000-03-10 22:53:23 +00002586 /* Escaped strings will always be longer than the resulting
2587 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002588 length after conversion to the true value.
2589 (but if the error callback returns a long replacement string
2590 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002591 v = _PyUnicode_New(size);
2592 if (v == NULL)
2593 goto onError;
2594 if (size == 0)
2595 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002596
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002597 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002598 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002599
Guido van Rossumd57fd912000-03-10 22:53:23 +00002600 while (s < end) {
2601 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002602 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002603 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002604
2605 /* Non-escape characters are interpreted as Unicode ordinals */
2606 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002607 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002608 continue;
2609 }
2610
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002611 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002612 /* \ - Escapes */
2613 s++;
2614 switch (*s++) {
2615
2616 /* \x escapes */
2617 case '\n': break;
2618 case '\\': *p++ = '\\'; break;
2619 case '\'': *p++ = '\''; break;
2620 case '\"': *p++ = '\"'; break;
2621 case 'b': *p++ = '\b'; break;
2622 case 'f': *p++ = '\014'; break; /* FF */
2623 case 't': *p++ = '\t'; break;
2624 case 'n': *p++ = '\n'; break;
2625 case 'r': *p++ = '\r'; break;
2626 case 'v': *p++ = '\013'; break; /* VT */
2627 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2628
2629 /* \OOO (octal) escapes */
2630 case '0': case '1': case '2': case '3':
2631 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002632 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002633 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002634 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002635 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002636 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002637 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002638 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002639 break;
2640
Fredrik Lundhccc74732001-02-18 22:13:49 +00002641 /* hex escapes */
2642 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002643 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002644 digits = 2;
2645 message = "truncated \\xXX escape";
2646 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002647
Fredrik Lundhccc74732001-02-18 22:13:49 +00002648 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002649 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002650 digits = 4;
2651 message = "truncated \\uXXXX escape";
2652 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002653
Fredrik Lundhccc74732001-02-18 22:13:49 +00002654 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002655 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002656 digits = 8;
2657 message = "truncated \\UXXXXXXXX escape";
2658 hexescape:
2659 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002660 outpos = p-PyUnicode_AS_UNICODE(v);
2661 if (s+digits>end) {
2662 endinpos = size;
2663 if (unicode_decode_call_errorhandler(
2664 errors, &errorHandler,
2665 "unicodeescape", "end of string in escape sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002666 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002667 (PyObject **)&v, &outpos, &p))
2668 goto onError;
2669 goto nextByte;
2670 }
2671 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002672 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002673 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002674 endinpos = (s+i+1)-starts;
2675 if (unicode_decode_call_errorhandler(
2676 errors, &errorHandler,
2677 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002678 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002679 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002680 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002681 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002682 }
2683 chr = (chr<<4) & ~0xF;
2684 if (c >= '0' && c <= '9')
2685 chr += c - '0';
2686 else if (c >= 'a' && c <= 'f')
2687 chr += 10 + c - 'a';
2688 else
2689 chr += 10 + c - 'A';
2690 }
2691 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002692 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002693 /* _decoding_error will have already written into the
2694 target buffer. */
2695 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002696 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002697 /* when we get here, chr is a 32-bit unicode character */
2698 if (chr <= 0xffff)
2699 /* UCS-2 character */
2700 *p++ = (Py_UNICODE) chr;
2701 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002702 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002703 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002704#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002705 *p++ = chr;
2706#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002707 chr -= 0x10000L;
2708 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002709 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002710#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002711 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002712 endinpos = s-starts;
2713 outpos = p-PyUnicode_AS_UNICODE(v);
2714 if (unicode_decode_call_errorhandler(
2715 errors, &errorHandler,
2716 "unicodeescape", "illegal Unicode character",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002717 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002718 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002719 goto onError;
2720 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002721 break;
2722
2723 /* \N{name} */
2724 case 'N':
2725 message = "malformed \\N character escape";
2726 if (ucnhash_CAPI == NULL) {
2727 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002728 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002729 m = PyImport_ImportModule("unicodedata");
2730 if (m == NULL)
2731 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002732 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002733 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002734 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002735 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002736 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002737 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002738 if (ucnhash_CAPI == NULL)
2739 goto ucnhashError;
2740 }
2741 if (*s == '{') {
2742 const char *start = s+1;
2743 /* look for the closing brace */
2744 while (*s != '}' && s < end)
2745 s++;
2746 if (s > start && s < end && *s == '}') {
2747 /* found a name. look it up in the unicode database */
2748 message = "unknown Unicode character name";
2749 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002750 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002751 goto store;
2752 }
2753 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002754 endinpos = s-starts;
2755 outpos = p-PyUnicode_AS_UNICODE(v);
2756 if (unicode_decode_call_errorhandler(
2757 errors, &errorHandler,
2758 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002759 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002760 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002761 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002762 break;
2763
2764 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002765 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002766 message = "\\ at end of string";
2767 s--;
2768 endinpos = s-starts;
2769 outpos = p-PyUnicode_AS_UNICODE(v);
2770 if (unicode_decode_call_errorhandler(
2771 errors, &errorHandler,
2772 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002773 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002774 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002775 goto onError;
2776 }
2777 else {
2778 *p++ = '\\';
2779 *p++ = (unsigned char)s[-1];
2780 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002781 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002782 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002783 nextByte:
2784 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002785 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002786 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002787 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002788 Py_XDECREF(errorHandler);
2789 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002790 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002791
Fredrik Lundhccc74732001-02-18 22:13:49 +00002792ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002793 PyErr_SetString(
2794 PyExc_UnicodeError,
2795 "\\N escapes not supported (can't load unicodedata module)"
2796 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002797 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002798 Py_XDECREF(errorHandler);
2799 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002800 return NULL;
2801
Fredrik Lundhccc74732001-02-18 22:13:49 +00002802onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002803 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002804 Py_XDECREF(errorHandler);
2805 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002806 return NULL;
2807}
2808
2809/* Return a Unicode-Escape string version of the Unicode object.
2810
2811 If quotes is true, the string is enclosed in u"" or u'' quotes as
2812 appropriate.
2813
2814*/
2815
Thomas Wouters477c8d52006-05-27 19:21:47 +00002816Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2817 Py_ssize_t size,
2818 Py_UNICODE ch)
2819{
2820 /* like wcschr, but doesn't stop at NULL characters */
2821
2822 while (size-- > 0) {
2823 if (*s == ch)
2824 return s;
2825 s++;
2826 }
2827
2828 return NULL;
2829}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002830
Walter Dörwald79e913e2007-05-12 11:08:06 +00002831static const char *hexdigits = "0123456789abcdef";
2832
2833PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2834 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002835{
2836 PyObject *repr;
2837 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002838
Thomas Wouters89f507f2006-12-13 04:49:30 +00002839 /* XXX(nnorwitz): rather than over-allocating, it would be
2840 better to choose a different scheme. Perhaps scan the
2841 first N-chars of the string and allocate based on that size.
2842 */
2843 /* Initial allocation is based on the longest-possible unichr
2844 escape.
2845
2846 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2847 unichr, so in this case it's the longest unichr escape. In
2848 narrow (UTF-16) builds this is five chars per source unichr
2849 since there are two unichrs in the surrogate pair, so in narrow
2850 (UTF-16) builds it's not the longest unichr escape.
2851
2852 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2853 so in the narrow (UTF-16) build case it's the longest unichr
2854 escape.
2855 */
2856
Walter Dörwald79e913e2007-05-12 11:08:06 +00002857 repr = PyBytes_FromStringAndSize(NULL,
Thomas Wouters89f507f2006-12-13 04:49:30 +00002858#ifdef Py_UNICODE_WIDE
2859 + 10*size
2860#else
2861 + 6*size
2862#endif
2863 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002864 if (repr == NULL)
2865 return NULL;
2866
Walter Dörwald79e913e2007-05-12 11:08:06 +00002867 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002868
Guido van Rossumd57fd912000-03-10 22:53:23 +00002869 while (size-- > 0) {
2870 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002871
Walter Dörwald79e913e2007-05-12 11:08:06 +00002872 /* Escape backslashes */
2873 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002874 *p++ = '\\';
2875 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00002876 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002877 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002878
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002879#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002880 /* Map 21-bit characters to '\U00xxxxxx' */
2881 else if (ch >= 0x10000) {
2882 *p++ = '\\';
2883 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002884 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
2885 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
2886 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
2887 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
2888 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
2889 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
2890 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
2891 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002892 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002893 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002894#else
2895 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002896 else if (ch >= 0xD800 && ch < 0xDC00) {
2897 Py_UNICODE ch2;
2898 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002899
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002900 ch2 = *s++;
2901 size--;
2902 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2903 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2904 *p++ = '\\';
2905 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002906 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
2907 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
2908 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
2909 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
2910 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
2911 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
2912 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
2913 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002914 continue;
2915 }
2916 /* Fall through: isolated surrogates are copied as-is */
2917 s--;
2918 size++;
2919 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002920#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002921
Guido van Rossumd57fd912000-03-10 22:53:23 +00002922 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002923 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002924 *p++ = '\\';
2925 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002926 *p++ = hexdigits[(ch >> 12) & 0x000F];
2927 *p++ = hexdigits[(ch >> 8) & 0x000F];
2928 *p++ = hexdigits[(ch >> 4) & 0x000F];
2929 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002930 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002931
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002932 /* Map special whitespace to '\t', \n', '\r' */
2933 else if (ch == '\t') {
2934 *p++ = '\\';
2935 *p++ = 't';
2936 }
2937 else if (ch == '\n') {
2938 *p++ = '\\';
2939 *p++ = 'n';
2940 }
2941 else if (ch == '\r') {
2942 *p++ = '\\';
2943 *p++ = 'r';
2944 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002945
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002946 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002947 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002948 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002949 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002950 *p++ = hexdigits[(ch >> 4) & 0x000F];
2951 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002952 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002953
Guido van Rossumd57fd912000-03-10 22:53:23 +00002954 /* Copy everything else as-is */
2955 else
2956 *p++ = (char) ch;
2957 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002958
2959 *p = '\0';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002960 if (PyBytes_Resize(repr, p - PyBytes_AS_STRING(repr))) {
2961 Py_DECREF(repr);
2962 return NULL;
2963 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002964 return repr;
2965}
2966
Guido van Rossumd57fd912000-03-10 22:53:23 +00002967PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2968{
Walter Dörwald79e913e2007-05-12 11:08:06 +00002969 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002970 if (!PyUnicode_Check(unicode)) {
2971 PyErr_BadArgument();
2972 return NULL;
2973 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00002974 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2975 PyUnicode_GET_SIZE(unicode));
2976
2977 if (!s)
2978 return NULL;
2979 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2980 PyBytes_GET_SIZE(s));
2981 Py_DECREF(s);
2982 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002983}
2984
2985/* --- Raw Unicode Escape Codec ------------------------------------------- */
2986
2987PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002988 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002989 const char *errors)
2990{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002991 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002992 Py_ssize_t startinpos;
2993 Py_ssize_t endinpos;
2994 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002995 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002996 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002997 const char *end;
2998 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002999 PyObject *errorHandler = NULL;
3000 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003001
Guido van Rossumd57fd912000-03-10 22:53:23 +00003002 /* Escaped strings will always be longer than the resulting
3003 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003004 length after conversion to the true value. (But decoding error
3005 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003006 v = _PyUnicode_New(size);
3007 if (v == NULL)
3008 goto onError;
3009 if (size == 0)
3010 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003011 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003012 end = s + size;
3013 while (s < end) {
3014 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00003015 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003016 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003017 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003018
3019 /* Non-escape characters are interpreted as Unicode ordinals */
3020 if (*s != '\\') {
3021 *p++ = (unsigned char)*s++;
3022 continue;
3023 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003024 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003025
3026 /* \u-escapes are only interpreted iff the number of leading
3027 backslashes if odd */
3028 bs = s;
3029 for (;s < end;) {
3030 if (*s != '\\')
3031 break;
3032 *p++ = (unsigned char)*s++;
3033 }
3034 if (((s - bs) & 1) == 0 ||
3035 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003036 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003037 continue;
3038 }
3039 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003040 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003041 s++;
3042
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003043 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003044 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003045 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003046 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003047 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003048 endinpos = s-starts;
3049 if (unicode_decode_call_errorhandler(
3050 errors, &errorHandler,
3051 "rawunicodeescape", "truncated \\uXXXX",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003052 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003053 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003054 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003055 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003056 }
3057 x = (x<<4) & ~0xF;
3058 if (c >= '0' && c <= '9')
3059 x += c - '0';
3060 else if (c >= 'a' && c <= 'f')
3061 x += 10 + c - 'a';
3062 else
3063 x += 10 + c - 'A';
3064 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003065#ifndef Py_UNICODE_WIDE
3066 if (x > 0x10000) {
3067 if (unicode_decode_call_errorhandler(
3068 errors, &errorHandler,
3069 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003070 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003071 (PyObject **)&v, &outpos, &p))
3072 goto onError;
3073 }
3074#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003075 *p++ = x;
3076 nextByte:
3077 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003078 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003079 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003080 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003081 Py_XDECREF(errorHandler);
3082 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003083 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003084
Guido van Rossumd57fd912000-03-10 22:53:23 +00003085 onError:
3086 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003087 Py_XDECREF(errorHandler);
3088 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003089 return NULL;
3090}
3091
3092PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003093 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003094{
3095 PyObject *repr;
3096 char *p;
3097 char *q;
3098
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003099#ifdef Py_UNICODE_WIDE
Walter Dörwald711005d2007-05-12 12:03:26 +00003100 repr = PyBytes_FromStringAndSize(NULL, 10 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003101#else
Walter Dörwald711005d2007-05-12 12:03:26 +00003102 repr = PyBytes_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003103#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003104 if (repr == NULL)
3105 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003106 if (size == 0)
3107 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003108
Walter Dörwald711005d2007-05-12 12:03:26 +00003109 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003110 while (size-- > 0) {
3111 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003112#ifdef Py_UNICODE_WIDE
3113 /* Map 32-bit characters to '\Uxxxxxxxx' */
3114 if (ch >= 0x10000) {
3115 *p++ = '\\';
3116 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003117 *p++ = hexdigits[(ch >> 28) & 0xf];
3118 *p++ = hexdigits[(ch >> 24) & 0xf];
3119 *p++ = hexdigits[(ch >> 20) & 0xf];
3120 *p++ = hexdigits[(ch >> 16) & 0xf];
3121 *p++ = hexdigits[(ch >> 12) & 0xf];
3122 *p++ = hexdigits[(ch >> 8) & 0xf];
3123 *p++ = hexdigits[(ch >> 4) & 0xf];
3124 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003125 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003126 else
3127#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003128 /* Map 16-bit characters to '\uxxxx' */
3129 if (ch >= 256) {
3130 *p++ = '\\';
3131 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003132 *p++ = hexdigits[(ch >> 12) & 0xf];
3133 *p++ = hexdigits[(ch >> 8) & 0xf];
3134 *p++ = hexdigits[(ch >> 4) & 0xf];
3135 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003136 }
3137 /* Copy everything else as-is */
3138 else
3139 *p++ = (char) ch;
3140 }
3141 *p = '\0';
Walter Dörwald711005d2007-05-12 12:03:26 +00003142 if (PyBytes_Resize(repr, p - q)) {
3143 Py_DECREF(repr);
3144 return NULL;
3145 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003146 return repr;
3147}
3148
3149PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3150{
Walter Dörwald711005d2007-05-12 12:03:26 +00003151 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003152 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003153 PyErr_BadArgument();
3154 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003155 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003156 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3157 PyUnicode_GET_SIZE(unicode));
3158
3159 if (!s)
3160 return NULL;
3161 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
3162 PyBytes_GET_SIZE(s));
3163 Py_DECREF(s);
3164 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003165}
3166
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003167/* --- Unicode Internal Codec ------------------------------------------- */
3168
3169PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003170 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003171 const char *errors)
3172{
3173 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003174 Py_ssize_t startinpos;
3175 Py_ssize_t endinpos;
3176 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003177 PyUnicodeObject *v;
3178 Py_UNICODE *p;
3179 const char *end;
3180 const char *reason;
3181 PyObject *errorHandler = NULL;
3182 PyObject *exc = NULL;
3183
Neal Norwitzd43069c2006-01-08 01:12:10 +00003184#ifdef Py_UNICODE_WIDE
3185 Py_UNICODE unimax = PyUnicode_GetMax();
3186#endif
3187
Thomas Wouters89f507f2006-12-13 04:49:30 +00003188 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003189 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3190 if (v == NULL)
3191 goto onError;
3192 if (PyUnicode_GetSize((PyObject *)v) == 0)
3193 return (PyObject *)v;
3194 p = PyUnicode_AS_UNICODE(v);
3195 end = s + size;
3196
3197 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00003198 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003199 /* We have to sanity check the raw data, otherwise doom looms for
3200 some malformed UCS-4 data. */
3201 if (
3202 #ifdef Py_UNICODE_WIDE
3203 *p > unimax || *p < 0 ||
3204 #endif
3205 end-s < Py_UNICODE_SIZE
3206 )
3207 {
3208 startinpos = s - starts;
3209 if (end-s < Py_UNICODE_SIZE) {
3210 endinpos = end-starts;
3211 reason = "truncated input";
3212 }
3213 else {
3214 endinpos = s - starts + Py_UNICODE_SIZE;
3215 reason = "illegal code point (> 0x10FFFF)";
3216 }
3217 outpos = p - PyUnicode_AS_UNICODE(v);
3218 if (unicode_decode_call_errorhandler(
3219 errors, &errorHandler,
3220 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003221 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003222 (PyObject **)&v, &outpos, &p)) {
3223 goto onError;
3224 }
3225 }
3226 else {
3227 p++;
3228 s += Py_UNICODE_SIZE;
3229 }
3230 }
3231
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003232 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003233 goto onError;
3234 Py_XDECREF(errorHandler);
3235 Py_XDECREF(exc);
3236 return (PyObject *)v;
3237
3238 onError:
3239 Py_XDECREF(v);
3240 Py_XDECREF(errorHandler);
3241 Py_XDECREF(exc);
3242 return NULL;
3243}
3244
Guido van Rossumd57fd912000-03-10 22:53:23 +00003245/* --- Latin-1 Codec ------------------------------------------------------ */
3246
3247PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003248 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003249 const char *errors)
3250{
3251 PyUnicodeObject *v;
3252 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003253
Guido van Rossumd57fd912000-03-10 22:53:23 +00003254 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003255 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003256 Py_UNICODE r = *(unsigned char*)s;
3257 return PyUnicode_FromUnicode(&r, 1);
3258 }
3259
Guido van Rossumd57fd912000-03-10 22:53:23 +00003260 v = _PyUnicode_New(size);
3261 if (v == NULL)
3262 goto onError;
3263 if (size == 0)
3264 return (PyObject *)v;
3265 p = PyUnicode_AS_UNICODE(v);
3266 while (size-- > 0)
3267 *p++ = (unsigned char)*s++;
3268 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003269
Guido van Rossumd57fd912000-03-10 22:53:23 +00003270 onError:
3271 Py_XDECREF(v);
3272 return NULL;
3273}
3274
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003275/* create or adjust a UnicodeEncodeError */
3276static void make_encode_exception(PyObject **exceptionObject,
3277 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003278 const Py_UNICODE *unicode, Py_ssize_t size,
3279 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003280 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003281{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003282 if (*exceptionObject == NULL) {
3283 *exceptionObject = PyUnicodeEncodeError_Create(
3284 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003285 }
3286 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003287 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3288 goto onError;
3289 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3290 goto onError;
3291 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3292 goto onError;
3293 return;
3294 onError:
3295 Py_DECREF(*exceptionObject);
3296 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003297 }
3298}
3299
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003300/* raises a UnicodeEncodeError */
3301static void raise_encode_exception(PyObject **exceptionObject,
3302 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003303 const Py_UNICODE *unicode, Py_ssize_t size,
3304 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003305 const char *reason)
3306{
3307 make_encode_exception(exceptionObject,
3308 encoding, unicode, size, startpos, endpos, reason);
3309 if (*exceptionObject != NULL)
3310 PyCodec_StrictErrors(*exceptionObject);
3311}
3312
3313/* error handling callback helper:
3314 build arguments, call the callback and check the arguments,
3315 put the result into newpos and return the replacement string, which
3316 has to be freed by the caller */
3317static PyObject *unicode_encode_call_errorhandler(const char *errors,
3318 PyObject **errorHandler,
3319 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003320 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3321 Py_ssize_t startpos, Py_ssize_t endpos,
3322 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003323{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003324 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003325
3326 PyObject *restuple;
3327 PyObject *resunicode;
3328
3329 if (*errorHandler == NULL) {
3330 *errorHandler = PyCodec_LookupError(errors);
3331 if (*errorHandler == NULL)
3332 return NULL;
3333 }
3334
3335 make_encode_exception(exceptionObject,
3336 encoding, unicode, size, startpos, endpos, reason);
3337 if (*exceptionObject == NULL)
3338 return NULL;
3339
3340 restuple = PyObject_CallFunctionObjArgs(
3341 *errorHandler, *exceptionObject, NULL);
3342 if (restuple == NULL)
3343 return NULL;
3344 if (!PyTuple_Check(restuple)) {
3345 PyErr_Format(PyExc_TypeError, &argparse[4]);
3346 Py_DECREF(restuple);
3347 return NULL;
3348 }
3349 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3350 &resunicode, newpos)) {
3351 Py_DECREF(restuple);
3352 return NULL;
3353 }
3354 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003355 *newpos = size+*newpos;
3356 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003357 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003358 Py_DECREF(restuple);
3359 return NULL;
3360 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003361 Py_INCREF(resunicode);
3362 Py_DECREF(restuple);
3363 return resunicode;
3364}
3365
3366static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003367 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003368 const char *errors,
3369 int limit)
3370{
3371 /* output object */
3372 PyObject *res;
3373 /* pointers to the beginning and end+1 of input */
3374 const Py_UNICODE *startp = p;
3375 const Py_UNICODE *endp = p + size;
3376 /* pointer to the beginning of the unencodable characters */
3377 /* const Py_UNICODE *badp = NULL; */
3378 /* pointer into the output */
3379 char *str;
3380 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003381 Py_ssize_t respos = 0;
3382 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003383 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3384 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003385 PyObject *errorHandler = NULL;
3386 PyObject *exc = NULL;
3387 /* the following variable is used for caching string comparisons
3388 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3389 int known_errorHandler = -1;
3390
3391 /* allocate enough for a simple encoding without
3392 replacements, if we need more, we'll resize */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003393 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003394 if (res == NULL)
3395 goto onError;
3396 if (size == 0)
3397 return res;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003398 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003399 ressize = size;
3400
3401 while (p<endp) {
3402 Py_UNICODE c = *p;
3403
3404 /* can we encode this? */
3405 if (c<limit) {
3406 /* no overflow check, because we know that the space is enough */
3407 *str++ = (char)c;
3408 ++p;
3409 }
3410 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003411 Py_ssize_t unicodepos = p-startp;
3412 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003413 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003414 Py_ssize_t repsize;
3415 Py_ssize_t newpos;
3416 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003417 Py_UNICODE *uni2;
3418 /* startpos for collecting unencodable chars */
3419 const Py_UNICODE *collstart = p;
3420 const Py_UNICODE *collend = p;
3421 /* find all unecodable characters */
3422 while ((collend < endp) && ((*collend)>=limit))
3423 ++collend;
3424 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3425 if (known_errorHandler==-1) {
3426 if ((errors==NULL) || (!strcmp(errors, "strict")))
3427 known_errorHandler = 1;
3428 else if (!strcmp(errors, "replace"))
3429 known_errorHandler = 2;
3430 else if (!strcmp(errors, "ignore"))
3431 known_errorHandler = 3;
3432 else if (!strcmp(errors, "xmlcharrefreplace"))
3433 known_errorHandler = 4;
3434 else
3435 known_errorHandler = 0;
3436 }
3437 switch (known_errorHandler) {
3438 case 1: /* strict */
3439 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3440 goto onError;
3441 case 2: /* replace */
3442 while (collstart++<collend)
3443 *str++ = '?'; /* fall through */
3444 case 3: /* ignore */
3445 p = collend;
3446 break;
3447 case 4: /* xmlcharrefreplace */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003448 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003449 /* determine replacement size (temporarily (mis)uses p) */
3450 for (p = collstart, repsize = 0; p < collend; ++p) {
3451 if (*p<10)
3452 repsize += 2+1+1;
3453 else if (*p<100)
3454 repsize += 2+2+1;
3455 else if (*p<1000)
3456 repsize += 2+3+1;
3457 else if (*p<10000)
3458 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003459#ifndef Py_UNICODE_WIDE
3460 else
3461 repsize += 2+5+1;
3462#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003463 else if (*p<100000)
3464 repsize += 2+5+1;
3465 else if (*p<1000000)
3466 repsize += 2+6+1;
3467 else
3468 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003469#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003470 }
3471 requiredsize = respos+repsize+(endp-collend);
3472 if (requiredsize > ressize) {
3473 if (requiredsize<2*ressize)
3474 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003475 if (PyBytes_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003476 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003477 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003478 ressize = requiredsize;
3479 }
3480 /* generate replacement (temporarily (mis)uses p) */
3481 for (p = collstart; p < collend; ++p) {
3482 str += sprintf(str, "&#%d;", (int)*p);
3483 }
3484 p = collend;
3485 break;
3486 default:
3487 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3488 encoding, reason, startp, size, &exc,
3489 collstart-startp, collend-startp, &newpos);
3490 if (repunicode == NULL)
3491 goto onError;
3492 /* need more space? (at least enough for what we
3493 have+the replacement+the rest of the string, so
3494 we won't have to check space for encodable characters) */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003495 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003496 repsize = PyUnicode_GET_SIZE(repunicode);
3497 requiredsize = respos+repsize+(endp-collend);
3498 if (requiredsize > ressize) {
3499 if (requiredsize<2*ressize)
3500 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003501 if (PyBytes_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003502 Py_DECREF(repunicode);
3503 goto onError;
3504 }
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003505 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003506 ressize = requiredsize;
3507 }
3508 /* check if there is anything unencodable in the replacement
3509 and copy it to the output */
3510 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3511 c = *uni2;
3512 if (c >= limit) {
3513 raise_encode_exception(&exc, encoding, startp, size,
3514 unicodepos, unicodepos+1, reason);
3515 Py_DECREF(repunicode);
3516 goto onError;
3517 }
3518 *str = (char)c;
3519 }
3520 p = startp + newpos;
3521 Py_DECREF(repunicode);
3522 }
3523 }
3524 }
3525 /* Resize if we allocated to much */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003526 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003527 if (respos<ressize)
3528 /* If this falls res will be NULL */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003529 PyBytes_Resize(res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003530 Py_XDECREF(errorHandler);
3531 Py_XDECREF(exc);
3532 return res;
3533
3534 onError:
3535 Py_XDECREF(res);
3536 Py_XDECREF(errorHandler);
3537 Py_XDECREF(exc);
3538 return NULL;
3539}
3540
Guido van Rossumd57fd912000-03-10 22:53:23 +00003541PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003542 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003543 const char *errors)
3544{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003545 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003546}
3547
3548PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3549{
3550 if (!PyUnicode_Check(unicode)) {
3551 PyErr_BadArgument();
3552 return NULL;
3553 }
3554 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3555 PyUnicode_GET_SIZE(unicode),
3556 NULL);
3557}
3558
3559/* --- 7-bit ASCII Codec -------------------------------------------------- */
3560
Guido van Rossumd57fd912000-03-10 22:53:23 +00003561PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003562 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003563 const char *errors)
3564{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003565 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003566 PyUnicodeObject *v;
3567 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003568 Py_ssize_t startinpos;
3569 Py_ssize_t endinpos;
3570 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003571 const char *e;
3572 PyObject *errorHandler = NULL;
3573 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003574
Guido van Rossumd57fd912000-03-10 22:53:23 +00003575 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003576 if (size == 1 && *(unsigned char*)s < 128) {
3577 Py_UNICODE r = *(unsigned char*)s;
3578 return PyUnicode_FromUnicode(&r, 1);
3579 }
Tim Petersced69f82003-09-16 20:30:58 +00003580
Guido van Rossumd57fd912000-03-10 22:53:23 +00003581 v = _PyUnicode_New(size);
3582 if (v == NULL)
3583 goto onError;
3584 if (size == 0)
3585 return (PyObject *)v;
3586 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003587 e = s + size;
3588 while (s < e) {
3589 register unsigned char c = (unsigned char)*s;
3590 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003591 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003592 ++s;
3593 }
3594 else {
3595 startinpos = s-starts;
3596 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003597 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003598 if (unicode_decode_call_errorhandler(
3599 errors, &errorHandler,
3600 "ascii", "ordinal not in range(128)",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003601 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003602 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003603 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003604 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003605 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00003606 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003607 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003608 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003609 Py_XDECREF(errorHandler);
3610 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003611 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003612
Guido van Rossumd57fd912000-03-10 22:53:23 +00003613 onError:
3614 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003615 Py_XDECREF(errorHandler);
3616 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003617 return NULL;
3618}
3619
Guido van Rossumd57fd912000-03-10 22:53:23 +00003620PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003621 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003622 const char *errors)
3623{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003624 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003625}
3626
3627PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3628{
3629 if (!PyUnicode_Check(unicode)) {
3630 PyErr_BadArgument();
3631 return NULL;
3632 }
3633 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3634 PyUnicode_GET_SIZE(unicode),
3635 NULL);
3636}
3637
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003638#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003639
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003640/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003641
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003642#if SIZEOF_INT < SIZEOF_SSIZE_T
3643#define NEED_RETRY
3644#endif
3645
3646/* XXX This code is limited to "true" double-byte encodings, as
3647 a) it assumes an incomplete character consists of a single byte, and
3648 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3649 encodings, see IsDBCSLeadByteEx documentation. */
3650
/* Return non-zero when the byte at s[offset] should be treated as a DBCS
   lead byte.

   IsDBCSLeadByte() alone is not trusted: the byte is accepted only if
   CharPrev() does not step back from it, or the preceding character does
   not itself start with a lead byte, or the preceding character is two
   bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
}
3661
3662/*
3663 * Decode MBCS string into unicode object. If 'final' is set, converts
3664 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3665 */
3666static int decode_mbcs(PyUnicodeObject **v,
3667 const char *s, /* MBCS string */
3668 int size, /* sizeof MBCS string */
3669 int final)
3670{
3671 Py_UNICODE *p;
3672 Py_ssize_t n = 0;
3673 int usize = 0;
3674
3675 assert(size >= 0);
3676
3677 /* Skip trailing lead-byte unless 'final' is set */
3678 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3679 --size;
3680
3681 /* First get the size of the result */
3682 if (size > 0) {
3683 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3684 if (usize == 0) {
3685 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3686 return -1;
3687 }
3688 }
3689
3690 if (*v == NULL) {
3691 /* Create unicode object */
3692 *v = _PyUnicode_New(usize);
3693 if (*v == NULL)
3694 return -1;
3695 }
3696 else {
3697 /* Extend unicode object */
3698 n = PyUnicode_GET_SIZE(*v);
3699 if (_PyUnicode_Resize(v, n + usize) < 0)
3700 return -1;
3701 }
3702
3703 /* Do the conversion */
3704 if (size > 0) {
3705 p = PyUnicode_AS_UNICODE(*v) + n;
3706 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3707 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3708 return -1;
3709 }
3710 }
3711
3712 return size;
3713}
3714
3715PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3716 Py_ssize_t size,
3717 const char *errors,
3718 Py_ssize_t *consumed)
3719{
3720 PyUnicodeObject *v = NULL;
3721 int done;
3722
3723 if (consumed)
3724 *consumed = 0;
3725
3726#ifdef NEED_RETRY
3727 retry:
3728 if (size > INT_MAX)
3729 done = decode_mbcs(&v, s, INT_MAX, 0);
3730 else
3731#endif
3732 done = decode_mbcs(&v, s, (int)size, !consumed);
3733
3734 if (done < 0) {
3735 Py_XDECREF(v);
3736 return NULL;
3737 }
3738
3739 if (consumed)
3740 *consumed += done;
3741
3742#ifdef NEED_RETRY
3743 if (size > INT_MAX) {
3744 s += done;
3745 size -= done;
3746 goto retry;
3747 }
3748#endif
3749
3750 return (PyObject *)v;
3751}
3752
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003753PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003754 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003755 const char *errors)
3756{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003757 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3758}
3759
3760/*
3761 * Convert unicode into string object (MBCS).
3762 * Returns 0 if succeed, -1 otherwise.
3763 */
3764static int encode_mbcs(PyObject **repr,
3765 const Py_UNICODE *p, /* unicode */
3766 int size) /* size of unicode */
3767{
3768 int mbcssize = 0;
3769 Py_ssize_t n = 0;
3770
3771 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003772
3773 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003774 if (size > 0) {
3775 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3776 if (mbcssize == 0) {
3777 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3778 return -1;
3779 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003780 }
3781
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003782 if (*repr == NULL) {
3783 /* Create string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003784 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003785 if (*repr == NULL)
3786 return -1;
3787 }
3788 else {
3789 /* Extend string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003790 n = PyBytes_Size(*repr);
3791 if (PyBytes_Resize(*repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003792 return -1;
3793 }
3794
3795 /* Do the conversion */
3796 if (size > 0) {
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003797 char *s = PyBytes_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003798 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3799 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3800 return -1;
3801 }
3802 }
3803
3804 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003805}
3806
3807PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003808 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003809 const char *errors)
3810{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003811 PyObject *repr = NULL;
3812 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003813
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003814#ifdef NEED_RETRY
3815 retry:
3816 if (size > INT_MAX)
3817 ret = encode_mbcs(&repr, p, INT_MAX);
3818 else
3819#endif
3820 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003821
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003822 if (ret < 0) {
3823 Py_XDECREF(repr);
3824 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003825 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003826
3827#ifdef NEED_RETRY
3828 if (size > INT_MAX) {
3829 p += INT_MAX;
3830 size -= INT_MAX;
3831 goto retry;
3832 }
3833#endif
3834
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003835 return repr;
3836}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003837
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003838PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3839{
3840 if (!PyUnicode_Check(unicode)) {
3841 PyErr_BadArgument();
3842 return NULL;
3843 }
3844 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3845 PyUnicode_GET_SIZE(unicode),
3846 NULL);
3847}
3848
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003849#undef NEED_RETRY
3850
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003851#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003852
Guido van Rossumd57fd912000-03-10 22:53:23 +00003853/* --- Character Mapping Codec -------------------------------------------- */
3854
Guido van Rossumd57fd912000-03-10 22:53:23 +00003855PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003856 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003857 PyObject *mapping,
3858 const char *errors)
3859{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003860 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003861 Py_ssize_t startinpos;
3862 Py_ssize_t endinpos;
3863 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003864 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003865 PyUnicodeObject *v;
3866 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003867 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003868 PyObject *errorHandler = NULL;
3869 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003870 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003871 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003872
Guido van Rossumd57fd912000-03-10 22:53:23 +00003873 /* Default to Latin-1 */
3874 if (mapping == NULL)
3875 return PyUnicode_DecodeLatin1(s, size, errors);
3876
3877 v = _PyUnicode_New(size);
3878 if (v == NULL)
3879 goto onError;
3880 if (size == 0)
3881 return (PyObject *)v;
3882 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003883 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003884 if (PyUnicode_CheckExact(mapping)) {
3885 mapstring = PyUnicode_AS_UNICODE(mapping);
3886 maplen = PyUnicode_GET_SIZE(mapping);
3887 while (s < e) {
3888 unsigned char ch = *s;
3889 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003890
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003891 if (ch < maplen)
3892 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003893
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003894 if (x == 0xfffe) {
3895 /* undefined mapping */
3896 outpos = p-PyUnicode_AS_UNICODE(v);
3897 startinpos = s-starts;
3898 endinpos = startinpos+1;
3899 if (unicode_decode_call_errorhandler(
3900 errors, &errorHandler,
3901 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003902 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003903 (PyObject **)&v, &outpos, &p)) {
3904 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003905 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003906 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003907 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003908 *p++ = x;
3909 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003910 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003911 }
3912 else {
3913 while (s < e) {
3914 unsigned char ch = *s;
3915 PyObject *w, *x;
3916
3917 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3918 w = PyInt_FromLong((long)ch);
3919 if (w == NULL)
3920 goto onError;
3921 x = PyObject_GetItem(mapping, w);
3922 Py_DECREF(w);
3923 if (x == NULL) {
3924 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3925 /* No mapping found means: mapping is undefined. */
3926 PyErr_Clear();
3927 x = Py_None;
3928 Py_INCREF(x);
3929 } else
3930 goto onError;
3931 }
3932
3933 /* Apply mapping */
3934 if (PyInt_Check(x)) {
3935 long value = PyInt_AS_LONG(x);
3936 if (value < 0 || value > 65535) {
3937 PyErr_SetString(PyExc_TypeError,
3938 "character mapping must be in range(65536)");
3939 Py_DECREF(x);
3940 goto onError;
3941 }
3942 *p++ = (Py_UNICODE)value;
3943 }
3944 else if (x == Py_None) {
3945 /* undefined mapping */
3946 outpos = p-PyUnicode_AS_UNICODE(v);
3947 startinpos = s-starts;
3948 endinpos = startinpos+1;
3949 if (unicode_decode_call_errorhandler(
3950 errors, &errorHandler,
3951 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003952 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003953 (PyObject **)&v, &outpos, &p)) {
3954 Py_DECREF(x);
3955 goto onError;
3956 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003957 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003958 continue;
3959 }
3960 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003961 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003962
3963 if (targetsize == 1)
3964 /* 1-1 mapping */
3965 *p++ = *PyUnicode_AS_UNICODE(x);
3966
3967 else if (targetsize > 1) {
3968 /* 1-n mapping */
3969 if (targetsize > extrachars) {
3970 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003971 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3972 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003973 (targetsize << 2);
3974 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00003975 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003976 if (_PyUnicode_Resize(&v,
3977 PyUnicode_GET_SIZE(v) + needed) < 0) {
3978 Py_DECREF(x);
3979 goto onError;
3980 }
3981 p = PyUnicode_AS_UNICODE(v) + oldpos;
3982 }
3983 Py_UNICODE_COPY(p,
3984 PyUnicode_AS_UNICODE(x),
3985 targetsize);
3986 p += targetsize;
3987 extrachars -= targetsize;
3988 }
3989 /* 1-0 mapping: skip the character */
3990 }
3991 else {
3992 /* wrong return value */
3993 PyErr_SetString(PyExc_TypeError,
3994 "character mapping must return integer, None or unicode");
3995 Py_DECREF(x);
3996 goto onError;
3997 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003998 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003999 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004000 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004001 }
4002 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004003 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004004 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004005 Py_XDECREF(errorHandler);
4006 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004007 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004008
Guido van Rossumd57fd912000-03-10 22:53:23 +00004009 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004010 Py_XDECREF(errorHandler);
4011 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004012 Py_XDECREF(v);
4013 return NULL;
4014}
4015
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004016/* Charmap encoding: the lookup table */
4017
4018struct encoding_map{
4019 PyObject_HEAD
4020 unsigned char level1[32];
4021 int count2, count3;
4022 unsigned char level23[1];
4023};
4024
4025static PyObject*
4026encoding_map_size(PyObject *obj, PyObject* args)
4027{
4028 struct encoding_map *map = (struct encoding_map*)obj;
4029 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4030 128*map->count3);
4031}
4032
4033static PyMethodDef encoding_map_methods[] = {
4034 {"size", encoding_map_size, METH_NOARGS,
4035 PyDoc_STR("Return the size (in bytes) of this object") },
4036 { 0 }
4037};
4038
4039static void
4040encoding_map_dealloc(PyObject* o)
4041{
4042 PyObject_FREE(o);
4043}
4044
4045static PyTypeObject EncodingMapType = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004046 PyVarObject_HEAD_INIT(NULL, 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004047 "EncodingMap", /*tp_name*/
4048 sizeof(struct encoding_map), /*tp_basicsize*/
4049 0, /*tp_itemsize*/
4050 /* methods */
4051 encoding_map_dealloc, /*tp_dealloc*/
4052 0, /*tp_print*/
4053 0, /*tp_getattr*/
4054 0, /*tp_setattr*/
4055 0, /*tp_compare*/
4056 0, /*tp_repr*/
4057 0, /*tp_as_number*/
4058 0, /*tp_as_sequence*/
4059 0, /*tp_as_mapping*/
4060 0, /*tp_hash*/
4061 0, /*tp_call*/
4062 0, /*tp_str*/
4063 0, /*tp_getattro*/
4064 0, /*tp_setattro*/
4065 0, /*tp_as_buffer*/
4066 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4067 0, /*tp_doc*/
4068 0, /*tp_traverse*/
4069 0, /*tp_clear*/
4070 0, /*tp_richcompare*/
4071 0, /*tp_weaklistoffset*/
4072 0, /*tp_iter*/
4073 0, /*tp_iternext*/
4074 encoding_map_methods, /*tp_methods*/
4075 0, /*tp_members*/
4076 0, /*tp_getset*/
4077 0, /*tp_base*/
4078 0, /*tp_dict*/
4079 0, /*tp_descr_get*/
4080 0, /*tp_descr_set*/
4081 0, /*tp_dictoffset*/
4082 0, /*tp_init*/
4083 0, /*tp_alloc*/
4084 0, /*tp_new*/
4085 0, /*tp_free*/
4086 0, /*tp_is_gc*/
4087};
4088
4089PyObject*
4090PyUnicode_BuildEncodingMap(PyObject* string)
4091{
4092 Py_UNICODE *decode;
4093 PyObject *result;
4094 struct encoding_map *mresult;
4095 int i;
4096 int need_dict = 0;
4097 unsigned char level1[32];
4098 unsigned char level2[512];
4099 unsigned char *mlevel1, *mlevel2, *mlevel3;
4100 int count2 = 0, count3 = 0;
4101
4102 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4103 PyErr_BadArgument();
4104 return NULL;
4105 }
4106 decode = PyUnicode_AS_UNICODE(string);
4107 memset(level1, 0xFF, sizeof level1);
4108 memset(level2, 0xFF, sizeof level2);
4109
4110 /* If there isn't a one-to-one mapping of NULL to \0,
4111 or if there are non-BMP characters, we need to use
4112 a mapping dictionary. */
4113 if (decode[0] != 0)
4114 need_dict = 1;
4115 for (i = 1; i < 256; i++) {
4116 int l1, l2;
4117 if (decode[i] == 0
4118 #ifdef Py_UNICODE_WIDE
4119 || decode[i] > 0xFFFF
4120 #endif
4121 ) {
4122 need_dict = 1;
4123 break;
4124 }
4125 if (decode[i] == 0xFFFE)
4126 /* unmapped character */
4127 continue;
4128 l1 = decode[i] >> 11;
4129 l2 = decode[i] >> 7;
4130 if (level1[l1] == 0xFF)
4131 level1[l1] = count2++;
4132 if (level2[l2] == 0xFF)
4133 level2[l2] = count3++;
4134 }
4135
4136 if (count2 >= 0xFF || count3 >= 0xFF)
4137 need_dict = 1;
4138
4139 if (need_dict) {
4140 PyObject *result = PyDict_New();
4141 PyObject *key, *value;
4142 if (!result)
4143 return NULL;
4144 for (i = 0; i < 256; i++) {
4145 key = value = NULL;
4146 key = PyInt_FromLong(decode[i]);
4147 value = PyInt_FromLong(i);
4148 if (!key || !value)
4149 goto failed1;
4150 if (PyDict_SetItem(result, key, value) == -1)
4151 goto failed1;
4152 Py_DECREF(key);
4153 Py_DECREF(value);
4154 }
4155 return result;
4156 failed1:
4157 Py_XDECREF(key);
4158 Py_XDECREF(value);
4159 Py_DECREF(result);
4160 return NULL;
4161 }
4162
4163 /* Create a three-level trie */
4164 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4165 16*count2 + 128*count3 - 1);
4166 if (!result)
4167 return PyErr_NoMemory();
4168 PyObject_Init(result, &EncodingMapType);
4169 mresult = (struct encoding_map*)result;
4170 mresult->count2 = count2;
4171 mresult->count3 = count3;
4172 mlevel1 = mresult->level1;
4173 mlevel2 = mresult->level23;
4174 mlevel3 = mresult->level23 + 16*count2;
4175 memcpy(mlevel1, level1, 32);
4176 memset(mlevel2, 0xFF, 16*count2);
4177 memset(mlevel3, 0, 128*count3);
4178 count3 = 0;
4179 for (i = 1; i < 256; i++) {
4180 int o1, o2, o3, i2, i3;
4181 if (decode[i] == 0xFFFE)
4182 /* unmapped character */
4183 continue;
4184 o1 = decode[i]>>11;
4185 o2 = (decode[i]>>7) & 0xF;
4186 i2 = 16*mlevel1[o1] + o2;
4187 if (mlevel2[i2] == 0xFF)
4188 mlevel2[i2] = count3++;
4189 o3 = decode[i] & 0x7F;
4190 i3 = 128*mlevel2[i2] + o3;
4191 mlevel3[i3] = i;
4192 }
4193 return result;
4194}
4195
4196static int
4197encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4198{
4199 struct encoding_map *map = (struct encoding_map*)mapping;
4200 int l1 = c>>11;
4201 int l2 = (c>>7) & 0xF;
4202 int l3 = c & 0x7F;
4203 int i;
4204
4205#ifdef Py_UNICODE_WIDE
4206 if (c > 0xFFFF) {
4207 return -1;
4208 }
4209#endif
4210 if (c == 0)
4211 return 0;
4212 /* level 1*/
4213 i = map->level1[l1];
4214 if (i == 0xFF) {
4215 return -1;
4216 }
4217 /* level 2*/
4218 i = map->level23[16*i+l2];
4219 if (i == 0xFF) {
4220 return -1;
4221 }
4222 /* level 3 */
4223 i = map->level23[16*map->count2 + 128*i + l3];
4224 if (i == 0) {
4225 return -1;
4226 }
4227 return i;
4228}
4229
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004230/* Lookup the character ch in the mapping. If the character
4231 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004232 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004233static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004234{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004235 PyObject *w = PyInt_FromLong((long)c);
4236 PyObject *x;
4237
4238 if (w == NULL)
4239 return NULL;
4240 x = PyObject_GetItem(mapping, w);
4241 Py_DECREF(w);
4242 if (x == NULL) {
4243 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4244 /* No mapping found means: mapping is undefined. */
4245 PyErr_Clear();
4246 x = Py_None;
4247 Py_INCREF(x);
4248 return x;
4249 } else
4250 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004251 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004252 else if (x == Py_None)
4253 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004254 else if (PyInt_Check(x)) {
4255 long value = PyInt_AS_LONG(x);
4256 if (value < 0 || value > 255) {
4257 PyErr_SetString(PyExc_TypeError,
4258 "character mapping must be in range(256)");
4259 Py_DECREF(x);
4260 return NULL;
4261 }
4262 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004263 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004264 else if (PyString_Check(x))
4265 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004266 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004267 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00004268 PyErr_Format(PyExc_TypeError,
4269 "character mapping must return integer, None or str8, not %.400s",
4270 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004271 Py_DECREF(x);
4272 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004273 }
4274}
4275
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004276static int
Walter Dörwald827b0552007-05-12 13:23:53 +00004277charmapencode_resize(PyObject *outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004278{
Walter Dörwald827b0552007-05-12 13:23:53 +00004279 Py_ssize_t outsize = PyBytes_GET_SIZE( outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004280 /* exponentially overallocate to minimize reallocations */
4281 if (requiredsize < 2*outsize)
4282 requiredsize = 2*outsize;
Walter Dörwald827b0552007-05-12 13:23:53 +00004283 if (PyBytes_Resize(outobj, requiredsize)) {
4284 Py_DECREF(outobj);
4285 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004286 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004287 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004288}
4289
/* Outcome of trying to encode a single character. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* character was written to the output */
    enc_FAILED,     /* character has no mapping; nothing was written */
    enc_EXCEPTION   /* an exception is set */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004293/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00004294 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004295 space is available. Return a new reference to the object that
4296 was put in the output buffer, or Py_None, if the mapping was undefined
4297 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004298 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004299static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004300charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Walter Dörwald827b0552007-05-12 13:23:53 +00004301 PyObject *outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004302{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004303 PyObject *rep;
4304 char *outstart;
Walter Dörwald827b0552007-05-12 13:23:53 +00004305 Py_ssize_t outsize = PyBytes_GET_SIZE(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004306
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004307 if (Py_Type(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004308 int res = encoding_map_lookup(c, mapping);
4309 Py_ssize_t requiredsize = *outpos+1;
4310 if (res == -1)
4311 return enc_FAILED;
4312 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004313 if (charmapencode_resize(outobj, outpos, requiredsize))
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004314 return enc_EXCEPTION;
Walter Dörwald827b0552007-05-12 13:23:53 +00004315 outstart = PyBytes_AS_STRING(outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004316 outstart[(*outpos)++] = (char)res;
4317 return enc_SUCCESS;
4318 }
4319
4320 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004321 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004322 return enc_EXCEPTION;
4323 else if (rep==Py_None) {
4324 Py_DECREF(rep);
4325 return enc_FAILED;
4326 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004327 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004328 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004329 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004330 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004331 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004332 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004333 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004334 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004335 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4336 }
4337 else {
4338 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004339 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4340 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004341 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004342 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004343 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004344 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004345 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004346 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004347 memcpy(outstart + *outpos, repchars, repsize);
4348 *outpos += repsize;
4349 }
4350 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004351 Py_DECREF(rep);
4352 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004353}
4354
4355/* handle an error in PyUnicode_EncodeCharmap
4356 Return 0 on success, -1 on error */
4357static
4358int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004359 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004360 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004361 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004362 PyObject *res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004363{
4364 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004365 Py_ssize_t repsize;
4366 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004367 Py_UNICODE *uni2;
4368 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004369 Py_ssize_t collstartpos = *inpos;
4370 Py_ssize_t collendpos = *inpos+1;
4371 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004372 char *encoding = "charmap";
4373 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004374 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004375
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004376 /* find all unencodable characters */
4377 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004378 PyObject *rep;
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004379 if (Py_Type(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004380 int res = encoding_map_lookup(p[collendpos], mapping);
4381 if (res != -1)
4382 break;
4383 ++collendpos;
4384 continue;
4385 }
4386
4387 rep = charmapencode_lookup(p[collendpos], mapping);
4388 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004389 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004390 else if (rep!=Py_None) {
4391 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004392 break;
4393 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004394 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004395 ++collendpos;
4396 }
4397 /* cache callback name lookup
4398 * (if not done yet, i.e. it's the first error) */
4399 if (*known_errorHandler==-1) {
4400 if ((errors==NULL) || (!strcmp(errors, "strict")))
4401 *known_errorHandler = 1;
4402 else if (!strcmp(errors, "replace"))
4403 *known_errorHandler = 2;
4404 else if (!strcmp(errors, "ignore"))
4405 *known_errorHandler = 3;
4406 else if (!strcmp(errors, "xmlcharrefreplace"))
4407 *known_errorHandler = 4;
4408 else
4409 *known_errorHandler = 0;
4410 }
4411 switch (*known_errorHandler) {
4412 case 1: /* strict */
4413 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4414 return -1;
4415 case 2: /* replace */
4416 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4417 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004418 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004419 return -1;
4420 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004421 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004422 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4423 return -1;
4424 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004425 }
4426 /* fall through */
4427 case 3: /* ignore */
4428 *inpos = collendpos;
4429 break;
4430 case 4: /* xmlcharrefreplace */
4431 /* generate replacement (temporarily (mis)uses p) */
4432 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4433 char buffer[2+29+1+1];
4434 char *cp;
4435 sprintf(buffer, "&#%d;", (int)p[collpos]);
4436 for (cp = buffer; *cp; ++cp) {
4437 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004438 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004439 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004440 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004441 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4442 return -1;
4443 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004444 }
4445 }
4446 *inpos = collendpos;
4447 break;
4448 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004449 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004450 encoding, reason, p, size, exceptionObject,
4451 collstartpos, collendpos, &newpos);
4452 if (repunicode == NULL)
4453 return -1;
4454 /* generate replacement */
4455 repsize = PyUnicode_GET_SIZE(repunicode);
4456 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4457 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004458 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004459 return -1;
4460 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004461 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004462 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004463 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4464 return -1;
4465 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004466 }
4467 *inpos = newpos;
4468 Py_DECREF(repunicode);
4469 }
4470 return 0;
4471}
4472
Guido van Rossumd57fd912000-03-10 22:53:23 +00004473PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004474 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004475 PyObject *mapping,
4476 const char *errors)
4477{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004478 /* output object */
4479 PyObject *res = NULL;
4480 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004481 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004482 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004483 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004484 PyObject *errorHandler = NULL;
4485 PyObject *exc = NULL;
4486 /* the following variable is used for caching string comparisons
4487 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4488 * 3=ignore, 4=xmlcharrefreplace */
4489 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004490
4491 /* Default to Latin-1 */
4492 if (mapping == NULL)
4493 return PyUnicode_EncodeLatin1(p, size, errors);
4494
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004495 /* allocate enough for a simple encoding without
4496 replacements, if we need more, we'll resize */
Walter Dörwald827b0552007-05-12 13:23:53 +00004497 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004498 if (res == NULL)
4499 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004500 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004501 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004502
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004503 while (inpos<size) {
4504 /* try to encode it */
Walter Dörwald827b0552007-05-12 13:23:53 +00004505 charmapencode_result x = charmapencode_output(p[inpos], mapping, res, &respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004506 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004507 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004508 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004509 if (charmap_encoding_error(p, size, &inpos, mapping,
4510 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004511 &known_errorHandler, &errorHandler, errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004512 res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004513 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004514 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004515 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004516 else
4517 /* done with this character => adjust input position */
4518 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004519 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004520
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004521 /* Resize if we allocated to much */
Walter Dörwald827b0552007-05-12 13:23:53 +00004522 if (respos<PyBytes_GET_SIZE(res)) {
4523 if (PyBytes_Resize(res, respos))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004524 goto onError;
4525 }
4526 Py_XDECREF(exc);
4527 Py_XDECREF(errorHandler);
4528 return res;
4529
4530 onError:
4531 Py_XDECREF(res);
4532 Py_XDECREF(exc);
4533 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004534 return NULL;
4535}
4536
4537PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4538 PyObject *mapping)
4539{
4540 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4541 PyErr_BadArgument();
4542 return NULL;
4543 }
4544 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4545 PyUnicode_GET_SIZE(unicode),
4546 mapping,
4547 NULL);
4548}
4549
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004550/* create or adjust a UnicodeTranslateError */
4551static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004552 const Py_UNICODE *unicode, Py_ssize_t size,
4553 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004554 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004555{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004556 if (*exceptionObject == NULL) {
4557 *exceptionObject = PyUnicodeTranslateError_Create(
4558 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004559 }
4560 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004561 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4562 goto onError;
4563 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4564 goto onError;
4565 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4566 goto onError;
4567 return;
4568 onError:
4569 Py_DECREF(*exceptionObject);
4570 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004571 }
4572}
4573
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004574/* raises a UnicodeTranslateError */
4575static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004576 const Py_UNICODE *unicode, Py_ssize_t size,
4577 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004578 const char *reason)
4579{
4580 make_translate_exception(exceptionObject,
4581 unicode, size, startpos, endpos, reason);
4582 if (*exceptionObject != NULL)
4583 PyCodec_StrictErrors(*exceptionObject);
4584}
4585
4586/* error handling callback helper:
4587 build arguments, call the callback and check the arguments,
4588 put the result into newpos and return the replacement string, which
4589 has to be freed by the caller */
4590static PyObject *unicode_translate_call_errorhandler(const char *errors,
4591 PyObject **errorHandler,
4592 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004593 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4594 Py_ssize_t startpos, Py_ssize_t endpos,
4595 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004596{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004597 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004598
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004599 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004600 PyObject *restuple;
4601 PyObject *resunicode;
4602
4603 if (*errorHandler == NULL) {
4604 *errorHandler = PyCodec_LookupError(errors);
4605 if (*errorHandler == NULL)
4606 return NULL;
4607 }
4608
4609 make_translate_exception(exceptionObject,
4610 unicode, size, startpos, endpos, reason);
4611 if (*exceptionObject == NULL)
4612 return NULL;
4613
4614 restuple = PyObject_CallFunctionObjArgs(
4615 *errorHandler, *exceptionObject, NULL);
4616 if (restuple == NULL)
4617 return NULL;
4618 if (!PyTuple_Check(restuple)) {
4619 PyErr_Format(PyExc_TypeError, &argparse[4]);
4620 Py_DECREF(restuple);
4621 return NULL;
4622 }
4623 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004624 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004625 Py_DECREF(restuple);
4626 return NULL;
4627 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004628 if (i_newpos<0)
4629 *newpos = size+i_newpos;
4630 else
4631 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004632 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004633 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004634 Py_DECREF(restuple);
4635 return NULL;
4636 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004637 Py_INCREF(resunicode);
4638 Py_DECREF(restuple);
4639 return resunicode;
4640}
4641
4642/* Lookup the character ch in the mapping and put the result in result,
4643 which must be decrefed by the caller.
4644 Return 0 on success, -1 on error */
4645static
4646int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4647{
4648 PyObject *w = PyInt_FromLong((long)c);
4649 PyObject *x;
4650
4651 if (w == NULL)
4652 return -1;
4653 x = PyObject_GetItem(mapping, w);
4654 Py_DECREF(w);
4655 if (x == NULL) {
4656 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4657 /* No mapping found means: use 1:1 mapping. */
4658 PyErr_Clear();
4659 *result = NULL;
4660 return 0;
4661 } else
4662 return -1;
4663 }
4664 else if (x == Py_None) {
4665 *result = x;
4666 return 0;
4667 }
4668 else if (PyInt_Check(x)) {
4669 long value = PyInt_AS_LONG(x);
4670 long max = PyUnicode_GetMax();
4671 if (value < 0 || value > max) {
4672 PyErr_Format(PyExc_TypeError,
4673 "character mapping must be in range(0x%lx)", max+1);
4674 Py_DECREF(x);
4675 return -1;
4676 }
4677 *result = x;
4678 return 0;
4679 }
4680 else if (PyUnicode_Check(x)) {
4681 *result = x;
4682 return 0;
4683 }
4684 else {
4685 /* wrong return value */
4686 PyErr_SetString(PyExc_TypeError,
4687 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004688 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004689 return -1;
4690 }
4691}
4692/* ensure that *outobj is at least requiredsize characters long,
4693if not reallocate and adjust various state variables.
4694Return 0 on success, -1 on error */
4695static
Walter Dörwald4894c302003-10-24 14:25:28 +00004696int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004697 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004698{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004699 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004700 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004701 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004702 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004703 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004704 if (requiredsize < 2 * oldsize)
4705 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004706 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004707 return -1;
4708 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004709 }
4710 return 0;
4711}
4712/* lookup the character, put the result in the output string and adjust
4713 various state variables. Return a new reference to the object that
4714 was put in the output buffer in *result, or Py_None, if the mapping was
4715 undefined (in which case no character was written).
4716 The called must decref result.
4717 Return 0 on success, -1 on error. */
4718static
Walter Dörwald4894c302003-10-24 14:25:28 +00004719int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004720 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004721 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004722{
Walter Dörwald4894c302003-10-24 14:25:28 +00004723 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004724 return -1;
4725 if (*res==NULL) {
4726 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004727 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004728 }
4729 else if (*res==Py_None)
4730 ;
4731 else if (PyInt_Check(*res)) {
4732 /* no overflow check, because we know that the space is enough */
4733 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4734 }
4735 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004736 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004737 if (repsize==1) {
4738 /* no overflow check, because we know that the space is enough */
4739 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4740 }
4741 else if (repsize!=0) {
4742 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004743 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004744 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004745 repsize - 1;
4746 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004747 return -1;
4748 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4749 *outp += repsize;
4750 }
4751 }
4752 else
4753 return -1;
4754 return 0;
4755}
4756
4757PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004758 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004759 PyObject *mapping,
4760 const char *errors)
4761{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004762 /* output object */
4763 PyObject *res = NULL;
4764 /* pointers to the beginning and end+1 of input */
4765 const Py_UNICODE *startp = p;
4766 const Py_UNICODE *endp = p + size;
4767 /* pointer into the output */
4768 Py_UNICODE *str;
4769 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004770 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004771 char *reason = "character maps to <undefined>";
4772 PyObject *errorHandler = NULL;
4773 PyObject *exc = NULL;
4774 /* the following variable is used for caching string comparisons
4775 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4776 * 3=ignore, 4=xmlcharrefreplace */
4777 int known_errorHandler = -1;
4778
Guido van Rossumd57fd912000-03-10 22:53:23 +00004779 if (mapping == NULL) {
4780 PyErr_BadArgument();
4781 return NULL;
4782 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004783
4784 /* allocate enough for a simple 1:1 translation without
4785 replacements, if we need more, we'll resize */
4786 res = PyUnicode_FromUnicode(NULL, size);
4787 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004788 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004789 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004790 return res;
4791 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004792
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004793 while (p<endp) {
4794 /* try to encode it */
4795 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004796 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004797 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004798 goto onError;
4799 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004800 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004801 if (x!=Py_None) /* it worked => adjust input pointer */
4802 ++p;
4803 else { /* untranslatable character */
4804 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004805 Py_ssize_t repsize;
4806 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004807 Py_UNICODE *uni2;
4808 /* startpos for collecting untranslatable chars */
4809 const Py_UNICODE *collstart = p;
4810 const Py_UNICODE *collend = p+1;
4811 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004812
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004813 /* find all untranslatable characters */
4814 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004815 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004816 goto onError;
4817 Py_XDECREF(x);
4818 if (x!=Py_None)
4819 break;
4820 ++collend;
4821 }
4822 /* cache callback name lookup
4823 * (if not done yet, i.e. it's the first error) */
4824 if (known_errorHandler==-1) {
4825 if ((errors==NULL) || (!strcmp(errors, "strict")))
4826 known_errorHandler = 1;
4827 else if (!strcmp(errors, "replace"))
4828 known_errorHandler = 2;
4829 else if (!strcmp(errors, "ignore"))
4830 known_errorHandler = 3;
4831 else if (!strcmp(errors, "xmlcharrefreplace"))
4832 known_errorHandler = 4;
4833 else
4834 known_errorHandler = 0;
4835 }
4836 switch (known_errorHandler) {
4837 case 1: /* strict */
4838 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4839 goto onError;
4840 case 2: /* replace */
4841 /* No need to check for space, this is a 1:1 replacement */
4842 for (coll = collstart; coll<collend; ++coll)
4843 *str++ = '?';
4844 /* fall through */
4845 case 3: /* ignore */
4846 p = collend;
4847 break;
4848 case 4: /* xmlcharrefreplace */
4849 /* generate replacement (temporarily (mis)uses p) */
4850 for (p = collstart; p < collend; ++p) {
4851 char buffer[2+29+1+1];
4852 char *cp;
4853 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004854 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004855 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4856 goto onError;
4857 for (cp = buffer; *cp; ++cp)
4858 *str++ = *cp;
4859 }
4860 p = collend;
4861 break;
4862 default:
4863 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4864 reason, startp, size, &exc,
4865 collstart-startp, collend-startp, &newpos);
4866 if (repunicode == NULL)
4867 goto onError;
4868 /* generate replacement */
4869 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004870 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004871 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4872 Py_DECREF(repunicode);
4873 goto onError;
4874 }
4875 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4876 *str++ = *uni2;
4877 p = startp + newpos;
4878 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004879 }
4880 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004881 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004882 /* Resize if we allocated to much */
4883 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004884 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004885 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004886 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004887 }
4888 Py_XDECREF(exc);
4889 Py_XDECREF(errorHandler);
4890 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004891
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004892 onError:
4893 Py_XDECREF(res);
4894 Py_XDECREF(exc);
4895 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004896 return NULL;
4897}
4898
4899PyObject *PyUnicode_Translate(PyObject *str,
4900 PyObject *mapping,
4901 const char *errors)
4902{
4903 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004904
Guido van Rossumd57fd912000-03-10 22:53:23 +00004905 str = PyUnicode_FromObject(str);
4906 if (str == NULL)
4907 goto onError;
4908 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4909 PyUnicode_GET_SIZE(str),
4910 mapping,
4911 errors);
4912 Py_DECREF(str);
4913 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004914
Guido van Rossumd57fd912000-03-10 22:53:23 +00004915 onError:
4916 Py_XDECREF(str);
4917 return NULL;
4918}
Tim Petersced69f82003-09-16 20:30:58 +00004919
Guido van Rossum9e896b32000-04-05 20:11:21 +00004920/* --- Decimal Encoder ---------------------------------------------------- */
4921
4922int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004923 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004924 char *output,
4925 const char *errors)
4926{
4927 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004928 PyObject *errorHandler = NULL;
4929 PyObject *exc = NULL;
4930 const char *encoding = "decimal";
4931 const char *reason = "invalid decimal Unicode string";
4932 /* the following variable is used for caching string comparisons
4933 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4934 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004935
4936 if (output == NULL) {
4937 PyErr_BadArgument();
4938 return -1;
4939 }
4940
4941 p = s;
4942 end = s + length;
4943 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004944 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004945 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004946 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004947 Py_ssize_t repsize;
4948 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004949 Py_UNICODE *uni2;
4950 Py_UNICODE *collstart;
4951 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004952
Guido van Rossum9e896b32000-04-05 20:11:21 +00004953 if (Py_UNICODE_ISSPACE(ch)) {
4954 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004955 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004956 continue;
4957 }
4958 decimal = Py_UNICODE_TODECIMAL(ch);
4959 if (decimal >= 0) {
4960 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004961 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004962 continue;
4963 }
Guido van Rossumba477042000-04-06 18:18:10 +00004964 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004965 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004966 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004967 continue;
4968 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004969 /* All other characters are considered unencodable */
4970 collstart = p;
4971 collend = p+1;
4972 while (collend < end) {
4973 if ((0 < *collend && *collend < 256) ||
4974 !Py_UNICODE_ISSPACE(*collend) ||
4975 Py_UNICODE_TODECIMAL(*collend))
4976 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004977 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004978 /* cache callback name lookup
4979 * (if not done yet, i.e. it's the first error) */
4980 if (known_errorHandler==-1) {
4981 if ((errors==NULL) || (!strcmp(errors, "strict")))
4982 known_errorHandler = 1;
4983 else if (!strcmp(errors, "replace"))
4984 known_errorHandler = 2;
4985 else if (!strcmp(errors, "ignore"))
4986 known_errorHandler = 3;
4987 else if (!strcmp(errors, "xmlcharrefreplace"))
4988 known_errorHandler = 4;
4989 else
4990 known_errorHandler = 0;
4991 }
4992 switch (known_errorHandler) {
4993 case 1: /* strict */
4994 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4995 goto onError;
4996 case 2: /* replace */
4997 for (p = collstart; p < collend; ++p)
4998 *output++ = '?';
4999 /* fall through */
5000 case 3: /* ignore */
5001 p = collend;
5002 break;
5003 case 4: /* xmlcharrefreplace */
5004 /* generate replacement (temporarily (mis)uses p) */
5005 for (p = collstart; p < collend; ++p)
5006 output += sprintf(output, "&#%d;", (int)*p);
5007 p = collend;
5008 break;
5009 default:
5010 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5011 encoding, reason, s, length, &exc,
5012 collstart-s, collend-s, &newpos);
5013 if (repunicode == NULL)
5014 goto onError;
5015 /* generate replacement */
5016 repsize = PyUnicode_GET_SIZE(repunicode);
5017 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5018 Py_UNICODE ch = *uni2;
5019 if (Py_UNICODE_ISSPACE(ch))
5020 *output++ = ' ';
5021 else {
5022 decimal = Py_UNICODE_TODECIMAL(ch);
5023 if (decimal >= 0)
5024 *output++ = '0' + decimal;
5025 else if (0 < ch && ch < 256)
5026 *output++ = (char)ch;
5027 else {
5028 Py_DECREF(repunicode);
5029 raise_encode_exception(&exc, encoding,
5030 s, length, collstart-s, collend-s, reason);
5031 goto onError;
5032 }
5033 }
5034 }
5035 p = s + newpos;
5036 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005037 }
5038 }
5039 /* 0-terminate the output string */
5040 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005041 Py_XDECREF(exc);
5042 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005043 return 0;
5044
5045 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005046 Py_XDECREF(exc);
5047 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005048 return -1;
5049}
5050
Guido van Rossumd57fd912000-03-10 22:53:23 +00005051/* --- Helpers ------------------------------------------------------------ */
5052
Eric Smith8c663262007-08-25 02:26:07 +00005053#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005054
5055#include "stringlib/fastsearch.h"
5056
5057#include "stringlib/count.h"
5058#include "stringlib/find.h"
5059#include "stringlib/partition.h"
5060
/* Helper macro that normalizes the slice bounds held in the variables
   `start` and `end` of the enclosing scope against (obj)->length:
   a negative bound has the length added once and is then floored at 0,
   and `end` is first capped at the length.  `start` is not capped at the
   length.  Expands to plain statements, so use it as a statement of its
   own. */
#define FIX_START_END(obj)                      \
    if (start < 0) {                            \
        start += (obj)->length;                 \
        if (start < 0) {                        \
            start = 0;                          \
        }                                       \
    }                                           \
    if (end > (obj)->length) {                  \
        end = (obj)->length;                    \
    }                                           \
    if (end < 0) {                              \
        end += (obj)->length;                   \
        if (end < 0) {                          \
            end = 0;                            \
        }                                       \
    }
5073
Martin v. Löwis18e16552006-02-15 17:27:45 +00005074Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005075 PyObject *substr,
5076 Py_ssize_t start,
5077 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005078{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005079 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005080 PyUnicodeObject* str_obj;
5081 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005082
Thomas Wouters477c8d52006-05-27 19:21:47 +00005083 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5084 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005085 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005086 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5087 if (!sub_obj) {
5088 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005089 return -1;
5090 }
Tim Petersced69f82003-09-16 20:30:58 +00005091
Thomas Wouters477c8d52006-05-27 19:21:47 +00005092 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005093
Thomas Wouters477c8d52006-05-27 19:21:47 +00005094 result = stringlib_count(
5095 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5096 );
5097
5098 Py_DECREF(sub_obj);
5099 Py_DECREF(str_obj);
5100
Guido van Rossumd57fd912000-03-10 22:53:23 +00005101 return result;
5102}
5103
Martin v. Löwis18e16552006-02-15 17:27:45 +00005104Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005105 PyObject *sub,
5106 Py_ssize_t start,
5107 Py_ssize_t end,
5108 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005109{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005110 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005111
Guido van Rossumd57fd912000-03-10 22:53:23 +00005112 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005113 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005114 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005115 sub = PyUnicode_FromObject(sub);
5116 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00005117 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005118 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005119 }
Tim Petersced69f82003-09-16 20:30:58 +00005120
Thomas Wouters477c8d52006-05-27 19:21:47 +00005121 if (direction > 0)
5122 result = stringlib_find_slice(
5123 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5124 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5125 start, end
5126 );
5127 else
5128 result = stringlib_rfind_slice(
5129 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5130 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5131 start, end
5132 );
5133
Guido van Rossumd57fd912000-03-10 22:53:23 +00005134 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005135 Py_DECREF(sub);
5136
Guido van Rossumd57fd912000-03-10 22:53:23 +00005137 return result;
5138}
5139
Tim Petersced69f82003-09-16 20:30:58 +00005140static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005141int tailmatch(PyUnicodeObject *self,
5142 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005143 Py_ssize_t start,
5144 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005145 int direction)
5146{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005147 if (substring->length == 0)
5148 return 1;
5149
Thomas Wouters477c8d52006-05-27 19:21:47 +00005150 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005151
5152 end -= substring->length;
5153 if (end < start)
5154 return 0;
5155
5156 if (direction > 0) {
5157 if (Py_UNICODE_MATCH(self, end, substring))
5158 return 1;
5159 } else {
5160 if (Py_UNICODE_MATCH(self, start, substring))
5161 return 1;
5162 }
5163
5164 return 0;
5165}
5166
Martin v. Löwis18e16552006-02-15 17:27:45 +00005167Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005168 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005169 Py_ssize_t start,
5170 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005171 int direction)
5172{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005173 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005174
Guido van Rossumd57fd912000-03-10 22:53:23 +00005175 str = PyUnicode_FromObject(str);
5176 if (str == NULL)
5177 return -1;
5178 substr = PyUnicode_FromObject(substr);
5179 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005180 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005181 return -1;
5182 }
Tim Petersced69f82003-09-16 20:30:58 +00005183
Guido van Rossumd57fd912000-03-10 22:53:23 +00005184 result = tailmatch((PyUnicodeObject *)str,
5185 (PyUnicodeObject *)substr,
5186 start, end, direction);
5187 Py_DECREF(str);
5188 Py_DECREF(substr);
5189 return result;
5190}
5191
Guido van Rossumd57fd912000-03-10 22:53:23 +00005192/* Apply fixfct filter to the Unicode object self and return a
5193 reference to the modified object */
5194
Tim Petersced69f82003-09-16 20:30:58 +00005195static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005196PyObject *fixup(PyUnicodeObject *self,
5197 int (*fixfct)(PyUnicodeObject *s))
5198{
5199
5200 PyUnicodeObject *u;
5201
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005202 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005203 if (u == NULL)
5204 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005205
5206 Py_UNICODE_COPY(u->str, self->str, self->length);
5207
Tim Peters7a29bd52001-09-12 03:03:31 +00005208 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005209 /* fixfct should return TRUE if it modified the buffer. If
5210 FALSE, return a reference to the original buffer instead
5211 (to save space, not time) */
5212 Py_INCREF(self);
5213 Py_DECREF(u);
5214 return (PyObject*) self;
5215 }
5216 return (PyObject*) u;
5217}
5218
Tim Petersced69f82003-09-16 20:30:58 +00005219static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005220int fixupper(PyUnicodeObject *self)
5221{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005222 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005223 Py_UNICODE *s = self->str;
5224 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005225
Guido van Rossumd57fd912000-03-10 22:53:23 +00005226 while (len-- > 0) {
5227 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005228
Guido van Rossumd57fd912000-03-10 22:53:23 +00005229 ch = Py_UNICODE_TOUPPER(*s);
5230 if (ch != *s) {
5231 status = 1;
5232 *s = ch;
5233 }
5234 s++;
5235 }
5236
5237 return status;
5238}
5239
Tim Petersced69f82003-09-16 20:30:58 +00005240static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005241int fixlower(PyUnicodeObject *self)
5242{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005243 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005244 Py_UNICODE *s = self->str;
5245 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005246
Guido van Rossumd57fd912000-03-10 22:53:23 +00005247 while (len-- > 0) {
5248 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005249
Guido van Rossumd57fd912000-03-10 22:53:23 +00005250 ch = Py_UNICODE_TOLOWER(*s);
5251 if (ch != *s) {
5252 status = 1;
5253 *s = ch;
5254 }
5255 s++;
5256 }
5257
5258 return status;
5259}
5260
Tim Petersced69f82003-09-16 20:30:58 +00005261static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005262int fixswapcase(PyUnicodeObject *self)
5263{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005264 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005265 Py_UNICODE *s = self->str;
5266 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005267
Guido van Rossumd57fd912000-03-10 22:53:23 +00005268 while (len-- > 0) {
5269 if (Py_UNICODE_ISUPPER(*s)) {
5270 *s = Py_UNICODE_TOLOWER(*s);
5271 status = 1;
5272 } else if (Py_UNICODE_ISLOWER(*s)) {
5273 *s = Py_UNICODE_TOUPPER(*s);
5274 status = 1;
5275 }
5276 s++;
5277 }
5278
5279 return status;
5280}
5281
Tim Petersced69f82003-09-16 20:30:58 +00005282static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005283int fixcapitalize(PyUnicodeObject *self)
5284{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005285 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005286 Py_UNICODE *s = self->str;
5287 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005288
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005289 if (len == 0)
5290 return 0;
5291 if (Py_UNICODE_ISLOWER(*s)) {
5292 *s = Py_UNICODE_TOUPPER(*s);
5293 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005294 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005295 s++;
5296 while (--len > 0) {
5297 if (Py_UNICODE_ISUPPER(*s)) {
5298 *s = Py_UNICODE_TOLOWER(*s);
5299 status = 1;
5300 }
5301 s++;
5302 }
5303 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005304}
5305
5306static
5307int fixtitle(PyUnicodeObject *self)
5308{
5309 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5310 register Py_UNICODE *e;
5311 int previous_is_cased;
5312
5313 /* Shortcut for single character strings */
5314 if (PyUnicode_GET_SIZE(self) == 1) {
5315 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5316 if (*p != ch) {
5317 *p = ch;
5318 return 1;
5319 }
5320 else
5321 return 0;
5322 }
Tim Petersced69f82003-09-16 20:30:58 +00005323
Guido van Rossumd57fd912000-03-10 22:53:23 +00005324 e = p + PyUnicode_GET_SIZE(self);
5325 previous_is_cased = 0;
5326 for (; p < e; p++) {
5327 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005328
Guido van Rossumd57fd912000-03-10 22:53:23 +00005329 if (previous_is_cased)
5330 *p = Py_UNICODE_TOLOWER(ch);
5331 else
5332 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005333
5334 if (Py_UNICODE_ISLOWER(ch) ||
5335 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005336 Py_UNICODE_ISTITLE(ch))
5337 previous_is_cased = 1;
5338 else
5339 previous_is_cased = 0;
5340 }
5341 return 1;
5342}
5343
Tim Peters8ce9f162004-08-27 01:49:32 +00005344PyObject *
5345PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005346{
Tim Peters8ce9f162004-08-27 01:49:32 +00005347 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005348 const Py_UNICODE blank = ' ';
5349 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005350 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005351 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005352 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5353 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005354 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5355 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005356 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005357 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005358 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005359
Tim Peters05eba1f2004-08-27 21:32:02 +00005360 fseq = PySequence_Fast(seq, "");
5361 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005362 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005363 }
5364
Tim Peters91879ab2004-08-27 22:35:44 +00005365 /* Grrrr. A codec may be invoked to convert str objects to
5366 * Unicode, and so it's possible to call back into Python code
5367 * during PyUnicode_FromObject(), and so it's possible for a sick
5368 * codec to change the size of fseq (if seq is a list). Therefore
5369 * we have to keep refetching the size -- can't assume seqlen
5370 * is invariant.
5371 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005372 seqlen = PySequence_Fast_GET_SIZE(fseq);
5373 /* If empty sequence, return u"". */
5374 if (seqlen == 0) {
5375 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5376 goto Done;
5377 }
5378 /* If singleton sequence with an exact Unicode, return that. */
5379 if (seqlen == 1) {
5380 item = PySequence_Fast_GET_ITEM(fseq, 0);
5381 if (PyUnicode_CheckExact(item)) {
5382 Py_INCREF(item);
5383 res = (PyUnicodeObject *)item;
5384 goto Done;
5385 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005386 }
5387
Tim Peters05eba1f2004-08-27 21:32:02 +00005388 /* At least two items to join, or one that isn't exact Unicode. */
5389 if (seqlen > 1) {
5390 /* Set up sep and seplen -- they're needed. */
5391 if (separator == NULL) {
5392 sep = &blank;
5393 seplen = 1;
5394 }
5395 else {
5396 internal_separator = PyUnicode_FromObject(separator);
5397 if (internal_separator == NULL)
5398 goto onError;
5399 sep = PyUnicode_AS_UNICODE(internal_separator);
5400 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00005401 /* In case PyUnicode_FromObject() mutated seq. */
5402 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005403 }
5404 }
5405
5406 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005407 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005408 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005409 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005410 res_p = PyUnicode_AS_UNICODE(res);
5411 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005412
Tim Peters05eba1f2004-08-27 21:32:02 +00005413 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005414 Py_ssize_t itemlen;
5415 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005416
5417 item = PySequence_Fast_GET_ITEM(fseq, i);
5418 /* Convert item to Unicode. */
Guido van Rossumf1044292007-09-27 18:01:22 +00005419 if (!PyString_Check(item) && !PyUnicode_Check(item))
5420 {
5421 if (PyBytes_Check(item))
5422 {
5423 PyErr_Format(PyExc_TypeError,
5424 "sequence item %d: join() will not operate on "
5425 "bytes objects", i);
5426 goto onError;
5427 }
5428 item = PyObject_Unicode(item);
Tim Peters8ce9f162004-08-27 01:49:32 +00005429 }
Guido van Rossumf1044292007-09-27 18:01:22 +00005430 else
5431 item = PyUnicode_FromObject(item);
5432
Tim Peters05eba1f2004-08-27 21:32:02 +00005433 if (item == NULL)
5434 goto onError;
5435 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005436
Tim Peters91879ab2004-08-27 22:35:44 +00005437 /* In case PyUnicode_FromObject() mutated seq. */
5438 seqlen = PySequence_Fast_GET_SIZE(fseq);
5439
Tim Peters8ce9f162004-08-27 01:49:32 +00005440 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005441 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005442 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005443 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005444 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005445 if (i < seqlen - 1) {
5446 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005447 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00005448 goto Overflow;
5449 }
5450 if (new_res_used > res_alloc) {
5451 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00005452 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00005453 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005454 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005455 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005456 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005457 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005458 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005459 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005460 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005461 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005462 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005463
5464 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005465 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005466 res_p += itemlen;
5467 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005468 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005469 res_p += seplen;
5470 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005471 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005472 res_used = new_res_used;
5473 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005474
Tim Peters05eba1f2004-08-27 21:32:02 +00005475 /* Shrink res to match the used area; this probably can't fail,
5476 * but it's cheap to check.
5477 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005478 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005479 goto onError;
5480
5481 Done:
5482 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005483 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005484 return (PyObject *)res;
5485
Tim Peters8ce9f162004-08-27 01:49:32 +00005486 Overflow:
5487 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005488 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005489 Py_DECREF(item);
5490 /* fall through */
5491
Guido van Rossumd57fd912000-03-10 22:53:23 +00005492 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005493 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005494 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005495 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005496 return NULL;
5497}
5498
Tim Petersced69f82003-09-16 20:30:58 +00005499static
5500PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005501 Py_ssize_t left,
5502 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005503 Py_UNICODE fill)
5504{
5505 PyUnicodeObject *u;
5506
5507 if (left < 0)
5508 left = 0;
5509 if (right < 0)
5510 right = 0;
5511
Tim Peters7a29bd52001-09-12 03:03:31 +00005512 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005513 Py_INCREF(self);
5514 return self;
5515 }
5516
5517 u = _PyUnicode_New(left + self->length + right);
5518 if (u) {
5519 if (left)
5520 Py_UNICODE_FILL(u->str, fill, left);
5521 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5522 if (right)
5523 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5524 }
5525
5526 return u;
5527}
5528
/* Append the slice data[left:right] to `list` as a new unicode object.

   The expanding function must provide a `PyObject *str` scratch variable,
   a `list` variable and an `onError` label; control jumps to onError when
   creating the slice or appending it fails.  The body is wrapped in
   do { } while (0) so that the expansion behaves as one statement. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
5539
5540static
5541PyObject *split_whitespace(PyUnicodeObject *self,
5542 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005543 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005544{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005545 register Py_ssize_t i;
5546 register Py_ssize_t j;
5547 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005548 PyObject *str;
5549
5550 for (i = j = 0; i < len; ) {
5551 /* find a token */
5552 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5553 i++;
5554 j = i;
5555 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
5556 i++;
5557 if (j < i) {
5558 if (maxcount-- <= 0)
5559 break;
5560 SPLIT_APPEND(self->str, j, i);
5561 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5562 i++;
5563 j = i;
5564 }
5565 }
5566 if (j < len) {
5567 SPLIT_APPEND(self->str, j, len);
5568 }
5569 return list;
5570
5571 onError:
5572 Py_DECREF(list);
5573 return NULL;
5574}
5575
5576PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005577 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005578{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005579 register Py_ssize_t i;
5580 register Py_ssize_t j;
5581 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005582 PyObject *list;
5583 PyObject *str;
5584 Py_UNICODE *data;
5585
5586 string = PyUnicode_FromObject(string);
5587 if (string == NULL)
5588 return NULL;
5589 data = PyUnicode_AS_UNICODE(string);
5590 len = PyUnicode_GET_SIZE(string);
5591
Guido van Rossumd57fd912000-03-10 22:53:23 +00005592 list = PyList_New(0);
5593 if (!list)
5594 goto onError;
5595
5596 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005597 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005598
Guido van Rossumd57fd912000-03-10 22:53:23 +00005599 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005600 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005601 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005602
5603 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005604 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005605 if (i < len) {
5606 if (data[i] == '\r' && i + 1 < len &&
5607 data[i+1] == '\n')
5608 i += 2;
5609 else
5610 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005611 if (keepends)
5612 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005613 }
Guido van Rossum86662912000-04-11 15:38:46 +00005614 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005615 j = i;
5616 }
5617 if (j < len) {
5618 SPLIT_APPEND(data, j, len);
5619 }
5620
5621 Py_DECREF(string);
5622 return list;
5623
5624 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005625 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005626 Py_DECREF(string);
5627 return NULL;
5628}
5629
Tim Petersced69f82003-09-16 20:30:58 +00005630static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005631PyObject *split_char(PyUnicodeObject *self,
5632 PyObject *list,
5633 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005634 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005635{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005636 register Py_ssize_t i;
5637 register Py_ssize_t j;
5638 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005639 PyObject *str;
5640
5641 for (i = j = 0; i < len; ) {
5642 if (self->str[i] == ch) {
5643 if (maxcount-- <= 0)
5644 break;
5645 SPLIT_APPEND(self->str, j, i);
5646 i = j = i + 1;
5647 } else
5648 i++;
5649 }
5650 if (j <= len) {
5651 SPLIT_APPEND(self->str, j, len);
5652 }
5653 return list;
5654
5655 onError:
5656 Py_DECREF(list);
5657 return NULL;
5658}
5659
Tim Petersced69f82003-09-16 20:30:58 +00005660static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005661PyObject *split_substring(PyUnicodeObject *self,
5662 PyObject *list,
5663 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005664 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005665{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005666 register Py_ssize_t i;
5667 register Py_ssize_t j;
5668 Py_ssize_t len = self->length;
5669 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005670 PyObject *str;
5671
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005672 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005673 if (Py_UNICODE_MATCH(self, i, substring)) {
5674 if (maxcount-- <= 0)
5675 break;
5676 SPLIT_APPEND(self->str, j, i);
5677 i = j = i + sublen;
5678 } else
5679 i++;
5680 }
5681 if (j <= len) {
5682 SPLIT_APPEND(self->str, j, len);
5683 }
5684 return list;
5685
5686 onError:
5687 Py_DECREF(list);
5688 return NULL;
5689}
5690
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005691static
5692PyObject *rsplit_whitespace(PyUnicodeObject *self,
5693 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005694 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005695{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005696 register Py_ssize_t i;
5697 register Py_ssize_t j;
5698 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005699 PyObject *str;
5700
5701 for (i = j = len - 1; i >= 0; ) {
5702 /* find a token */
5703 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5704 i--;
5705 j = i;
5706 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
5707 i--;
5708 if (j > i) {
5709 if (maxcount-- <= 0)
5710 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005711 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005712 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5713 i--;
5714 j = i;
5715 }
5716 }
5717 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005718 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005719 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005720 if (PyList_Reverse(list) < 0)
5721 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005722 return list;
5723
5724 onError:
5725 Py_DECREF(list);
5726 return NULL;
5727}
5728
5729static
5730PyObject *rsplit_char(PyUnicodeObject *self,
5731 PyObject *list,
5732 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005733 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005734{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005735 register Py_ssize_t i;
5736 register Py_ssize_t j;
5737 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005738 PyObject *str;
5739
5740 for (i = j = len - 1; i >= 0; ) {
5741 if (self->str[i] == ch) {
5742 if (maxcount-- <= 0)
5743 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005744 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005745 j = i = i - 1;
5746 } else
5747 i--;
5748 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005749 if (j >= -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005750 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005751 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005752 if (PyList_Reverse(list) < 0)
5753 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005754 return list;
5755
5756 onError:
5757 Py_DECREF(list);
5758 return NULL;
5759}
5760
5761static
5762PyObject *rsplit_substring(PyUnicodeObject *self,
5763 PyObject *list,
5764 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005765 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005766{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005767 register Py_ssize_t i;
5768 register Py_ssize_t j;
5769 Py_ssize_t len = self->length;
5770 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005771 PyObject *str;
5772
5773 for (i = len - sublen, j = len; i >= 0; ) {
5774 if (Py_UNICODE_MATCH(self, i, substring)) {
5775 if (maxcount-- <= 0)
5776 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005777 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005778 j = i;
5779 i -= sublen;
5780 } else
5781 i--;
5782 }
5783 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005784 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005785 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005786 if (PyList_Reverse(list) < 0)
5787 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005788 return list;
5789
5790 onError:
5791 Py_DECREF(list);
5792 return NULL;
5793}
5794
Guido van Rossumd57fd912000-03-10 22:53:23 +00005795#undef SPLIT_APPEND
5796
5797static
5798PyObject *split(PyUnicodeObject *self,
5799 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005800 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005801{
5802 PyObject *list;
5803
5804 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005805 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005806
5807 list = PyList_New(0);
5808 if (!list)
5809 return NULL;
5810
5811 if (substring == NULL)
5812 return split_whitespace(self,list,maxcount);
5813
5814 else if (substring->length == 1)
5815 return split_char(self,list,substring->str[0],maxcount);
5816
5817 else if (substring->length == 0) {
5818 Py_DECREF(list);
5819 PyErr_SetString(PyExc_ValueError, "empty separator");
5820 return NULL;
5821 }
5822 else
5823 return split_substring(self,list,substring,maxcount);
5824}
5825
Tim Petersced69f82003-09-16 20:30:58 +00005826static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005827PyObject *rsplit(PyUnicodeObject *self,
5828 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005829 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005830{
5831 PyObject *list;
5832
5833 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005834 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005835
5836 list = PyList_New(0);
5837 if (!list)
5838 return NULL;
5839
5840 if (substring == NULL)
5841 return rsplit_whitespace(self,list,maxcount);
5842
5843 else if (substring->length == 1)
5844 return rsplit_char(self,list,substring->str[0],maxcount);
5845
5846 else if (substring->length == 0) {
5847 Py_DECREF(list);
5848 PyErr_SetString(PyExc_ValueError, "empty separator");
5849 return NULL;
5850 }
5851 else
5852 return rsplit_substring(self,list,substring,maxcount);
5853}
5854
5855static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005856PyObject *replace(PyUnicodeObject *self,
5857 PyUnicodeObject *str1,
5858 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005859 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005860{
5861 PyUnicodeObject *u;
5862
5863 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005864 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005865
Thomas Wouters477c8d52006-05-27 19:21:47 +00005866 if (str1->length == str2->length) {
5867 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005868 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005869 if (str1->length == 1) {
5870 /* replace characters */
5871 Py_UNICODE u1, u2;
5872 if (!findchar(self->str, self->length, str1->str[0]))
5873 goto nothing;
5874 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5875 if (!u)
5876 return NULL;
5877 Py_UNICODE_COPY(u->str, self->str, self->length);
5878 u1 = str1->str[0];
5879 u2 = str2->str[0];
5880 for (i = 0; i < u->length; i++)
5881 if (u->str[i] == u1) {
5882 if (--maxcount < 0)
5883 break;
5884 u->str[i] = u2;
5885 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005886 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005887 i = fastsearch(
5888 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005889 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00005890 if (i < 0)
5891 goto nothing;
5892 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5893 if (!u)
5894 return NULL;
5895 Py_UNICODE_COPY(u->str, self->str, self->length);
5896 while (i <= self->length - str1->length)
5897 if (Py_UNICODE_MATCH(self, i, str1)) {
5898 if (--maxcount < 0)
5899 break;
5900 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5901 i += str1->length;
5902 } else
5903 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005904 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005905 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005906
5907 Py_ssize_t n, i, j, e;
5908 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005909 Py_UNICODE *p;
5910
5911 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005912 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005913 if (n > maxcount)
5914 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005915 if (n == 0)
5916 goto nothing;
5917 /* new_size = self->length + n * (str2->length - str1->length)); */
5918 delta = (str2->length - str1->length);
5919 if (delta == 0) {
5920 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005921 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005922 product = n * (str2->length - str1->length);
5923 if ((product / (str2->length - str1->length)) != n) {
5924 PyErr_SetString(PyExc_OverflowError,
5925 "replace string is too long");
5926 return NULL;
5927 }
5928 new_size = self->length + product;
5929 if (new_size < 0) {
5930 PyErr_SetString(PyExc_OverflowError,
5931 "replace string is too long");
5932 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005933 }
5934 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005935 u = _PyUnicode_New(new_size);
5936 if (!u)
5937 return NULL;
5938 i = 0;
5939 p = u->str;
5940 e = self->length - str1->length;
5941 if (str1->length > 0) {
5942 while (n-- > 0) {
5943 /* look for next match */
5944 j = i;
5945 while (j <= e) {
5946 if (Py_UNICODE_MATCH(self, j, str1))
5947 break;
5948 j++;
5949 }
5950 if (j > i) {
5951 if (j > e)
5952 break;
5953 /* copy unchanged part [i:j] */
5954 Py_UNICODE_COPY(p, self->str+i, j-i);
5955 p += j - i;
5956 }
5957 /* copy substitution string */
5958 if (str2->length > 0) {
5959 Py_UNICODE_COPY(p, str2->str, str2->length);
5960 p += str2->length;
5961 }
5962 i = j + str1->length;
5963 }
5964 if (i < self->length)
5965 /* copy tail [i:] */
5966 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5967 } else {
5968 /* interleave */
5969 while (n > 0) {
5970 Py_UNICODE_COPY(p, str2->str, str2->length);
5971 p += str2->length;
5972 if (--n <= 0)
5973 break;
5974 *p++ = self->str[i++];
5975 }
5976 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5977 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005978 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005979 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005980
5981nothing:
5982 /* nothing to replace; return original string (when possible) */
5983 if (PyUnicode_CheckExact(self)) {
5984 Py_INCREF(self);
5985 return (PyObject *) self;
5986 }
5987 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005988}
5989
5990/* --- Unicode Object Methods --------------------------------------------- */
5991
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005992PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005993"S.title() -> unicode\n\
5994\n\
5995Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005996characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005997
5998static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005999unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006000{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006001 return fixup(self, fixtitle);
6002}
6003
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006004PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006005"S.capitalize() -> unicode\n\
6006\n\
6007Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006008have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006009
6010static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006011unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006012{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006013 return fixup(self, fixcapitalize);
6014}
6015
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled: split self on whitespace, capitalize every word in place in
   the resulting list, and join the words again.  Returns NULL on failure. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (!words)
        return NULL;

    /* Capitalize each word, swapping the new object into the list slot */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized = fixup(
            (PyUnicodeObject *)PyList_GET_ITEM(words, idx), fixcapitalize);
        if (capitalized == NULL)
            goto done;
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
6053
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006054/* Argument converter. Coerces to a single unicode character */
6055
6056static int
6057convert_uc(PyObject *obj, void *addr)
6058{
6059 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6060 PyObject *uniobj;
6061 Py_UNICODE *unistr;
6062
6063 uniobj = PyUnicode_FromObject(obj);
6064 if (uniobj == NULL) {
6065 PyErr_SetString(PyExc_TypeError,
6066 "The fill character cannot be converted to Unicode");
6067 return 0;
6068 }
6069 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6070 PyErr_SetString(PyExc_TypeError,
6071 "The fill character must be exactly one character long");
6072 Py_DECREF(uniobj);
6073 return 0;
6074 }
6075 unistr = PyUnicode_AS_UNICODE(uniobj);
6076 *fillcharloc = unistr[0];
6077 Py_DECREF(uniobj);
6078 return 1;
6079}
6080
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006081PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006082"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006083\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006084Return S centered in a Unicode string of length width. Padding is\n\
6085done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006086
6087static PyObject *
6088unicode_center(PyUnicodeObject *self, PyObject *args)
6089{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006090 Py_ssize_t marg, left;
6091 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006092 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006093
Thomas Woutersde017742006-02-16 19:34:37 +00006094 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006095 return NULL;
6096
Tim Peters7a29bd52001-09-12 03:03:31 +00006097 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006098 Py_INCREF(self);
6099 return (PyObject*) self;
6100 }
6101
6102 marg = width - self->length;
6103 left = marg / 2 + (marg & width & 1);
6104
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006105 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006106}
6107
Marc-André Lemburge5034372000-08-08 08:04:29 +00006108#if 0
6109
6110/* This code should go into some future Unicode collation support
6111 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006112 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006113
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006114/* speedy UTF-16 code point order comparison */
6115/* gleaned from: */
6116/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6117
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006118static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006119{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006120 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006121 0, 0, 0, 0, 0, 0, 0, 0,
6122 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006123 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006124};
6125
Guido van Rossumd57fd912000-03-10 22:53:23 +00006126static int
6127unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6128{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006129 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006130
Guido van Rossumd57fd912000-03-10 22:53:23 +00006131 Py_UNICODE *s1 = str1->str;
6132 Py_UNICODE *s2 = str2->str;
6133
6134 len1 = str1->length;
6135 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006136
Guido van Rossumd57fd912000-03-10 22:53:23 +00006137 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006138 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006139
6140 c1 = *s1++;
6141 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006142
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006143 if (c1 > (1<<11) * 26)
6144 c1 += utf16Fixup[c1>>11];
6145 if (c2 > (1<<11) * 26)
6146 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006147 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006148
6149 if (c1 != c2)
6150 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006151
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006152 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006153 }
6154
6155 return (len1 < len2) ? -1 : (len1 != len2);
6156}
6157
Marc-André Lemburge5034372000-08-08 08:04:29 +00006158#else
6159
6160static int
6161unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6162{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006163 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006164
6165 Py_UNICODE *s1 = str1->str;
6166 Py_UNICODE *s2 = str2->str;
6167
6168 len1 = str1->length;
6169 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006170
Marc-André Lemburge5034372000-08-08 08:04:29 +00006171 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006172 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006173
Fredrik Lundh45714e92001-06-26 16:39:36 +00006174 c1 = *s1++;
6175 c2 = *s2++;
6176
6177 if (c1 != c2)
6178 return (c1 < c2) ? -1 : 1;
6179
Marc-André Lemburge5034372000-08-08 08:04:29 +00006180 len1--; len2--;
6181 }
6182
6183 return (len1 < len2) ? -1 : (len1 != len2);
6184}
6185
6186#endif
6187
Guido van Rossumd57fd912000-03-10 22:53:23 +00006188int PyUnicode_Compare(PyObject *left,
6189 PyObject *right)
6190{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006191 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6192 return unicode_compare((PyUnicodeObject *)left,
6193 (PyUnicodeObject *)right);
6194 if ((PyString_Check(left) && PyUnicode_Check(right)) ||
6195 (PyUnicode_Check(left) && PyString_Check(right))) {
6196 if (PyUnicode_Check(left))
6197 left = _PyUnicode_AsDefaultEncodedString(left, NULL);
6198 if (PyUnicode_Check(right))
6199 right = _PyUnicode_AsDefaultEncodedString(right, NULL);
6200 assert(PyString_Check(left));
6201 assert(PyString_Check(right));
6202 return PyObject_Compare(left, right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006203 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006204 PyErr_Format(PyExc_TypeError,
6205 "Can't compare %.100s and %.100s",
6206 left->ob_type->tp_name,
6207 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006208 return -1;
6209}
6210
Martin v. Löwis5b222132007-06-10 09:51:05 +00006211int
6212PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6213{
6214 int i;
6215 Py_UNICODE *id;
6216 assert(PyUnicode_Check(uni));
6217 id = PyUnicode_AS_UNICODE(uni);
6218 /* Compare Unicode string and source character set string */
6219 for (i = 0; id[i] && str[i]; i++)
6220 if (id[i] != str[i])
6221 return ((int)id[i] < (int)str[i]) ? -1 : 1;
6222 if (id[i])
6223 return 1; /* uni is longer */
6224 if (str[i])
6225 return -1; /* str is longer */
6226 return 0;
6227}
6228
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006229PyObject *PyUnicode_RichCompare(PyObject *left,
6230 PyObject *right,
6231 int op)
6232{
6233 int result;
6234
6235 result = PyUnicode_Compare(left, right);
6236 if (result == -1 && PyErr_Occurred())
6237 goto onError;
6238
6239 /* Convert the return value to a Boolean */
6240 switch (op) {
6241 case Py_EQ:
6242 result = (result == 0);
6243 break;
6244 case Py_NE:
6245 result = (result != 0);
6246 break;
6247 case Py_LE:
6248 result = (result <= 0);
6249 break;
6250 case Py_GE:
6251 result = (result >= 0);
6252 break;
6253 case Py_LT:
6254 result = (result == -1);
6255 break;
6256 case Py_GT:
6257 result = (result == 1);
6258 break;
6259 }
6260 return PyBool_FromLong(result);
6261
6262 onError:
6263
6264 /* Standard case
6265
6266 Type errors mean that PyUnicode_FromObject() could not convert
6267 one of the arguments (usually the right hand side) to Unicode,
6268 ie. we can't handle the comparison request. However, it is
6269 possible that the other object knows a comparison method, which
6270 is why we return Py_NotImplemented to give the other object a
6271 chance.
6272
6273 */
6274 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6275 PyErr_Clear();
6276 Py_INCREF(Py_NotImplemented);
6277 return Py_NotImplemented;
6278 }
6279 if (op != Py_EQ && op != Py_NE)
6280 return NULL;
6281
6282 /* Equality comparison.
6283
6284 This is a special case: we silence any PyExc_UnicodeDecodeError
6285 and instead turn it into a PyErr_UnicodeWarning.
6286
6287 */
6288 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6289 return NULL;
6290 PyErr_Clear();
Skip Montanaro46fc3372007-08-12 11:44:53 +00006291 if (PyErr_WarnEx(PyExc_UnicodeWarning,
6292 (op == Py_EQ) ?
6293 "Unicode equal comparison "
6294 "failed to convert both arguments to Unicode - "
6295 "interpreting them as being unequal"
6296 :
6297 "Unicode unequal comparison "
6298 "failed to convert both arguments to Unicode - "
6299 "interpreting them as being unequal",
6300 1) < 0)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006301 return NULL;
6302 result = (op == Py_NE);
6303 return PyBool_FromLong(result);
6304}
6305
Guido van Rossum403d68b2000-03-13 15:55:09 +00006306int PyUnicode_Contains(PyObject *container,
6307 PyObject *element)
6308{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006309 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006310 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006311
6312 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006313 sub = PyUnicode_FromObject(element);
6314 if (!sub) {
Walter Dörwald26e0f512007-06-12 16:51:31 +00006315 PyErr_Format(PyExc_TypeError,
6316 "'in <string>' requires string as left operand, not %s",
6317 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006318 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006319 }
6320
Thomas Wouters477c8d52006-05-27 19:21:47 +00006321 str = PyUnicode_FromObject(container);
6322 if (!str) {
6323 Py_DECREF(sub);
6324 return -1;
6325 }
6326
6327 result = stringlib_contains_obj(str, sub);
6328
6329 Py_DECREF(str);
6330 Py_DECREF(sub);
6331
Guido van Rossum403d68b2000-03-13 15:55:09 +00006332 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006333}
6334
Guido van Rossumd57fd912000-03-10 22:53:23 +00006335/* Concat to string or Unicode object giving a new Unicode object. */
6336
6337PyObject *PyUnicode_Concat(PyObject *left,
6338 PyObject *right)
6339{
6340 PyUnicodeObject *u = NULL, *v = NULL, *w;
6341
Guido van Rossum84d79dd2007-04-13 02:23:57 +00006342 if (PyBytes_Check(left) || PyBytes_Check(right))
6343 return PyBytes_Concat(left, right);
6344
Guido van Rossumd57fd912000-03-10 22:53:23 +00006345 /* Coerce the two arguments */
6346 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6347 if (u == NULL)
6348 goto onError;
6349 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6350 if (v == NULL)
6351 goto onError;
6352
6353 /* Shortcuts */
6354 if (v == unicode_empty) {
6355 Py_DECREF(v);
6356 return (PyObject *)u;
6357 }
6358 if (u == unicode_empty) {
6359 Py_DECREF(u);
6360 return (PyObject *)v;
6361 }
6362
6363 /* Concat the two Unicode strings */
6364 w = _PyUnicode_New(u->length + v->length);
6365 if (w == NULL)
6366 goto onError;
6367 Py_UNICODE_COPY(w->str, u->str, u->length);
6368 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6369
6370 Py_DECREF(u);
6371 Py_DECREF(v);
6372 return (PyObject *)w;
6373
6374onError:
6375 Py_XDECREF(u);
6376 Py_XDECREF(v);
6377 return NULL;
6378}
6379
Walter Dörwald1ab83302007-05-18 17:15:44 +00006380void
6381PyUnicode_Append(PyObject **pleft, PyObject *right)
6382{
6383 PyObject *new;
6384 if (*pleft == NULL)
6385 return;
6386 if (right == NULL || !PyUnicode_Check(*pleft)) {
6387 Py_DECREF(*pleft);
6388 *pleft = NULL;
6389 return;
6390 }
6391 new = PyUnicode_Concat(*pleft, right);
6392 Py_DECREF(*pleft);
6393 *pleft = new;
6394}
6395
6396void
6397PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6398{
6399 PyUnicode_Append(pleft, right);
6400 Py_XDECREF(right);
6401}
6402
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006403PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006404"S.count(sub[, start[, end]]) -> int\n\
6405\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006406Return the number of non-overlapping occurrences of substring sub in\n\
6407Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006408interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006409
6410static PyObject *
6411unicode_count(PyUnicodeObject *self, PyObject *args)
6412{
6413 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006414 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006415 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006416 PyObject *result;
6417
Guido van Rossumb8872e62000-05-09 14:14:27 +00006418 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6419 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006420 return NULL;
6421
6422 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006423 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006424 if (substring == NULL)
6425 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006426
Thomas Wouters477c8d52006-05-27 19:21:47 +00006427 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006428
Thomas Wouters477c8d52006-05-27 19:21:47 +00006429 result = PyInt_FromSsize_t(
6430 stringlib_count(self->str + start, end - start,
6431 substring->str, substring->length)
6432 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006433
6434 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006435
Guido van Rossumd57fd912000-03-10 22:53:23 +00006436 return result;
6437}
6438
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006439PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006440"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006441\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006442Encodes S using the codec registered for encoding. encoding defaults\n\
6443to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006444handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006445a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6446'xmlcharrefreplace' as well as any other name registered with\n\
6447codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006448
6449static PyObject *
6450unicode_encode(PyUnicodeObject *self, PyObject *args)
6451{
6452 char *encoding = NULL;
6453 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006454 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00006455
Guido van Rossumd57fd912000-03-10 22:53:23 +00006456 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6457 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006458 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006459 if (v == NULL)
6460 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006461 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006462 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006463 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006464 "(type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00006465 Py_Type(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006466 Py_DECREF(v);
6467 return NULL;
6468 }
6469 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006470
6471 onError:
6472 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006473}
6474
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006475PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006476"S.expandtabs([tabsize]) -> unicode\n\
6477\n\
6478Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006479If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006480
6481static PyObject*
6482unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6483{
6484 Py_UNICODE *e;
6485 Py_UNICODE *p;
6486 Py_UNICODE *q;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006487 Py_ssize_t i, j, old_j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006488 PyUnicodeObject *u;
6489 int tabsize = 8;
6490
6491 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6492 return NULL;
6493
Thomas Wouters7e474022000-07-16 12:04:32 +00006494 /* First pass: determine size of output string */
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006495 i = j = old_j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006496 e = self->str + self->length;
6497 for (p = self->str; p < e; p++)
6498 if (*p == '\t') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006499 if (tabsize > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006500 j += tabsize - (j % tabsize);
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006501 if (old_j > j) {
6502 PyErr_SetString(PyExc_OverflowError,
6503 "new string is too long");
6504 return NULL;
6505 }
6506 old_j = j;
6507 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006508 }
6509 else {
6510 j++;
6511 if (*p == '\n' || *p == '\r') {
6512 i += j;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006513 old_j = j = 0;
6514 if (i < 0) {
6515 PyErr_SetString(PyExc_OverflowError,
6516 "new string is too long");
6517 return NULL;
6518 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006519 }
6520 }
6521
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006522 if ((i + j) < 0) {
6523 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6524 return NULL;
6525 }
6526
Guido van Rossumd57fd912000-03-10 22:53:23 +00006527 /* Second pass: create output string and fill it */
6528 u = _PyUnicode_New(i + j);
6529 if (!u)
6530 return NULL;
6531
6532 j = 0;
6533 q = u->str;
6534
6535 for (p = self->str; p < e; p++)
6536 if (*p == '\t') {
6537 if (tabsize > 0) {
6538 i = tabsize - (j % tabsize);
6539 j += i;
6540 while (i--)
6541 *q++ = ' ';
6542 }
6543 }
6544 else {
6545 j++;
6546 *q++ = *p;
6547 if (*p == '\n' || *p == '\r')
6548 j = 0;
6549 }
6550
6551 return (PyObject*) u;
6552}
6553
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006554PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006555"S.find(sub [,start [,end]]) -> int\n\
6556\n\
6557Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00006558such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006559arguments start and end are interpreted as in slice notation.\n\
6560\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006561Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006562
6563static PyObject *
6564unicode_find(PyUnicodeObject *self, PyObject *args)
6565{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006566 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006567 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006568 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006569 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006570
Guido van Rossumb8872e62000-05-09 14:14:27 +00006571 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
6572 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006573 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006574 substring = PyUnicode_FromObject(substring);
6575 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006576 return NULL;
6577
Thomas Wouters477c8d52006-05-27 19:21:47 +00006578 result = stringlib_find_slice(
6579 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6580 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6581 start, end
6582 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006583
6584 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006585
6586 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006587}
6588
6589static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006590unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006591{
6592 if (index < 0 || index >= self->length) {
6593 PyErr_SetString(PyExc_IndexError, "string index out of range");
6594 return NULL;
6595 }
6596
6597 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6598}
6599
Guido van Rossumc2504932007-09-18 19:42:40 +00006600/* Believe it or not, this produces the same value for ASCII strings
6601 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006602static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00006603unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006604{
Guido van Rossumc2504932007-09-18 19:42:40 +00006605 Py_ssize_t len;
6606 Py_UNICODE *p;
6607 long x;
6608
6609 if (self->hash != -1)
6610 return self->hash;
6611 len = Py_Size(self);
6612 p = self->str;
6613 x = *p << 7;
6614 while (--len >= 0)
6615 x = (1000003*x) ^ *p++;
6616 x ^= Py_Size(self);
6617 if (x == -1)
6618 x = -2;
6619 self->hash = x;
6620 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006621}
6622
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006623PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006624"S.index(sub [,start [,end]]) -> int\n\
6625\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006626Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006627
6628static PyObject *
6629unicode_index(PyUnicodeObject *self, PyObject *args)
6630{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006631 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006632 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006633 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006634 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006635
Guido van Rossumb8872e62000-05-09 14:14:27 +00006636 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
6637 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006638 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006639 substring = PyUnicode_FromObject(substring);
6640 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006641 return NULL;
6642
Thomas Wouters477c8d52006-05-27 19:21:47 +00006643 result = stringlib_find_slice(
6644 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6645 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6646 start, end
6647 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006648
6649 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006650
Guido van Rossumd57fd912000-03-10 22:53:23 +00006651 if (result < 0) {
6652 PyErr_SetString(PyExc_ValueError, "substring not found");
6653 return NULL;
6654 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006655
Martin v. Löwis18e16552006-02-15 17:27:45 +00006656 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006657}
6658
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006659PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006660"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006661\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006662Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006663at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006664
6665static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006666unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006667{
6668 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6669 register const Py_UNICODE *e;
6670 int cased;
6671
Guido van Rossumd57fd912000-03-10 22:53:23 +00006672 /* Shortcut for single character strings */
6673 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006674 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006675
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006676 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006677 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006678 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006679
Guido van Rossumd57fd912000-03-10 22:53:23 +00006680 e = p + PyUnicode_GET_SIZE(self);
6681 cased = 0;
6682 for (; p < e; p++) {
6683 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006684
Guido van Rossumd57fd912000-03-10 22:53:23 +00006685 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006686 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006687 else if (!cased && Py_UNICODE_ISLOWER(ch))
6688 cased = 1;
6689 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006690 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006691}
6692
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006693PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006694"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006695\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006696Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006697at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006698
6699static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006700unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006701{
6702 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6703 register const Py_UNICODE *e;
6704 int cased;
6705
Guido van Rossumd57fd912000-03-10 22:53:23 +00006706 /* Shortcut for single character strings */
6707 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006708 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006709
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006710 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006711 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006712 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006713
Guido van Rossumd57fd912000-03-10 22:53:23 +00006714 e = p + PyUnicode_GET_SIZE(self);
6715 cased = 0;
6716 for (; p < e; p++) {
6717 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006718
Guido van Rossumd57fd912000-03-10 22:53:23 +00006719 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006720 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006721 else if (!cased && Py_UNICODE_ISUPPER(ch))
6722 cased = 1;
6723 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006724 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006725}
6726
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006727PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006728"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006729\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006730Return True if S is a titlecased string and there is at least one\n\
6731character in S, i.e. upper- and titlecase characters may only\n\
6732follow uncased characters and lowercase characters only cased ones.\n\
6733Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006734
6735static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006736unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006737{
6738 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6739 register const Py_UNICODE *e;
6740 int cased, previous_is_cased;
6741
Guido van Rossumd57fd912000-03-10 22:53:23 +00006742 /* Shortcut for single character strings */
6743 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006744 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6745 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006746
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006747 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006748 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006749 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006750
Guido van Rossumd57fd912000-03-10 22:53:23 +00006751 e = p + PyUnicode_GET_SIZE(self);
6752 cased = 0;
6753 previous_is_cased = 0;
6754 for (; p < e; p++) {
6755 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006756
Guido van Rossumd57fd912000-03-10 22:53:23 +00006757 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6758 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006759 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006760 previous_is_cased = 1;
6761 cased = 1;
6762 }
6763 else if (Py_UNICODE_ISLOWER(ch)) {
6764 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006765 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006766 previous_is_cased = 1;
6767 cased = 1;
6768 }
6769 else
6770 previous_is_cased = 0;
6771 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006772 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006773}
6774
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006775PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006776"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006777\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006778Return True if all characters in S are whitespace\n\
6779and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006780
6781static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006782unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006783{
6784 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6785 register const Py_UNICODE *e;
6786
Guido van Rossumd57fd912000-03-10 22:53:23 +00006787 /* Shortcut for single character strings */
6788 if (PyUnicode_GET_SIZE(self) == 1 &&
6789 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006790 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006791
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006792 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006793 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006794 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006795
Guido van Rossumd57fd912000-03-10 22:53:23 +00006796 e = p + PyUnicode_GET_SIZE(self);
6797 for (; p < e; p++) {
6798 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006799 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006800 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006801 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006802}
6803
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006804PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006805"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006806\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006807Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006808and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006809
6810static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006811unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006812{
6813 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6814 register const Py_UNICODE *e;
6815
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006816 /* Shortcut for single character strings */
6817 if (PyUnicode_GET_SIZE(self) == 1 &&
6818 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006819 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006820
6821 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006822 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006823 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006824
6825 e = p + PyUnicode_GET_SIZE(self);
6826 for (; p < e; p++) {
6827 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006828 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006829 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006830 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006831}
6832
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006833PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006834"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006835\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006836Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006837and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006838
6839static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006840unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006841{
6842 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6843 register const Py_UNICODE *e;
6844
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006845 /* Shortcut for single character strings */
6846 if (PyUnicode_GET_SIZE(self) == 1 &&
6847 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006848 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006849
6850 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006851 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006852 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006853
6854 e = p + PyUnicode_GET_SIZE(self);
6855 for (; p < e; p++) {
6856 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006857 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006858 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006859 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006860}
6861
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006862PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006863"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006864\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006865Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006866False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006867
6868static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006869unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006870{
6871 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6872 register const Py_UNICODE *e;
6873
Guido van Rossumd57fd912000-03-10 22:53:23 +00006874 /* Shortcut for single character strings */
6875 if (PyUnicode_GET_SIZE(self) == 1 &&
6876 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006877 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006878
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006879 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006880 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006881 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006882
Guido van Rossumd57fd912000-03-10 22:53:23 +00006883 e = p + PyUnicode_GET_SIZE(self);
6884 for (; p < e; p++) {
6885 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006886 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006887 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006888 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006889}
6890
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006891PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006892"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006893\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006894Return True if all characters in S are digits\n\
6895and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006896
6897static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006898unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006899{
6900 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6901 register const Py_UNICODE *e;
6902
Guido van Rossumd57fd912000-03-10 22:53:23 +00006903 /* Shortcut for single character strings */
6904 if (PyUnicode_GET_SIZE(self) == 1 &&
6905 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006906 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006907
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006908 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006909 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006910 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006911
Guido van Rossumd57fd912000-03-10 22:53:23 +00006912 e = p + PyUnicode_GET_SIZE(self);
6913 for (; p < e; p++) {
6914 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006915 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006916 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006917 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006918}
6919
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006920PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006921"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006922\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006923Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006924False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006925
6926static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006927unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006928{
6929 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6930 register const Py_UNICODE *e;
6931
Guido van Rossumd57fd912000-03-10 22:53:23 +00006932 /* Shortcut for single character strings */
6933 if (PyUnicode_GET_SIZE(self) == 1 &&
6934 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006935 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006936
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006937 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006938 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006939 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006940
Guido van Rossumd57fd912000-03-10 22:53:23 +00006941 e = p + PyUnicode_GET_SIZE(self);
6942 for (; p < e; p++) {
6943 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006944 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006945 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006946 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006947}
6948
Martin v. Löwis47383402007-08-15 07:32:56 +00006949int
6950PyUnicode_IsIdentifier(PyObject *self)
6951{
6952 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
6953 register const Py_UNICODE *e;
6954
6955 /* Special case for empty strings */
6956 if (PyUnicode_GET_SIZE(self) == 0)
6957 return 0;
6958
6959 /* PEP 3131 says that the first character must be in
6960 XID_Start and subsequent characters in XID_Continue,
6961 and for the ASCII range, the 2.x rules apply (i.e
6962 start with letters and underscore, continue with
6963 letters, digits, underscore). However, given the current
6964 definition of XID_Start and XID_Continue, it is sufficient
6965 to check just for these, except that _ must be allowed
6966 as starting an identifier. */
6967 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
6968 return 0;
6969
6970 e = p + PyUnicode_GET_SIZE(self);
6971 for (p++; p < e; p++) {
6972 if (!_PyUnicode_IsXidContinue(*p))
6973 return 0;
6974 }
6975 return 1;
6976}
6977
6978PyDoc_STRVAR(isidentifier__doc__,
6979"S.isidentifier() -> bool\n\
6980\n\
6981Return True if S is a valid identifier according\n\
6982to the language definition.");
6983
6984static PyObject*
6985unicode_isidentifier(PyObject *self)
6986{
6987 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
6988}
6989
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006990PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006991"S.join(sequence) -> unicode\n\
6992\n\
6993Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006994sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006995
6996static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006997unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006998{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006999 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007000}
7001
Martin v. Löwis18e16552006-02-15 17:27:45 +00007002static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007003unicode_length(PyUnicodeObject *self)
7004{
7005 return self->length;
7006}
7007
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007008PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00007009"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007010\n\
7011Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007012done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007013
7014static PyObject *
7015unicode_ljust(PyUnicodeObject *self, PyObject *args)
7016{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007017 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007018 Py_UNICODE fillchar = ' ';
7019
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007020 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007021 return NULL;
7022
Tim Peters7a29bd52001-09-12 03:03:31 +00007023 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007024 Py_INCREF(self);
7025 return (PyObject*) self;
7026 }
7027
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007028 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007029}
7030
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007031PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007032"S.lower() -> unicode\n\
7033\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007034Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007035
7036static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007037unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007038{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007039 return fixup(self, fixlower);
7040}
7041
/* Strip modes; they double as indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, one per strip mode */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string past its "|O:" prefix */
#define STRIPNAME(i) (stripformat[(i)] + 3)
7050
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007051/* externally visible for str.strip(unicode) */
7052PyObject *
7053_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7054{
7055 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007056 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007057 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007058 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7059 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007060
Thomas Wouters477c8d52006-05-27 19:21:47 +00007061 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7062
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007063 i = 0;
7064 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007065 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7066 i++;
7067 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007068 }
7069
7070 j = len;
7071 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007072 do {
7073 j--;
7074 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7075 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007076 }
7077
7078 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007079 Py_INCREF(self);
7080 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007081 }
7082 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00007083 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007084}
7085
Guido van Rossumd57fd912000-03-10 22:53:23 +00007086
7087static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007088do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007089{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007090 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007091 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007092
7093 i = 0;
7094 if (striptype != RIGHTSTRIP) {
7095 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7096 i++;
7097 }
7098 }
7099
7100 j = len;
7101 if (striptype != LEFTSTRIP) {
7102 do {
7103 j--;
7104 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7105 j++;
7106 }
7107
7108 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7109 Py_INCREF(self);
7110 return (PyObject*)self;
7111 }
7112 else
7113 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007114}
7115
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007116
7117static PyObject *
7118do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7119{
7120 PyObject *sep = NULL;
7121
7122 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7123 return NULL;
7124
7125 if (sep != NULL && sep != Py_None) {
7126 if (PyUnicode_Check(sep))
7127 return _PyUnicode_XStrip(self, striptype, sep);
7128 else if (PyString_Check(sep)) {
7129 PyObject *res;
7130 sep = PyUnicode_FromObject(sep);
7131 if (sep==NULL)
7132 return NULL;
7133 res = _PyUnicode_XStrip(self, striptype, sep);
7134 Py_DECREF(sep);
7135 return res;
7136 }
7137 else {
7138 PyErr_Format(PyExc_TypeError,
7139 "%s arg must be None, unicode or str",
7140 STRIPNAME(striptype));
7141 return NULL;
7142 }
7143 }
7144
7145 return do_strip(self, striptype);
7146}
7147
7148
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007149PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007150"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007151\n\
7152Return a copy of the string S with leading and trailing\n\
7153whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007154If chars is given and not None, remove characters in chars instead.\n\
7155If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007156
7157static PyObject *
7158unicode_strip(PyUnicodeObject *self, PyObject *args)
7159{
7160 if (PyTuple_GET_SIZE(args) == 0)
7161 return do_strip(self, BOTHSTRIP); /* Common case */
7162 else
7163 return do_argstrip(self, BOTHSTRIP, args);
7164}
7165
7166
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007167PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007168"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007169\n\
7170Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007171If chars is given and not None, remove characters in chars instead.\n\
7172If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007173
7174static PyObject *
7175unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7176{
7177 if (PyTuple_GET_SIZE(args) == 0)
7178 return do_strip(self, LEFTSTRIP); /* Common case */
7179 else
7180 return do_argstrip(self, LEFTSTRIP, args);
7181}
7182
7183
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007184PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007185"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007186\n\
7187Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007188If chars is given and not None, remove characters in chars instead.\n\
7189If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007190
7191static PyObject *
7192unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7193{
7194 if (PyTuple_GET_SIZE(args) == 0)
7195 return do_strip(self, RIGHTSTRIP); /* Common case */
7196 else
7197 return do_argstrip(self, RIGHTSTRIP, args);
7198}
7199
7200
Guido van Rossumd57fd912000-03-10 22:53:23 +00007201static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007202unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007203{
7204 PyUnicodeObject *u;
7205 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007206 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007207 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007208
7209 if (len < 0)
7210 len = 0;
7211
Tim Peters7a29bd52001-09-12 03:03:31 +00007212 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007213 /* no repeat, return original string */
7214 Py_INCREF(str);
7215 return (PyObject*) str;
7216 }
Tim Peters8f422462000-09-09 06:13:41 +00007217
7218 /* ensure # of chars needed doesn't overflow int and # of bytes
7219 * needed doesn't overflow size_t
7220 */
7221 nchars = len * str->length;
7222 if (len && nchars / len != str->length) {
7223 PyErr_SetString(PyExc_OverflowError,
7224 "repeated string is too long");
7225 return NULL;
7226 }
7227 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7228 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7229 PyErr_SetString(PyExc_OverflowError,
7230 "repeated string is too long");
7231 return NULL;
7232 }
7233 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007234 if (!u)
7235 return NULL;
7236
7237 p = u->str;
7238
Thomas Wouters477c8d52006-05-27 19:21:47 +00007239 if (str->length == 1 && len > 0) {
7240 Py_UNICODE_FILL(p, str->str[0], len);
7241 } else {
7242 Py_ssize_t done = 0; /* number of characters copied this far */
7243 if (done < nchars) {
7244 Py_UNICODE_COPY(p, str->str, str->length);
7245 done = str->length;
7246 }
7247 while (done < nchars) {
7248 int n = (done <= nchars-done) ? done : nchars-done;
7249 Py_UNICODE_COPY(p+done, p, n);
7250 done += n;
7251 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007252 }
7253
7254 return (PyObject*) u;
7255}
7256
7257PyObject *PyUnicode_Replace(PyObject *obj,
7258 PyObject *subobj,
7259 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007260 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007261{
7262 PyObject *self;
7263 PyObject *str1;
7264 PyObject *str2;
7265 PyObject *result;
7266
7267 self = PyUnicode_FromObject(obj);
7268 if (self == NULL)
7269 return NULL;
7270 str1 = PyUnicode_FromObject(subobj);
7271 if (str1 == NULL) {
7272 Py_DECREF(self);
7273 return NULL;
7274 }
7275 str2 = PyUnicode_FromObject(replobj);
7276 if (str2 == NULL) {
7277 Py_DECREF(self);
7278 Py_DECREF(str1);
7279 return NULL;
7280 }
Tim Petersced69f82003-09-16 20:30:58 +00007281 result = replace((PyUnicodeObject *)self,
7282 (PyUnicodeObject *)str1,
7283 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007284 maxcount);
7285 Py_DECREF(self);
7286 Py_DECREF(str1);
7287 Py_DECREF(str2);
7288 return result;
7289}
7290
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007291PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007292"S.replace (old, new[, maxsplit]) -> unicode\n\
7293\n\
7294Return a copy of S with all occurrences of substring\n\
7295old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007296given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007297
7298static PyObject*
7299unicode_replace(PyUnicodeObject *self, PyObject *args)
7300{
7301 PyUnicodeObject *str1;
7302 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007303 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007304 PyObject *result;
7305
Martin v. Löwis18e16552006-02-15 17:27:45 +00007306 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007307 return NULL;
7308 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7309 if (str1 == NULL)
7310 return NULL;
7311 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007312 if (str2 == NULL) {
7313 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007314 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007315 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007316
7317 result = replace(self, str1, str2, maxcount);
7318
7319 Py_DECREF(str1);
7320 Py_DECREF(str2);
7321 return result;
7322}
7323
7324static
7325PyObject *unicode_repr(PyObject *unicode)
7326{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007327 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007328 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007329 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7330 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7331
7332 /* XXX(nnorwitz): rather than over-allocating, it would be
7333 better to choose a different scheme. Perhaps scan the
7334 first N-chars of the string and allocate based on that size.
7335 */
7336 /* Initial allocation is based on the longest-possible unichr
7337 escape.
7338
7339 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7340 unichr, so in this case it's the longest unichr escape. In
7341 narrow (UTF-16) builds this is five chars per source unichr
7342 since there are two unichrs in the surrogate pair, so in narrow
7343 (UTF-16) builds it's not the longest unichr escape.
7344
7345 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7346 so in the narrow (UTF-16) build case it's the longest unichr
7347 escape.
7348 */
7349
Walter Dörwald1ab83302007-05-18 17:15:44 +00007350 repr = PyUnicode_FromUnicode(NULL,
Walter Dörwald79e913e2007-05-12 11:08:06 +00007351 2 /* quotes */
7352#ifdef Py_UNICODE_WIDE
7353 + 10*size
7354#else
7355 + 6*size
7356#endif
7357 + 1);
7358 if (repr == NULL)
7359 return NULL;
7360
Walter Dörwald1ab83302007-05-18 17:15:44 +00007361 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007362
7363 /* Add quote */
7364 *p++ = (findchar(s, size, '\'') &&
7365 !findchar(s, size, '"')) ? '"' : '\'';
7366 while (size-- > 0) {
7367 Py_UNICODE ch = *s++;
7368
7369 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007370 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007371 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007372 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007373 continue;
7374 }
7375
7376#ifdef Py_UNICODE_WIDE
7377 /* Map 21-bit characters to '\U00xxxxxx' */
7378 else if (ch >= 0x10000) {
7379 *p++ = '\\';
7380 *p++ = 'U';
7381 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
7382 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
7383 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
7384 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
7385 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
7386 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
7387 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
7388 *p++ = hexdigits[ch & 0x0000000F];
7389 continue;
7390 }
7391#else
7392 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
7393 else if (ch >= 0xD800 && ch < 0xDC00) {
7394 Py_UNICODE ch2;
7395 Py_UCS4 ucs;
7396
7397 ch2 = *s++;
7398 size--;
7399 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
7400 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
7401 *p++ = '\\';
7402 *p++ = 'U';
7403 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7404 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7405 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7406 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7407 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7408 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7409 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7410 *p++ = hexdigits[ucs & 0x0000000F];
7411 continue;
7412 }
7413 /* Fall through: isolated surrogates are copied as-is */
7414 s--;
7415 size++;
7416 }
7417#endif
7418
7419 /* Map 16-bit characters to '\uxxxx' */
7420 if (ch >= 256) {
7421 *p++ = '\\';
7422 *p++ = 'u';
7423 *p++ = hexdigits[(ch >> 12) & 0x000F];
7424 *p++ = hexdigits[(ch >> 8) & 0x000F];
7425 *p++ = hexdigits[(ch >> 4) & 0x000F];
7426 *p++ = hexdigits[ch & 0x000F];
7427 }
7428
7429 /* Map special whitespace to '\t', \n', '\r' */
7430 else if (ch == '\t') {
7431 *p++ = '\\';
7432 *p++ = 't';
7433 }
7434 else if (ch == '\n') {
7435 *p++ = '\\';
7436 *p++ = 'n';
7437 }
7438 else if (ch == '\r') {
7439 *p++ = '\\';
7440 *p++ = 'r';
7441 }
7442
7443 /* Map non-printable US ASCII to '\xhh' */
7444 else if (ch < ' ' || ch >= 0x7F) {
7445 *p++ = '\\';
7446 *p++ = 'x';
7447 *p++ = hexdigits[(ch >> 4) & 0x000F];
7448 *p++ = hexdigits[ch & 0x000F];
7449 }
7450
7451 /* Copy everything else as-is */
7452 else
7453 *p++ = (char) ch;
7454 }
7455 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007456 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00007457
7458 *p = '\0';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007459 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00007460 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007461}
7462
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007463PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007464"S.rfind(sub [,start [,end]]) -> int\n\
7465\n\
7466Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007467such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007468arguments start and end are interpreted as in slice notation.\n\
7469\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007470Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007471
7472static PyObject *
7473unicode_rfind(PyUnicodeObject *self, PyObject *args)
7474{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007475 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007476 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007477 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007478 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007479
Guido van Rossumb8872e62000-05-09 14:14:27 +00007480 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
7481 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007482 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007483 substring = PyUnicode_FromObject(substring);
7484 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007485 return NULL;
7486
Thomas Wouters477c8d52006-05-27 19:21:47 +00007487 result = stringlib_rfind_slice(
7488 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7489 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7490 start, end
7491 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007492
7493 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007494
7495 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007496}
7497
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007498PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007499"S.rindex(sub [,start [,end]]) -> int\n\
7500\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007501Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007502
7503static PyObject *
7504unicode_rindex(PyUnicodeObject *self, PyObject *args)
7505{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007506 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007507 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007508 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007509 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007510
Guido van Rossumb8872e62000-05-09 14:14:27 +00007511 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
7512 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007513 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007514 substring = PyUnicode_FromObject(substring);
7515 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007516 return NULL;
7517
Thomas Wouters477c8d52006-05-27 19:21:47 +00007518 result = stringlib_rfind_slice(
7519 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7520 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7521 start, end
7522 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007523
7524 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007525
Guido van Rossumd57fd912000-03-10 22:53:23 +00007526 if (result < 0) {
7527 PyErr_SetString(PyExc_ValueError, "substring not found");
7528 return NULL;
7529 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007530 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007531}
7532
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007533PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007534"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007535\n\
7536Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007537done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007538
7539static PyObject *
7540unicode_rjust(PyUnicodeObject *self, PyObject *args)
7541{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007542 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007543 Py_UNICODE fillchar = ' ';
7544
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007545 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007546 return NULL;
7547
Tim Peters7a29bd52001-09-12 03:03:31 +00007548 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007549 Py_INCREF(self);
7550 return (PyObject*) self;
7551 }
7552
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007553 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007554}
7555
Guido van Rossumd57fd912000-03-10 22:53:23 +00007556PyObject *PyUnicode_Split(PyObject *s,
7557 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007558 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007559{
7560 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007561
Guido van Rossumd57fd912000-03-10 22:53:23 +00007562 s = PyUnicode_FromObject(s);
7563 if (s == NULL)
7564 return NULL;
7565 if (sep != NULL) {
7566 sep = PyUnicode_FromObject(sep);
7567 if (sep == NULL) {
7568 Py_DECREF(s);
7569 return NULL;
7570 }
7571 }
7572
7573 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7574
7575 Py_DECREF(s);
7576 Py_XDECREF(sep);
7577 return result;
7578}
7579
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007580PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007581"S.split([sep [,maxsplit]]) -> list of strings\n\
7582\n\
7583Return a list of the words in S, using sep as the\n\
7584delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00007585splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00007586any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007587
7588static PyObject*
7589unicode_split(PyUnicodeObject *self, PyObject *args)
7590{
7591 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007592 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007593
Martin v. Löwis18e16552006-02-15 17:27:45 +00007594 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007595 return NULL;
7596
7597 if (substring == Py_None)
7598 return split(self, NULL, maxcount);
7599 else if (PyUnicode_Check(substring))
7600 return split(self, (PyUnicodeObject *)substring, maxcount);
7601 else
7602 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7603}
7604
Thomas Wouters477c8d52006-05-27 19:21:47 +00007605PyObject *
7606PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7607{
7608 PyObject* str_obj;
7609 PyObject* sep_obj;
7610 PyObject* out;
7611
7612 str_obj = PyUnicode_FromObject(str_in);
7613 if (!str_obj)
7614 return NULL;
7615 sep_obj = PyUnicode_FromObject(sep_in);
7616 if (!sep_obj) {
7617 Py_DECREF(str_obj);
7618 return NULL;
7619 }
7620
7621 out = stringlib_partition(
7622 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7623 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7624 );
7625
7626 Py_DECREF(sep_obj);
7627 Py_DECREF(str_obj);
7628
7629 return out;
7630}
7631
7632
7633PyObject *
7634PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7635{
7636 PyObject* str_obj;
7637 PyObject* sep_obj;
7638 PyObject* out;
7639
7640 str_obj = PyUnicode_FromObject(str_in);
7641 if (!str_obj)
7642 return NULL;
7643 sep_obj = PyUnicode_FromObject(sep_in);
7644 if (!sep_obj) {
7645 Py_DECREF(str_obj);
7646 return NULL;
7647 }
7648
7649 out = stringlib_rpartition(
7650 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7651 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7652 );
7653
7654 Py_DECREF(sep_obj);
7655 Py_DECREF(str_obj);
7656
7657 return out;
7658}
7659
7660PyDoc_STRVAR(partition__doc__,
7661"S.partition(sep) -> (head, sep, tail)\n\
7662\n\
7663Searches for the separator sep in S, and returns the part before it,\n\
7664the separator itself, and the part after it. If the separator is not\n\
7665found, returns S and two empty strings.");
7666
7667static PyObject*
7668unicode_partition(PyUnicodeObject *self, PyObject *separator)
7669{
7670 return PyUnicode_Partition((PyObject *)self, separator);
7671}
7672
7673PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007674"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007675\n\
7676Searches for the separator sep in S, starting at the end of S, and returns\n\
7677the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00007678separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007679
7680static PyObject*
7681unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7682{
7683 return PyUnicode_RPartition((PyObject *)self, separator);
7684}
7685
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007686PyObject *PyUnicode_RSplit(PyObject *s,
7687 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007688 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007689{
7690 PyObject *result;
7691
7692 s = PyUnicode_FromObject(s);
7693 if (s == NULL)
7694 return NULL;
7695 if (sep != NULL) {
7696 sep = PyUnicode_FromObject(sep);
7697 if (sep == NULL) {
7698 Py_DECREF(s);
7699 return NULL;
7700 }
7701 }
7702
7703 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7704
7705 Py_DECREF(s);
7706 Py_XDECREF(sep);
7707 return result;
7708}
7709
7710PyDoc_STRVAR(rsplit__doc__,
7711"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7712\n\
7713Return a list of the words in S, using sep as the\n\
7714delimiter string, starting at the end of the string and\n\
7715working to the front. If maxsplit is given, at most maxsplit\n\
7716splits are done. If sep is not specified, any whitespace string\n\
7717is a separator.");
7718
7719static PyObject*
7720unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7721{
7722 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007723 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007724
Martin v. Löwis18e16552006-02-15 17:27:45 +00007725 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007726 return NULL;
7727
7728 if (substring == Py_None)
7729 return rsplit(self, NULL, maxcount);
7730 else if (PyUnicode_Check(substring))
7731 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7732 else
7733 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7734}
7735
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007736PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007737"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007738\n\
7739Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007740Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007741is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007742
7743static PyObject*
7744unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7745{
Guido van Rossum86662912000-04-11 15:38:46 +00007746 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007747
Guido van Rossum86662912000-04-11 15:38:46 +00007748 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007749 return NULL;
7750
Guido van Rossum86662912000-04-11 15:38:46 +00007751 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007752}
7753
7754static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007755PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007756{
Walter Dörwald346737f2007-05-31 10:44:43 +00007757 if (PyUnicode_CheckExact(self)) {
7758 Py_INCREF(self);
7759 return self;
7760 } else
7761 /* Subtype -- return genuine unicode string with the same value. */
7762 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
7763 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007764}
7765
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007766PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007767"S.swapcase() -> unicode\n\
7768\n\
7769Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007770and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007771
7772static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007773unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007774{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007775 return fixup(self, fixswapcase);
7776}
7777
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007778PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007779"S.translate(table) -> unicode\n\
7780\n\
7781Return a copy of the string S, where all characters have been mapped\n\
7782through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007783Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7784Unmapped characters are left untouched. Characters mapped to None\n\
7785are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007786
7787static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007788unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007789{
Tim Petersced69f82003-09-16 20:30:58 +00007790 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007791 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00007792 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007793 "ignore");
7794}
7795
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007796PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007797"S.upper() -> unicode\n\
7798\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007799Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007800
7801static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007802unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007803{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007804 return fixup(self, fixupper);
7805}
7806
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007807PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007808"S.zfill(width) -> unicode\n\
7809\n\
7810Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007811of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007812
7813static PyObject *
7814unicode_zfill(PyUnicodeObject *self, PyObject *args)
7815{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007816 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007817 PyUnicodeObject *u;
7818
Martin v. Löwis18e16552006-02-15 17:27:45 +00007819 Py_ssize_t width;
7820 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007821 return NULL;
7822
7823 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007824 if (PyUnicode_CheckExact(self)) {
7825 Py_INCREF(self);
7826 return (PyObject*) self;
7827 }
7828 else
7829 return PyUnicode_FromUnicode(
7830 PyUnicode_AS_UNICODE(self),
7831 PyUnicode_GET_SIZE(self)
7832 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007833 }
7834
7835 fill = width - self->length;
7836
7837 u = pad(self, fill, 0, '0');
7838
Walter Dörwald068325e2002-04-15 13:36:47 +00007839 if (u == NULL)
7840 return NULL;
7841
Guido van Rossumd57fd912000-03-10 22:53:23 +00007842 if (u->str[fill] == '+' || u->str[fill] == '-') {
7843 /* move sign to beginning of string */
7844 u->str[0] = u->str[fill];
7845 u->str[fill] = '0';
7846 }
7847
7848 return (PyObject*) u;
7849}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007850
#if 0
/* Compiled out.  Debugging helper that reports the current value of
   unicode_freelist_size as a Python int. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
7858
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007859PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007860"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007861\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007862Return True if S starts with the specified prefix, False otherwise.\n\
7863With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007864With optional end, stop comparing S at that position.\n\
7865prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007866
7867static PyObject *
7868unicode_startswith(PyUnicodeObject *self,
7869 PyObject *args)
7870{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007871 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007872 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007873 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007874 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007875 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007876
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007877 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007878 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007879 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007880 if (PyTuple_Check(subobj)) {
7881 Py_ssize_t i;
7882 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7883 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7884 PyTuple_GET_ITEM(subobj, i));
7885 if (substring == NULL)
7886 return NULL;
7887 result = tailmatch(self, substring, start, end, -1);
7888 Py_DECREF(substring);
7889 if (result) {
7890 Py_RETURN_TRUE;
7891 }
7892 }
7893 /* nothing matched */
7894 Py_RETURN_FALSE;
7895 }
7896 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007897 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007898 return NULL;
7899 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007900 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007901 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007902}
7903
7904
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007905PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007906"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007907\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007908Return True if S ends with the specified suffix, False otherwise.\n\
7909With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007910With optional end, stop comparing S at that position.\n\
7911suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007912
7913static PyObject *
7914unicode_endswith(PyUnicodeObject *self,
7915 PyObject *args)
7916{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007917 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007918 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007919 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007920 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007921 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007922
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007923 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7924 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007925 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007926 if (PyTuple_Check(subobj)) {
7927 Py_ssize_t i;
7928 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7929 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7930 PyTuple_GET_ITEM(subobj, i));
7931 if (substring == NULL)
7932 return NULL;
7933 result = tailmatch(self, substring, start, end, +1);
7934 Py_DECREF(substring);
7935 if (result) {
7936 Py_RETURN_TRUE;
7937 }
7938 }
7939 Py_RETURN_FALSE;
7940 }
7941 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007942 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007943 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007944
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007945 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007946 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007947 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007948}
7949
Eric Smith8c663262007-08-25 02:26:07 +00007950#include "stringlib/string_format.h"
7951
7952PyDoc_STRVAR(format__doc__,
7953"S.format(*args, **kwargs) -> unicode\n\
7954\n\
7955");
7956
Eric Smith8c663262007-08-25 02:26:07 +00007957PyDoc_STRVAR(p_format__doc__,
7958"S.__format__(format_spec) -> unicode\n\
7959\n\
7960");
7961
7962static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007963unicode_getnewargs(PyUnicodeObject *v)
7964{
7965 return Py_BuildValue("(u#)", v->str, v->length);
7966}
7967
7968
Guido van Rossumd57fd912000-03-10 22:53:23 +00007969static PyMethodDef unicode_methods[] = {
7970
7971 /* Order is according to common usage: often used methods should
7972 appear first, since lookup is done sequentially. */
7973
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007974 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
7975 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7976 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007977 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007978 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7979 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7980 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7981 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7982 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7983 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7984 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007985 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007986 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7987 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7988 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007989 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007990 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7991 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7992 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007993 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007994 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007995 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007996 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007997 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7998 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7999 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8000 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8001 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8002 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8003 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8004 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8005 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8006 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8007 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8008 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8009 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8010 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008011 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008012 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008013 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
8014 {"__format__", (PyCFunction) unicode_unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008015 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8016 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Walter Dörwald068325e2002-04-15 13:36:47 +00008017#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008018 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008019#endif
8020
8021#if 0
8022 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008023 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008024#endif
8025
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008026 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008027 {NULL, NULL}
8028};
8029
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008030static PyObject *
8031unicode_mod(PyObject *v, PyObject *w)
8032{
8033 if (!PyUnicode_Check(v)) {
8034 Py_INCREF(Py_NotImplemented);
8035 return Py_NotImplemented;
8036 }
8037 return PyUnicode_Format(v, w);
8038}
8039
8040static PyNumberMethods unicode_as_number = {
8041 0, /*nb_add*/
8042 0, /*nb_subtract*/
8043 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008044 unicode_mod, /*nb_remainder*/
8045};
8046
Guido van Rossumd57fd912000-03-10 22:53:23 +00008047static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008048 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008049 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008050 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8051 (ssizeargfunc) unicode_getitem, /* sq_item */
Thomas Woutersd2cf20e2007-08-30 22:57:53 +00008052 0, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008053 0, /* sq_ass_item */
8054 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008055 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008056};
8057
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008058static PyObject*
8059unicode_subscript(PyUnicodeObject* self, PyObject* item)
8060{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008061 if (PyIndex_Check(item)) {
8062 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008063 if (i == -1 && PyErr_Occurred())
8064 return NULL;
8065 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008066 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008067 return unicode_getitem(self, i);
8068 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008069 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008070 Py_UNICODE* source_buf;
8071 Py_UNICODE* result_buf;
8072 PyObject* result;
8073
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008074 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008075 &start, &stop, &step, &slicelength) < 0) {
8076 return NULL;
8077 }
8078
8079 if (slicelength <= 0) {
8080 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008081 } else if (start == 0 && step == 1 && slicelength == self->length &&
8082 PyUnicode_CheckExact(self)) {
8083 Py_INCREF(self);
8084 return (PyObject *)self;
8085 } else if (step == 1) {
8086 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008087 } else {
8088 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008089 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
8090 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008091
8092 if (result_buf == NULL)
8093 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008094
8095 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8096 result_buf[i] = source_buf[cur];
8097 }
Tim Petersced69f82003-09-16 20:30:58 +00008098
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008099 result = PyUnicode_FromUnicode(result_buf, slicelength);
8100 PyMem_FREE(result_buf);
8101 return result;
8102 }
8103 } else {
8104 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8105 return NULL;
8106 }
8107}
8108
8109static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008110 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008111 (binaryfunc)unicode_subscript, /* mp_subscript */
8112 (objobjargproc)0, /* mp_ass_subscript */
8113};
8114
Guido van Rossumd57fd912000-03-10 22:53:23 +00008115
Guido van Rossumd57fd912000-03-10 22:53:23 +00008116/* Helpers for PyUnicode_Format() */
8117
8118static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008119getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008120{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008121 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008122 if (argidx < arglen) {
8123 (*p_argidx)++;
8124 if (arglen < 0)
8125 return args;
8126 else
8127 return PyTuple_GetItem(args, argidx);
8128 }
8129 PyErr_SetString(PyExc_TypeError,
8130 "not enough arguments for format string");
8131 return NULL;
8132}
8133
/* Bit flags for a format specifier; they can be OR-ed together.
   F_ALT is the flag that formatfloat() and formatint() test to emit the
   '#' alternate form. */
#define F_LJUST (1 << 0)
#define F_SIGN  (1 << 1)
#define F_BLANK (1 << 2)
#define F_ALT   (1 << 3)
#define F_ZERO  (1 << 4)
8139
Martin v. Löwis18e16552006-02-15 17:27:45 +00008140static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008141strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008142{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008143 register Py_ssize_t i;
8144 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008145 for (i = len - 1; i >= 0; i--)
8146 buffer[i] = (Py_UNICODE) charbuffer[i];
8147
Guido van Rossumd57fd912000-03-10 22:53:23 +00008148 return len;
8149}
8150
Neal Norwitzfc76d632006-01-10 06:03:13 +00008151static int
8152doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8153{
Tim Peters15231542006-02-16 01:08:01 +00008154 Py_ssize_t result;
8155
Neal Norwitzfc76d632006-01-10 06:03:13 +00008156 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008157 result = strtounicode(buffer, (char *)buffer);
8158 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008159}
8160
8161static int
8162longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8163{
Tim Peters15231542006-02-16 01:08:01 +00008164 Py_ssize_t result;
8165
Neal Norwitzfc76d632006-01-10 06:03:13 +00008166 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008167 result = strtounicode(buffer, (char *)buffer);
8168 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008169}
8170
Guido van Rossum078151d2002-08-11 04:24:12 +00008171/* XXX To save some code duplication, formatfloat/long/int could have been
8172 shared with stringobject.c, converting from 8-bit to Unicode after the
8173 formatting is done. */
8174
Guido van Rossumd57fd912000-03-10 22:53:23 +00008175static int
8176formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008177 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008178 int flags,
8179 int prec,
8180 int type,
8181 PyObject *v)
8182{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008183 /* fmt = '%#.' + `prec` + `type`
8184 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00008185 char fmt[20];
8186 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008187
Guido van Rossumd57fd912000-03-10 22:53:23 +00008188 x = PyFloat_AsDouble(v);
8189 if (x == -1.0 && PyErr_Occurred())
8190 return -1;
8191 if (prec < 0)
8192 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008193 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8194 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008195 /* Worst case length calc to ensure no buffer overrun:
8196
8197 'g' formats:
8198 fmt = %#.<prec>g
8199 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8200 for any double rep.)
8201 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8202
8203 'f' formats:
8204 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8205 len = 1 + 50 + 1 + prec = 52 + prec
8206
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008207 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008208 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008209
8210 */
Guido van Rossumb5a755e2007-07-18 18:15:48 +00008211 if (((type == 'g' || type == 'G') &&
8212 buflen <= (size_t)10 + (size_t)prec) ||
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008213 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008214 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008215 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008216 return -1;
8217 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008218 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8219 (flags&F_ALT) ? "#" : "",
8220 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008221 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008222}
8223
Tim Peters38fd5b62000-09-21 05:43:11 +00008224static PyObject*
8225formatlong(PyObject *val, int flags, int prec, int type)
8226{
8227 char *buf;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008228 int len;
Tim Peters38fd5b62000-09-21 05:43:11 +00008229 PyObject *str; /* temporary string object. */
Walter Dörwald63a28be2007-06-20 15:11:12 +00008230 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008231
8232 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8233 if (!str)
8234 return NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008235 result = PyUnicode_FromStringAndSize(buf, len);
Tim Peters38fd5b62000-09-21 05:43:11 +00008236 Py_DECREF(str);
Walter Dörwald63a28be2007-06-20 15:11:12 +00008237 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008238}
8239
Guido van Rossumd57fd912000-03-10 22:53:23 +00008240static int
8241formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008242 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008243 int flags,
8244 int prec,
8245 int type,
8246 PyObject *v)
8247{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008248 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008249 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8250 * + 1 + 1
8251 * = 24
8252 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008253 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008254 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008255 long x;
8256
8257 x = PyInt_AsLong(v);
8258 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008259 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008260 if (x < 0 && type == 'u') {
8261 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008262 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008263 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8264 sign = "-";
8265 else
8266 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008267 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008268 prec = 1;
8269
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008270 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8271 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008272 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008273 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008274 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008275 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008276 return -1;
8277 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008278
8279 if ((flags & F_ALT) &&
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008280 (type == 'x' || type == 'X' || type == 'o')) {
8281 /* When converting under %#o, %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008282 * of issues that cause pain:
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008283 * - for %#o, we want a different base marker than C
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008284 * - when 0 is being converted, the C standard leaves off
8285 * the '0x' or '0X', which is inconsistent with other
8286 * %#x/%#X conversions and inconsistent with Python's
8287 * hex() function
8288 * - there are platforms that violate the standard and
8289 * convert 0 with the '0x' or '0X'
8290 * (Metrowerks, Compaq Tru64)
8291 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008292 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008293 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008294 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008295 * We can achieve the desired consistency by inserting our
8296 * own '0x' or '0X' prefix, and substituting %x/%X in place
8297 * of %#x/%#X.
8298 *
8299 * Note that this is the same approach as used in
8300 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008301 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008302 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8303 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008304 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008305 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008306 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8307 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008308 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008309 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008310 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008311 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008312 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008313 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008314}
8315
8316static int
8317formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008318 size_t buflen,
8319 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008320{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008321 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008322 if (PyUnicode_Check(v)) {
8323 if (PyUnicode_GET_SIZE(v) != 1)
8324 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008325 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008326 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008327
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008328 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00008329 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008330 goto onError;
8331 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
8332 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008333
8334 else {
8335 /* Integer input truncated to a character */
8336 long x;
8337 x = PyInt_AsLong(v);
8338 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008339 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008340#ifdef Py_UNICODE_WIDE
8341 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008342 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008343 "%c arg not in range(0x110000) "
8344 "(wide Python build)");
8345 return -1;
8346 }
8347#else
8348 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008349 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008350 "%c arg not in range(0x10000) "
8351 "(narrow Python build)");
8352 return -1;
8353 }
8354#endif
8355 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008356 }
8357 buf[1] = '\0';
8358 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008359
8360 onError:
8361 PyErr_SetString(PyExc_TypeError,
8362 "%c requires int or char");
8363 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008364}
8365
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted. XXX This is a magic number. Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format. For now, the current solution is sufficient.

   The expansion is fully parenthesized so the cast cannot bind to
   neighbouring tokens at the point of use.
*/
#define FORMATBUFLEN ((size_t)120)
8375
Guido van Rossumd57fd912000-03-10 22:53:23 +00008376PyObject *PyUnicode_Format(PyObject *format,
8377 PyObject *args)
8378{
8379 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008380 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008381 int args_owned = 0;
8382 PyUnicodeObject *result = NULL;
8383 PyObject *dict = NULL;
8384 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008385
Guido van Rossumd57fd912000-03-10 22:53:23 +00008386 if (format == NULL || args == NULL) {
8387 PyErr_BadInternalCall();
8388 return NULL;
8389 }
8390 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008391 if (uformat == NULL)
8392 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008393 fmt = PyUnicode_AS_UNICODE(uformat);
8394 fmtcnt = PyUnicode_GET_SIZE(uformat);
8395
8396 reslen = rescnt = fmtcnt + 100;
8397 result = _PyUnicode_New(reslen);
8398 if (result == NULL)
8399 goto onError;
8400 res = PyUnicode_AS_UNICODE(result);
8401
8402 if (PyTuple_Check(args)) {
8403 arglen = PyTuple_Size(args);
8404 argidx = 0;
8405 }
8406 else {
8407 arglen = -1;
8408 argidx = -2;
8409 }
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00008410 if (Py_Type(args)->tp_as_mapping && !PyTuple_Check(args) &&
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008411 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008412 dict = args;
8413
8414 while (--fmtcnt >= 0) {
8415 if (*fmt != '%') {
8416 if (--rescnt < 0) {
8417 rescnt = fmtcnt + 100;
8418 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008419 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008420 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008421 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8422 --rescnt;
8423 }
8424 *res++ = *fmt++;
8425 }
8426 else {
8427 /* Got a format specifier */
8428 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008429 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008430 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008431 Py_UNICODE c = '\0';
8432 Py_UNICODE fill;
8433 PyObject *v = NULL;
8434 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008435 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008436 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008437 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008438 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008439
8440 fmt++;
8441 if (*fmt == '(') {
8442 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008443 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008444 PyObject *key;
8445 int pcount = 1;
8446
8447 if (dict == NULL) {
8448 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008449 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008450 goto onError;
8451 }
8452 ++fmt;
8453 --fmtcnt;
8454 keystart = fmt;
8455 /* Skip over balanced parentheses */
8456 while (pcount > 0 && --fmtcnt >= 0) {
8457 if (*fmt == ')')
8458 --pcount;
8459 else if (*fmt == '(')
8460 ++pcount;
8461 fmt++;
8462 }
8463 keylen = fmt - keystart - 1;
8464 if (fmtcnt < 0 || pcount > 0) {
8465 PyErr_SetString(PyExc_ValueError,
8466 "incomplete format key");
8467 goto onError;
8468 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008469#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008470 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008471 then looked up since Python uses strings to hold
8472 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008473 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008474 key = PyUnicode_EncodeUTF8(keystart,
8475 keylen,
8476 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008477#else
8478 key = PyUnicode_FromUnicode(keystart, keylen);
8479#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008480 if (key == NULL)
8481 goto onError;
8482 if (args_owned) {
8483 Py_DECREF(args);
8484 args_owned = 0;
8485 }
8486 args = PyObject_GetItem(dict, key);
8487 Py_DECREF(key);
8488 if (args == NULL) {
8489 goto onError;
8490 }
8491 args_owned = 1;
8492 arglen = -1;
8493 argidx = -2;
8494 }
8495 while (--fmtcnt >= 0) {
8496 switch (c = *fmt++) {
8497 case '-': flags |= F_LJUST; continue;
8498 case '+': flags |= F_SIGN; continue;
8499 case ' ': flags |= F_BLANK; continue;
8500 case '#': flags |= F_ALT; continue;
8501 case '0': flags |= F_ZERO; continue;
8502 }
8503 break;
8504 }
8505 if (c == '*') {
8506 v = getnextarg(args, arglen, &argidx);
8507 if (v == NULL)
8508 goto onError;
8509 if (!PyInt_Check(v)) {
8510 PyErr_SetString(PyExc_TypeError,
8511 "* wants int");
8512 goto onError;
8513 }
8514 width = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008515 if (width == -1 && PyErr_Occurred())
8516 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008517 if (width < 0) {
8518 flags |= F_LJUST;
8519 width = -width;
8520 }
8521 if (--fmtcnt >= 0)
8522 c = *fmt++;
8523 }
8524 else if (c >= '0' && c <= '9') {
8525 width = c - '0';
8526 while (--fmtcnt >= 0) {
8527 c = *fmt++;
8528 if (c < '0' || c > '9')
8529 break;
8530 if ((width*10) / 10 != width) {
8531 PyErr_SetString(PyExc_ValueError,
8532 "width too big");
8533 goto onError;
8534 }
8535 width = width*10 + (c - '0');
8536 }
8537 }
8538 if (c == '.') {
8539 prec = 0;
8540 if (--fmtcnt >= 0)
8541 c = *fmt++;
8542 if (c == '*') {
8543 v = getnextarg(args, arglen, &argidx);
8544 if (v == NULL)
8545 goto onError;
8546 if (!PyInt_Check(v)) {
8547 PyErr_SetString(PyExc_TypeError,
8548 "* wants int");
8549 goto onError;
8550 }
8551 prec = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008552 if (prec == -1 && PyErr_Occurred())
8553 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008554 if (prec < 0)
8555 prec = 0;
8556 if (--fmtcnt >= 0)
8557 c = *fmt++;
8558 }
8559 else if (c >= '0' && c <= '9') {
8560 prec = c - '0';
8561 while (--fmtcnt >= 0) {
8562 c = Py_CHARMASK(*fmt++);
8563 if (c < '0' || c > '9')
8564 break;
8565 if ((prec*10) / 10 != prec) {
8566 PyErr_SetString(PyExc_ValueError,
8567 "prec too big");
8568 goto onError;
8569 }
8570 prec = prec*10 + (c - '0');
8571 }
8572 }
8573 } /* prec */
8574 if (fmtcnt >= 0) {
8575 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008576 if (--fmtcnt >= 0)
8577 c = *fmt++;
8578 }
8579 }
8580 if (fmtcnt < 0) {
8581 PyErr_SetString(PyExc_ValueError,
8582 "incomplete format");
8583 goto onError;
8584 }
8585 if (c != '%') {
8586 v = getnextarg(args, arglen, &argidx);
8587 if (v == NULL)
8588 goto onError;
8589 }
8590 sign = 0;
8591 fill = ' ';
8592 switch (c) {
8593
8594 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008595 pbuf = formatbuf;
8596 /* presume that buffer length is at least 1 */
8597 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008598 len = 1;
8599 break;
8600
8601 case 's':
8602 case 'r':
8603 if (PyUnicode_Check(v) && c == 's') {
8604 temp = v;
8605 Py_INCREF(temp);
8606 }
8607 else {
8608 PyObject *unicode;
8609 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008610 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008611 else
8612 temp = PyObject_Repr(v);
8613 if (temp == NULL)
8614 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008615 if (PyUnicode_Check(temp))
8616 /* nothing to do */;
8617 else if (PyString_Check(temp)) {
8618 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00008619 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00008620 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00008621 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008622 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008623 Py_DECREF(temp);
8624 temp = unicode;
8625 if (temp == NULL)
8626 goto onError;
8627 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008628 else {
8629 Py_DECREF(temp);
8630 PyErr_SetString(PyExc_TypeError,
8631 "%s argument has non-string str()");
8632 goto onError;
8633 }
8634 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008635 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008636 len = PyUnicode_GET_SIZE(temp);
8637 if (prec >= 0 && len > prec)
8638 len = prec;
8639 break;
8640
8641 case 'i':
8642 case 'd':
8643 case 'u':
8644 case 'o':
8645 case 'x':
8646 case 'X':
8647 if (c == 'i')
8648 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00008649 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008650 temp = formatlong(v, flags, prec, c);
8651 if (!temp)
8652 goto onError;
8653 pbuf = PyUnicode_AS_UNICODE(temp);
8654 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00008655 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008656 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008657 else {
8658 pbuf = formatbuf;
8659 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8660 flags, prec, c, v);
8661 if (len < 0)
8662 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008663 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008664 }
8665 if (flags & F_ZERO)
8666 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008667 break;
8668
8669 case 'e':
8670 case 'E':
8671 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008672 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008673 case 'g':
8674 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008675 if (c == 'F')
8676 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008677 pbuf = formatbuf;
8678 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8679 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008680 if (len < 0)
8681 goto onError;
8682 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008683 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008684 fill = '0';
8685 break;
8686
8687 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008688 pbuf = formatbuf;
8689 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008690 if (len < 0)
8691 goto onError;
8692 break;
8693
8694 default:
8695 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008696 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00008697 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008698 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008699 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00008700 (Py_ssize_t)(fmt - 1 -
8701 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008702 goto onError;
8703 }
8704 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008705 if (*pbuf == '-' || *pbuf == '+') {
8706 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008707 len--;
8708 }
8709 else if (flags & F_SIGN)
8710 sign = '+';
8711 else if (flags & F_BLANK)
8712 sign = ' ';
8713 else
8714 sign = 0;
8715 }
8716 if (width < len)
8717 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008718 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008719 reslen -= rescnt;
8720 rescnt = width + fmtcnt + 100;
8721 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008722 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008723 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008724 PyErr_NoMemory();
8725 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008726 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008727 if (_PyUnicode_Resize(&result, reslen) < 0) {
8728 Py_XDECREF(temp);
8729 goto onError;
8730 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008731 res = PyUnicode_AS_UNICODE(result)
8732 + reslen - rescnt;
8733 }
8734 if (sign) {
8735 if (fill != ' ')
8736 *res++ = sign;
8737 rescnt--;
8738 if (width > len)
8739 width--;
8740 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008741 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008742 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008743 assert(pbuf[1] == c);
8744 if (fill != ' ') {
8745 *res++ = *pbuf++;
8746 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008747 }
Tim Petersfff53252001-04-12 18:38:48 +00008748 rescnt -= 2;
8749 width -= 2;
8750 if (width < 0)
8751 width = 0;
8752 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008753 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008754 if (width > len && !(flags & F_LJUST)) {
8755 do {
8756 --rescnt;
8757 *res++ = fill;
8758 } while (--width > len);
8759 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008760 if (fill == ' ') {
8761 if (sign)
8762 *res++ = sign;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008763 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008764 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008765 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008766 *res++ = *pbuf++;
8767 *res++ = *pbuf++;
8768 }
8769 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008770 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008771 res += len;
8772 rescnt -= len;
8773 while (--width >= len) {
8774 --rescnt;
8775 *res++ = ' ';
8776 }
8777 if (dict && (argidx < arglen) && c != '%') {
8778 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008779 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008780 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008781 goto onError;
8782 }
8783 Py_XDECREF(temp);
8784 } /* '%' */
8785 } /* until end */
8786 if (argidx < arglen && !dict) {
8787 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008788 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008789 goto onError;
8790 }
8791
Thomas Woutersa96affe2006-03-12 00:29:36 +00008792 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8793 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008794 if (args_owned) {
8795 Py_DECREF(args);
8796 }
8797 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008798 return (PyObject *)result;
8799
8800 onError:
8801 Py_XDECREF(result);
8802 Py_DECREF(uformat);
8803 if (args_owned) {
8804 Py_DECREF(args);
8805 }
8806 return NULL;
8807}
8808
Jeremy Hylton938ace62002-07-17 16:30:39 +00008809static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008810unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8811
Tim Peters6d6c1a32001-08-02 04:15:00 +00008812static PyObject *
8813unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8814{
8815 PyObject *x = NULL;
Guido van Rossum55b4a7b2007-07-11 09:28:11 +00008816 static char *kwlist[] = {"object", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008817 char *encoding = NULL;
8818 char *errors = NULL;
8819
Guido van Rossume023fe02001-08-30 03:12:59 +00008820 if (type != &PyUnicode_Type)
8821 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008822 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8823 kwlist, &x, &encoding, &errors))
8824 return NULL;
8825 if (x == NULL)
8826 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008827 if (encoding == NULL && errors == NULL)
8828 return PyObject_Unicode(x);
8829 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008830 return PyUnicode_FromEncodedObject(x, encoding, errors);
8831}
8832
Guido van Rossume023fe02001-08-30 03:12:59 +00008833static PyObject *
8834unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8835{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008836 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008837 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008838
8839 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8840 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8841 if (tmp == NULL)
8842 return NULL;
8843 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008844 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008845 if (pnew == NULL) {
8846 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008847 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008848 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008849 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
8850 if (pnew->str == NULL) {
8851 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008852 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008853 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008854 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008855 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008856 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8857 pnew->length = n;
8858 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008859 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008860 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008861}
8862
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008863PyDoc_STRVAR(unicode_doc,
Collin Winterd474ce82007-08-07 19:42:11 +00008864"str(string [, encoding[, errors]]) -> object\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00008865\n\
Collin Winterd474ce82007-08-07 19:42:11 +00008866Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008867encoding defaults to the current default string encoding.\n\
8868errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008869
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008870static PyObject *unicode_iter(PyObject *seq);
8871
Guido van Rossumd57fd912000-03-10 22:53:23 +00008872PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00008873 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossum84fc66d2007-05-03 17:18:26 +00008874 "str", /* tp_name */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008875 sizeof(PyUnicodeObject), /* tp_size */
8876 0, /* tp_itemsize */
8877 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00008878 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008879 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008880 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008881 0, /* tp_setattr */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008882 0, /* tp_compare */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008883 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008884 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008885 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008886 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008887 (hashfunc) unicode_hash, /* tp_hash*/
8888 0, /* tp_call*/
8889 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008890 PyObject_GenericGetAttr, /* tp_getattro */
8891 0, /* tp_setattro */
Alexandre Vassalotti70a23712007-10-14 02:05:51 +00008892 0, /* tp_as_buffer */
Thomas Wouters27d517b2007-02-25 20:39:11 +00008893 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
8894 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008895 unicode_doc, /* tp_doc */
8896 0, /* tp_traverse */
8897 0, /* tp_clear */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008898 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008899 0, /* tp_weaklistoffset */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008900 unicode_iter, /* tp_iter */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008901 0, /* tp_iternext */
8902 unicode_methods, /* tp_methods */
8903 0, /* tp_members */
8904 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00008905 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008906 0, /* tp_dict */
8907 0, /* tp_descr_get */
8908 0, /* tp_descr_set */
8909 0, /* tp_dictoffset */
8910 0, /* tp_init */
8911 0, /* tp_alloc */
8912 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008913 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008914};
8915
8916/* Initialize the Unicode implementation */
8917
Thomas Wouters78890102000-07-22 19:25:51 +00008918void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008919{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008920 int i;
8921
Thomas Wouters477c8d52006-05-27 19:21:47 +00008922 /* XXX - move this array to unicodectype.c ? */
8923 Py_UNICODE linebreak[] = {
8924 0x000A, /* LINE FEED */
8925 0x000D, /* CARRIAGE RETURN */
8926 0x001C, /* FILE SEPARATOR */
8927 0x001D, /* GROUP SEPARATOR */
8928 0x001E, /* RECORD SEPARATOR */
8929 0x0085, /* NEXT LINE */
8930 0x2028, /* LINE SEPARATOR */
8931 0x2029, /* PARAGRAPH SEPARATOR */
8932 };
8933
Fred Drakee4315f52000-05-09 19:53:39 +00008934 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008935 unicode_freelist = NULL;
8936 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008937 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008938 if (!unicode_empty)
8939 return;
8940
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008941 for (i = 0; i < 256; i++)
8942 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008943 if (PyType_Ready(&PyUnicode_Type) < 0)
8944 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008945
8946 /* initialize the linebreak bloom filter */
8947 bloom_linebreak = make_bloom_mask(
8948 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8949 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008950
8951 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008952}
8953
8954/* Finalize the Unicode implementation */
8955
8956void
Thomas Wouters78890102000-07-22 19:25:51 +00008957_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008958{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008959 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008960 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008961
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008962 Py_XDECREF(unicode_empty);
8963 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008964
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008965 for (i = 0; i < 256; i++) {
8966 if (unicode_latin1[i]) {
8967 Py_DECREF(unicode_latin1[i]);
8968 unicode_latin1[i] = NULL;
8969 }
8970 }
8971
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008972 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008973 PyUnicodeObject *v = u;
8974 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00008975 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00008976 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008977 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008978 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008979 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008980 unicode_freelist = NULL;
8981 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008982}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008983
Walter Dörwald16807132007-05-25 13:52:07 +00008984void
8985PyUnicode_InternInPlace(PyObject **p)
8986{
8987 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
8988 PyObject *t;
8989 if (s == NULL || !PyUnicode_Check(s))
8990 Py_FatalError(
8991 "PyUnicode_InternInPlace: unicode strings only please!");
8992 /* If it's a subclass, we don't really know what putting
8993 it in the interned dict might do. */
8994 if (!PyUnicode_CheckExact(s))
8995 return;
8996 if (PyUnicode_CHECK_INTERNED(s))
8997 return;
8998 if (interned == NULL) {
8999 interned = PyDict_New();
9000 if (interned == NULL) {
9001 PyErr_Clear(); /* Don't leave an exception */
9002 return;
9003 }
9004 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009005 /* It might be that the GetItem call fails even
9006 though the key is present in the dictionary,
9007 namely when this happens during a stack overflow. */
9008 Py_ALLOW_RECURSION
Walter Dörwald16807132007-05-25 13:52:07 +00009009 t = PyDict_GetItem(interned, (PyObject *)s);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009010 Py_END_ALLOW_RECURSION
9011
Walter Dörwald16807132007-05-25 13:52:07 +00009012 if (t) {
9013 Py_INCREF(t);
9014 Py_DECREF(*p);
9015 *p = t;
9016 return;
9017 }
9018
Martin v. Löwis5b222132007-06-10 09:51:05 +00009019 PyThreadState_GET()->recursion_critical = 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009020 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9021 PyErr_Clear();
Martin v. Löwis5b222132007-06-10 09:51:05 +00009022 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009023 return;
9024 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009025 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009026 /* The two references in interned are not counted by refcnt.
9027 The deallocator will take care of this */
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00009028 Py_Refcnt(s) -= 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009029 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
9030}
9031
9032void
9033PyUnicode_InternImmortal(PyObject **p)
9034{
9035 PyUnicode_InternInPlace(p);
9036 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9037 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9038 Py_INCREF(*p);
9039 }
9040}
9041
9042PyObject *
9043PyUnicode_InternFromString(const char *cp)
9044{
9045 PyObject *s = PyUnicode_FromString(cp);
9046 if (s == NULL)
9047 return NULL;
9048 PyUnicode_InternInPlace(&s);
9049 return s;
9050}
9051
9052void _Py_ReleaseInternedUnicodeStrings(void)
9053{
9054 PyObject *keys;
9055 PyUnicodeObject *s;
9056 Py_ssize_t i, n;
9057 Py_ssize_t immortal_size = 0, mortal_size = 0;
9058
9059 if (interned == NULL || !PyDict_Check(interned))
9060 return;
9061 keys = PyDict_Keys(interned);
9062 if (keys == NULL || !PyList_Check(keys)) {
9063 PyErr_Clear();
9064 return;
9065 }
9066
9067 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9068 detector, interned unicode strings are not forcibly deallocated;
9069 rather, we give them their stolen references back, and then clear
9070 and DECREF the interned dict. */
9071
9072 n = PyList_GET_SIZE(keys);
9073 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
9074 n);
9075 for (i = 0; i < n; i++) {
9076 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9077 switch (s->state) {
9078 case SSTATE_NOT_INTERNED:
9079 /* XXX Shouldn't happen */
9080 break;
9081 case SSTATE_INTERNED_IMMORTAL:
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00009082 Py_Refcnt(s) += 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009083 immortal_size += s->length;
9084 break;
9085 case SSTATE_INTERNED_MORTAL:
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00009086 Py_Refcnt(s) += 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009087 mortal_size += s->length;
9088 break;
9089 default:
9090 Py_FatalError("Inconsistent interned string state.");
9091 }
9092 s->state = SSTATE_NOT_INTERNED;
9093 }
9094 fprintf(stderr, "total size of all interned strings: "
9095 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9096 "mortal/immortal\n", mortal_size, immortal_size);
9097 Py_DECREF(keys);
9098 PyDict_Clear(interned);
9099 Py_DECREF(interned);
9100 interned = NULL;
9101}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009102
9103
9104/********************* Unicode Iterator **************************/
9105
9106typedef struct {
9107 PyObject_HEAD
Guido van Rossum49d6b072006-08-17 21:11:47 +00009108 Py_ssize_t it_index;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009109 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
9110} unicodeiterobject;
9111
9112static void
9113unicodeiter_dealloc(unicodeiterobject *it)
9114{
9115 _PyObject_GC_UNTRACK(it);
9116 Py_XDECREF(it->it_seq);
9117 PyObject_GC_Del(it);
9118}
9119
9120static int
9121unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9122{
9123 Py_VISIT(it->it_seq);
9124 return 0;
9125}
9126
9127static PyObject *
9128unicodeiter_next(unicodeiterobject *it)
9129{
9130 PyUnicodeObject *seq;
9131 PyObject *item;
9132
9133 assert(it != NULL);
9134 seq = it->it_seq;
9135 if (seq == NULL)
9136 return NULL;
9137 assert(PyUnicode_Check(seq));
9138
9139 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009140 item = PyUnicode_FromUnicode(
9141 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009142 if (item != NULL)
9143 ++it->it_index;
9144 return item;
9145 }
9146
9147 Py_DECREF(seq);
9148 it->it_seq = NULL;
9149 return NULL;
9150}
9151
9152static PyObject *
9153unicodeiter_len(unicodeiterobject *it)
9154{
9155 Py_ssize_t len = 0;
9156 if (it->it_seq)
9157 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9158 return PyInt_FromSsize_t(len);
9159}
9160
9161PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9162
9163static PyMethodDef unicodeiter_methods[] = {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009164 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
9165 length_hint_doc},
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009166 {NULL, NULL} /* sentinel */
9167};
9168
9169PyTypeObject PyUnicodeIter_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009170 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009171 "unicodeiterator", /* tp_name */
9172 sizeof(unicodeiterobject), /* tp_basicsize */
9173 0, /* tp_itemsize */
9174 /* methods */
Guido van Rossum49d6b072006-08-17 21:11:47 +00009175 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009176 0, /* tp_print */
9177 0, /* tp_getattr */
9178 0, /* tp_setattr */
9179 0, /* tp_compare */
9180 0, /* tp_repr */
9181 0, /* tp_as_number */
9182 0, /* tp_as_sequence */
9183 0, /* tp_as_mapping */
9184 0, /* tp_hash */
9185 0, /* tp_call */
9186 0, /* tp_str */
9187 PyObject_GenericGetAttr, /* tp_getattro */
9188 0, /* tp_setattro */
9189 0, /* tp_as_buffer */
9190 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9191 0, /* tp_doc */
9192 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9193 0, /* tp_clear */
9194 0, /* tp_richcompare */
9195 0, /* tp_weaklistoffset */
9196 PyObject_SelfIter, /* tp_iter */
9197 (iternextfunc)unicodeiter_next, /* tp_iternext */
9198 unicodeiter_methods, /* tp_methods */
9199 0,
9200};
9201
9202static PyObject *
9203unicode_iter(PyObject *seq)
9204{
9205 unicodeiterobject *it;
9206
9207 if (!PyUnicode_Check(seq)) {
9208 PyErr_BadInternalCall();
9209 return NULL;
9210 }
9211 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9212 if (it == NULL)
9213 return NULL;
9214 it->it_index = 0;
9215 Py_INCREF(seq);
9216 it->it_seq = (PyUnicodeObject *)seq;
9217 _PyObject_GC_TRACK(it);
9218 return (PyObject *)it;
9219}
9220
Martin v. Löwis5b222132007-06-10 09:51:05 +00009221size_t
9222Py_UNICODE_strlen(const Py_UNICODE *u)
9223{
9224 int res = 0;
9225 while(*u++)
9226 res++;
9227 return res;
9228}
9229
9230Py_UNICODE*
9231Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9232{
9233 Py_UNICODE *u = s1;
9234 while ((*u++ = *s2++));
9235 return s1;
9236}
9237
9238Py_UNICODE*
9239Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9240{
9241 Py_UNICODE *u = s1;
9242 while ((*u++ = *s2++))
9243 if (n-- == 0)
9244 break;
9245 return s1;
9246}
9247
9248int
9249Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9250{
9251 while (*s1 && *s2 && *s1 == *s2)
9252 s1++, s2++;
9253 if (*s1 && *s2)
9254 return (*s1 < *s2) ? -1 : +1;
9255 if (*s1)
9256 return 1;
9257 if (*s2)
9258 return -1;
9259 return 0;
9260}
9261
9262Py_UNICODE*
9263Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9264{
9265 const Py_UNICODE *p;
9266 for (p = s; *p; p++)
9267 if (*p == c)
9268 return (Py_UNICODE*)p;
9269 return NULL;
9270}
9271
9272
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009273#ifdef __cplusplus
9274}
9275#endif
9276
9277
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009278/*
9279Local variables:
9280c-basic-offset: 4
9281indent-tabs-mode: nil
9282End:
9283*/