blob: a24cdba41b918d7bff8db593b37e2116c3c35ac1 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Eric Smith8c663262007-08-25 02:26:07 +000048#include "formatter_unicode.h"
49
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000050#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000051#include <windows.h>
52#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000053
Guido van Rossumd57fd912000-03-10 22:53:23 +000054/* Limit for the Unicode object free list */
55
56#define MAX_UNICODE_FREELIST_SIZE 1024
57
58/* Limit for the Unicode object free list stay alive optimization.
59
60 The implementation will keep allocated Unicode memory intact for
61 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000062 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000063
Barry Warsaw51ac5802000-03-20 16:36:48 +000064 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000065 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000066 malloc()-overhead) bytes of unused garbage.
67
68 Setting the limit to 0 effectively turns the feature off.
69
Guido van Rossumfd4b9572000-04-10 13:51:10 +000070 Note: This is an experimental feature ! If you get core dumps when
71 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000072
73*/
74
Guido van Rossumfd4b9572000-04-10 13:51:10 +000075#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000076
77/* Endianness switches; defaults to little endian */
78
79#ifdef WORDS_BIGENDIAN
80# define BYTEORDER_IS_BIG_ENDIAN
81#else
82# define BYTEORDER_IS_LITTLE_ENDIAN
83#endif
84
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000085/* --- Globals ------------------------------------------------------------
86
87 The globals are initialized by the _PyUnicode_Init() API and should
88 not be used before calling that API.
89
90*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000091
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000092
93#ifdef __cplusplus
94extern "C" {
95#endif
96
Walter Dörwald16807132007-05-25 13:52:07 +000097/* This dictionary holds all interned unicode strings. Note that references
98 to strings in this dictionary are *not* counted in the string's ob_refcnt.
99 When the interned string reaches a refcnt of 0 the string deallocation
100 function will delete the reference from this dictionary.
101
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->ob_sstate?2:0)
104*/
105static PyObject *interned;
106
Guido van Rossumd57fd912000-03-10 22:53:23 +0000107/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000108static PyUnicodeObject *unicode_freelist;
109static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000110
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000111/* The empty Unicode object is shared to improve performance. */
112static PyUnicodeObject *unicode_empty;
113
114/* Single character Unicode strings in the Latin-1 range are being
115 shared as well. */
116static PyUnicodeObject *unicode_latin1[256];
117
Fred Drakee4315f52000-05-09 19:53:39 +0000118/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000119 parameter; it is fixed to "utf-8". Always use the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000120 PyUnicode_GetDefaultEncoding() API to access this global.
121
Alexandre Vassalotti3d2fd7f2007-10-16 00:26:33 +0000122 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000123 hard coded default!
124*/
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000125static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000126
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000127Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000128PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000129{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000130#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000131 return 0x10FFFF;
132#else
133 /* This is actually an illegal character, so it should
134 not be passed to unichr. */
135 return 0xFFFF;
136#endif
137}
138
Thomas Wouters477c8d52006-05-27 19:21:47 +0000139/* --- Bloom Filters ----------------------------------------------------- */
140
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the least
   significant 5 bits of each Unicode character as the bit index. */
144
145/* the linebreak mask is set up by Unicode_Init below */
146
/* Integer type used to hold a bloom mask. */
#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Nonzero if the bit selected by the low 5 bits of ch is set in mask.
   The shift is performed on an unsigned long so that bit 31 can be
   tested without overflowing a signed int, and mask is parenthesized
   so that compound expressions may be passed safely. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Cheap bloom pre-check followed by the exact linebreak test. */
#define BLOOM_LINEBREAK(ch) \
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
155
156Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
157{
158 /* calculate simple bloom-style bitmask for a given unicode string */
159
160 long mask;
161 Py_ssize_t i;
162
163 mask = 0;
164 for (i = 0; i < len; i++)
165 mask |= (1 << (ptr[i] & 0x1F));
166
167 return mask;
168}
169
170Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
171{
172 Py_ssize_t i;
173
174 for (i = 0; i < setlen; i++)
175 if (set[i] == chr)
176 return 1;
177
178 return 0;
179}
180
/* Membership test: cheap bloom pre-check first, exact scan of set only
   if the bloom bit is present.  The whole expansion is parenthesized so
   the macro can be negated or combined with other operators safely. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
183
Guido van Rossumd57fd912000-03-10 22:53:23 +0000184/* --- Unicode Object ----------------------------------------------------- */
185
186static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000187int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000188 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189{
190 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000191
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000192 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000193 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000194 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000195
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000196 /* Resizing shared object (unicode_empty or single character
197 objects) in-place is not allowed. Use PyUnicode_Resize()
198 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000199
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000200 if (unicode == unicode_empty ||
201 (unicode->length == 1 &&
202 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000203 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000204 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000205 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000206 return -1;
207 }
208
Thomas Wouters477c8d52006-05-27 19:21:47 +0000209 /* We allocate one more byte to make sure the string is Ux0000 terminated.
210 The overallocation is also used by fastsearch, which assumes that it's
211 safe to look at str[length] (without making any assumptions about what
212 it contains). */
213
Guido van Rossumd57fd912000-03-10 22:53:23 +0000214 oldstr = unicode->str;
215 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
216 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000217 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 PyErr_NoMemory();
219 return -1;
220 }
221 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000222 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000223
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000224 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000225 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000226 if (unicode->defenc) {
227 Py_DECREF(unicode->defenc);
228 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000229 }
230 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000231
Guido van Rossumd57fd912000-03-10 22:53:23 +0000232 return 0;
233}
234
/* We allocate one more Py_UNICODE to make sure the string is
   Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000238
239 XXX This allocator could further be enhanced by assuring that the
240 free list never reduces its size below 1.
241
242*/
243
244static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000245PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000246{
247 register PyUnicodeObject *unicode;
248
Thomas Wouters477c8d52006-05-27 19:21:47 +0000249 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000250 if (length == 0 && unicode_empty != NULL) {
251 Py_INCREF(unicode_empty);
252 return unicode_empty;
253 }
254
255 /* Unicode freelist & memory allocation */
256 if (unicode_freelist) {
257 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000258 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000259 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000260 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000261 /* Keep-Alive optimization: we only upsize the buffer,
262 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000263 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000264 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000265 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000266 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000267 }
268 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000269 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000270 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000271 }
272 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000273 }
274 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000275 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000276 if (unicode == NULL)
277 return NULL;
278 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
279 }
280
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000281 if (!unicode->str) {
282 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000283 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000284 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000285 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000286 * the caller fails before initializing str -- unicode_resize()
287 * reads str[0], and the Keep-Alive optimization can keep memory
288 * allocated for str alive across a call to unicode_dealloc(unicode).
289 * We don't want unicode_resize to read uninitialized memory in
290 * that case.
291 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000292 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000293 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000294 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000295 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000296 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000297 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000298 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000299
300 onError:
301 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000302 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000303 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000304}
305
306static
Guido van Rossum9475a232001-10-05 20:51:39 +0000307void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000308{
Walter Dörwald16807132007-05-25 13:52:07 +0000309 switch (PyUnicode_CHECK_INTERNED(unicode)) {
310 case SSTATE_NOT_INTERNED:
311 break;
312
313 case SSTATE_INTERNED_MORTAL:
314 /* revive dead object temporarily for DelItem */
Martin v. Löwis5d7428b2007-07-21 18:47:48 +0000315 Py_Refcnt(unicode) = 3;
Walter Dörwald16807132007-05-25 13:52:07 +0000316 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
317 Py_FatalError(
318 "deletion of interned unicode string failed");
319 break;
320
321 case SSTATE_INTERNED_IMMORTAL:
322 Py_FatalError("Immortal interned unicode string died.");
323
324 default:
325 Py_FatalError("Inconsistent interned unicode string state.");
326 }
327
Guido van Rossum604ddf82001-12-06 20:03:56 +0000328 if (PyUnicode_CheckExact(unicode) &&
329 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000330 /* Keep-Alive optimization */
331 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000332 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000333 unicode->str = NULL;
334 unicode->length = 0;
335 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000336 if (unicode->defenc) {
337 Py_DECREF(unicode->defenc);
338 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000339 }
340 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000341 *(PyUnicodeObject **)unicode = unicode_freelist;
342 unicode_freelist = unicode;
343 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000344 }
345 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000346 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000347 Py_XDECREF(unicode->defenc);
Martin v. Löwis9f2e3462007-07-21 17:22:18 +0000348 Py_Type(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000349 }
350}
351
Martin v. Löwis18e16552006-02-15 17:27:45 +0000352int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000353{
354 register PyUnicodeObject *v;
355
356 /* Argument checks */
357 if (unicode == NULL) {
358 PyErr_BadInternalCall();
359 return -1;
360 }
361 v = (PyUnicodeObject *)*unicode;
Martin v. Löwis9f2e3462007-07-21 17:22:18 +0000362 if (v == NULL || !PyUnicode_Check(v) || Py_Refcnt(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000363 PyErr_BadInternalCall();
364 return -1;
365 }
366
367 /* Resizing unicode_empty and single character objects is not
368 possible since these are being shared. We simply return a fresh
369 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000370 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000371 (v == unicode_empty || v->length == 1)) {
372 PyUnicodeObject *w = _PyUnicode_New(length);
373 if (w == NULL)
374 return -1;
375 Py_UNICODE_COPY(w->str, v->str,
376 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000377 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000378 *unicode = (PyObject *)w;
379 return 0;
380 }
381
382 /* Note that we don't have to modify *unicode for unshared Unicode
383 objects, since we can modify them in-place. */
384 return unicode_resize(v, length);
385}
386
387/* Internal API for use in unicodeobject.c only ! */
/* Convenience wrapper around PyUnicode_Resize() that casts the address
   of a PyUnicodeObject* variable; both arguments are parenthesized so
   arbitrary expressions can be passed. */
#define _PyUnicode_Resize(unicodevar, length) \
    PyUnicode_Resize(((PyObject **)(unicodevar)), (length))
390
Guido van Rossumd57fd912000-03-10 22:53:23 +0000391PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000392 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000393{
394 PyUnicodeObject *unicode;
395
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000396 /* If the Unicode data is known at construction time, we can apply
397 some optimizations which share commonly used objects. */
398 if (u != NULL) {
399
400 /* Optimization for empty strings */
401 if (size == 0 && unicode_empty != NULL) {
402 Py_INCREF(unicode_empty);
403 return (PyObject *)unicode_empty;
404 }
405
406 /* Single character Unicode objects in the Latin-1 range are
407 shared when using this constructor */
408 if (size == 1 && *u < 256) {
409 unicode = unicode_latin1[*u];
410 if (!unicode) {
411 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000412 if (!unicode)
413 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000414 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000415 unicode_latin1[*u] = unicode;
416 }
417 Py_INCREF(unicode);
418 return (PyObject *)unicode;
419 }
420 }
Tim Petersced69f82003-09-16 20:30:58 +0000421
Guido van Rossumd57fd912000-03-10 22:53:23 +0000422 unicode = _PyUnicode_New(size);
423 if (!unicode)
424 return NULL;
425
426 /* Copy the Unicode data into the new object */
427 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000428 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000429
430 return (PyObject *)unicode;
431}
432
Walter Dörwaldd2034312007-05-18 16:29:38 +0000433PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000434{
435 PyUnicodeObject *unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000436 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000437 some optimizations which share commonly used objects.
438 Also, this means the input must be UTF-8, so fall back to the
439 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000440 if (u != NULL) {
441
442 /* Optimization for empty strings */
443 if (size == 0 && unicode_empty != NULL) {
444 Py_INCREF(unicode_empty);
445 return (PyObject *)unicode_empty;
446 }
447
Martin v. Löwis9c121062007-08-05 20:26:11 +0000448 /* Single characters are shared when using this constructor.
449 Restrict to ASCII, since the input must be UTF-8. */
450 if (size == 1 && Py_CHARMASK(*u) < 128) {
Guido van Rossum00058aa2007-07-19 18:21:28 +0000451 unicode = unicode_latin1[Py_CHARMASK(*u)];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000452 if (!unicode) {
453 unicode = _PyUnicode_New(1);
454 if (!unicode)
455 return NULL;
Guido van Rossum00058aa2007-07-19 18:21:28 +0000456 unicode->str[0] = Py_CHARMASK(*u);
457 unicode_latin1[Py_CHARMASK(*u)] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000458 }
459 Py_INCREF(unicode);
460 return (PyObject *)unicode;
461 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000462
463 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000464 }
465
Walter Dörwald55507312007-05-18 13:12:10 +0000466 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000467 if (!unicode)
468 return NULL;
469
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000470 return (PyObject *)unicode;
471}
472
Walter Dörwaldd2034312007-05-18 16:29:38 +0000473PyObject *PyUnicode_FromString(const char *u)
474{
475 size_t size = strlen(u);
476 if (size > PY_SSIZE_T_MAX) {
477 PyErr_SetString(PyExc_OverflowError, "input too long");
478 return NULL;
479 }
480
481 return PyUnicode_FromStringAndSize(u, size);
482}
483
Guido van Rossumd57fd912000-03-10 22:53:23 +0000484#ifdef HAVE_WCHAR_H
485
486PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000487 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000488{
489 PyUnicodeObject *unicode;
490
491 if (w == NULL) {
492 PyErr_BadInternalCall();
493 return NULL;
494 }
495
496 unicode = _PyUnicode_New(size);
497 if (!unicode)
498 return NULL;
499
500 /* Copy the wchar_t data into the new object */
501#ifdef HAVE_USABLE_WCHAR_T
502 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000503#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000504 {
505 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000506 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000507 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000508 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000509 *u++ = *w++;
510 }
511#endif
512
513 return (PyObject *)unicode;
514}
515
/* Build a printf-style conversion specification in fmt.

   The result has the shape  %[0][width][.precision][l|<size_t flag>]c
   where:
     - the '0' flag is emitted only when both width and zeropad are nonzero;
     - width and precision are emitted only when nonzero;
     - longflag selects the 'l' length modifier and takes priority over
       size_tflag, which selects PY_FORMAT_SIZE_T;
     - c is the conversion character.

   fmt must be large enough for the result including the trailing NUL;
   no bounds checking is done here. */
static void
makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
{
    *fmt++ = '%';
    if (width) {
        if (zeropad)
            *fmt++ = '0';
        fmt += sprintf(fmt, "%d", width);
    }
    if (precision)
        fmt += sprintf(fmt, ".%d", precision);
    if (longflag)
        *fmt++ = 'l';
    else if (size_tflag) {
        /* PY_FORMAT_SIZE_T is a string literal: read it through a
           pointer-to-const. */
        const char *f = PY_FORMAT_SIZE_T;
        while (*f)
            *fmt++ = *f++;
    }
    *fmt++ = c;
    *fmt = '\0';
}
537
/* Append the NUL-terminated char string `string` to the output cursor.
   Relies on two variables in the enclosing scope: `copy` (a const char *
   scratch pointer) and `s` (the output cursor, advanced past the copied
   characters).  Wrapped in do { } while (0) so that it behaves as a
   single statement, e.g. in an unbraced if/else. */
#define appendstring(string) \
    do { for (copy = (string); *copy;) *s++ = *copy++; } while (0)
539
540PyObject *
541PyUnicode_FromFormatV(const char *format, va_list vargs)
542{
543 va_list count;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000544 Py_ssize_t callcount = 0;
545 PyObject **callresults = NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +0000546 PyObject **callresult = NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000547 Py_ssize_t n = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000548 int width = 0;
549 int precision = 0;
550 int zeropad;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000551 const char* f;
552 Py_UNICODE *s;
553 PyObject *string;
554 /* used by sprintf */
555 char buffer[21];
Walter Dörwald346737f2007-05-31 10:44:43 +0000556 /* use abuffer instead of buffer, if we need more space
557 * (which can happen if there's a format specifier with width). */
558 char *abuffer = NULL;
559 char *realbuffer;
560 Py_ssize_t abuffersize = 0;
561 char fmt[60]; /* should be enough for %0width.precisionld */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000562 const char *copy;
563
564#ifdef VA_LIST_IS_ARRAY
565 Py_MEMCPY(count, vargs, sizeof(va_list));
566#else
567#ifdef __va_copy
568 __va_copy(count, vargs);
569#else
570 count = vargs;
571#endif
572#endif
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000573 /* step 1: count the number of %S/%R format specifications
574 * (we call PyObject_Unicode()/PyObject_Repr() for these objects
575 * once during step 3 and put the result in an array) */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000576 for (f = format; *f; f++) {
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000577 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000578 ++callcount;
579 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000580 /* step 2: allocate memory for the results of
581 * PyObject_Unicode()/PyObject_Repr() calls */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000582 if (callcount) {
583 callresults = PyMem_Malloc(sizeof(PyObject *)*callcount);
584 if (!callresults) {
585 PyErr_NoMemory();
586 return NULL;
587 }
588 callresult = callresults;
589 }
590 /* step 3: figure out how large a buffer we need */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000591 for (f = format; *f; f++) {
592 if (*f == '%') {
593 const char* p = f;
Walter Dörwald346737f2007-05-31 10:44:43 +0000594 width = 0;
595 while (isdigit(Py_CHARMASK(*f)))
596 width = (width*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000597 while (*++f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
598 ;
599
600 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
601 * they don't affect the amount of space we reserve.
602 */
603 if ((*f == 'l' || *f == 'z') &&
604 (f[1] == 'd' || f[1] == 'u'))
Eric Smithddd25822007-08-27 11:33:42 +0000605 ++f;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000606
607 switch (*f) {
608 case 'c':
609 (void)va_arg(count, int);
610 /* fall through... */
611 case '%':
612 n++;
613 break;
614 case 'd': case 'u': case 'i': case 'x':
615 (void) va_arg(count, int);
616 /* 20 bytes is enough to hold a 64-bit
617 integer. Decimal takes the most space.
Walter Dörwald346737f2007-05-31 10:44:43 +0000618 This isn't enough for octal.
619 If a width is specified we need more
620 (which we allocate later). */
621 if (width < 20)
622 width = 20;
623 n += width;
624 if (abuffersize < width)
625 abuffersize = width;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000626 break;
627 case 's':
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000628 {
629 /* UTF-8 */
630 unsigned char*s;
631 s = va_arg(count, unsigned char*);
632 while (*s) {
633 if (*s < 128) {
634 n++; s++;
635 } else if (*s < 0xc0) {
636 /* invalid UTF-8 */
637 n++; s++;
638 } else if (*s < 0xc0) {
639 n++;
640 s++; if(!*s)break;
641 s++;
642 } else if (*s < 0xe0) {
643 n++;
644 s++; if(!*s)break;
645 s++; if(!*s)break;
646 s++;
647 } else {
648 #ifdef Py_UNICODE_WIDE
649 n++;
650 #else
651 n+=2;
652 #endif
653 s++; if(!*s)break;
654 s++; if(!*s)break;
655 s++; if(!*s)break;
656 s++;
657 }
658 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000659 break;
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000660 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000661 case 'U':
662 {
663 PyObject *obj = va_arg(count, PyObject *);
664 assert(obj && PyUnicode_Check(obj));
665 n += PyUnicode_GET_SIZE(obj);
666 break;
667 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000668 case 'V':
669 {
670 PyObject *obj = va_arg(count, PyObject *);
671 const char *str = va_arg(count, const char *);
672 assert(obj || str);
673 assert(!obj || PyUnicode_Check(obj));
674 if (obj)
675 n += PyUnicode_GET_SIZE(obj);
676 else
677 n += strlen(str);
678 break;
679 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000680 case 'S':
681 {
682 PyObject *obj = va_arg(count, PyObject *);
683 PyObject *str;
684 assert(obj);
685 str = PyObject_Unicode(obj);
686 if (!str)
687 goto fail;
688 n += PyUnicode_GET_SIZE(str);
689 /* Remember the str and switch to the next slot */
690 *callresult++ = str;
691 break;
692 }
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000693 case 'R':
694 {
695 PyObject *obj = va_arg(count, PyObject *);
696 PyObject *repr;
697 assert(obj);
698 repr = PyObject_Repr(obj);
699 if (!repr)
700 goto fail;
701 n += PyUnicode_GET_SIZE(repr);
702 /* Remember the repr and switch to the next slot */
703 *callresult++ = repr;
704 break;
705 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000706 case 'p':
707 (void) va_arg(count, int);
708 /* maximum 64-bit pointer representation:
709 * 0xffffffffffffffff
710 * so 19 characters is enough.
711 * XXX I count 18 -- what's the extra for?
712 */
713 n += 19;
714 break;
715 default:
716 /* if we stumble upon an unknown
717 formatting code, copy the rest of
718 the format string to the output
719 string. (we cannot just skip the
720 code, since there's no way to know
721 what's in the argument list) */
722 n += strlen(p);
723 goto expand;
724 }
725 } else
726 n++;
727 }
728 expand:
Walter Dörwald346737f2007-05-31 10:44:43 +0000729 if (abuffersize > 20) {
730 abuffer = PyMem_Malloc(abuffersize);
731 if (!abuffer) {
732 PyErr_NoMemory();
733 goto fail;
734 }
735 realbuffer = abuffer;
736 }
737 else
738 realbuffer = buffer;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000739 /* step 4: fill the buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000740 /* Since we've analyzed how much space we need for the worst case,
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000741 we don't have to resize the string.
742 There can be no errors beyond this point. */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000743 string = PyUnicode_FromUnicode(NULL, n);
744 if (!string)
Walter Dörwald346737f2007-05-31 10:44:43 +0000745 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000746
747 s = PyUnicode_AS_UNICODE(string);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000748 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000749
750 for (f = format; *f; f++) {
751 if (*f == '%') {
752 const char* p = f++;
753 int longflag = 0;
754 int size_tflag = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000755 zeropad = (*f == '0');
756 /* parse the width.precision part */
757 width = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000758 while (isdigit(Py_CHARMASK(*f)))
Walter Dörwald346737f2007-05-31 10:44:43 +0000759 width = (width*10) + *f++ - '0';
760 precision = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000761 if (*f == '.') {
762 f++;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000763 while (isdigit(Py_CHARMASK(*f)))
Walter Dörwald346737f2007-05-31 10:44:43 +0000764 precision = (precision*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000765 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000766 /* handle the long flag, but only for %ld and %lu.
767 others can be added when necessary. */
768 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
769 longflag = 1;
770 ++f;
771 }
772 /* handle the size_t flag. */
773 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
774 size_tflag = 1;
775 ++f;
776 }
777
778 switch (*f) {
779 case 'c':
780 *s++ = va_arg(vargs, int);
781 break;
782 case 'd':
Walter Dörwald346737f2007-05-31 10:44:43 +0000783 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000784 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000785 sprintf(realbuffer, fmt, va_arg(vargs, long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000786 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000787 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000788 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000789 sprintf(realbuffer, fmt, va_arg(vargs, int));
790 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000791 break;
792 case 'u':
Walter Dörwald346737f2007-05-31 10:44:43 +0000793 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000794 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000795 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000796 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000797 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000798 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000799 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
800 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000801 break;
802 case 'i':
Walter Dörwald346737f2007-05-31 10:44:43 +0000803 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
804 sprintf(realbuffer, fmt, va_arg(vargs, int));
805 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000806 break;
807 case 'x':
Walter Dörwald346737f2007-05-31 10:44:43 +0000808 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
809 sprintf(realbuffer, fmt, va_arg(vargs, int));
810 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000811 break;
812 case 's':
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000813 {
814 /* Parameter must be UTF-8 encoded.
815 In case of encoding errors, use
816 the replacement character. */
817 PyObject *u;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000818 p = va_arg(vargs, char*);
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000819 u = PyUnicode_DecodeUTF8(p, strlen(p),
820 "replace");
821 if (!u)
822 goto fail;
823 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
824 PyUnicode_GET_SIZE(u));
825 s += PyUnicode_GET_SIZE(u);
826 Py_DECREF(u);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000827 break;
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000828 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000829 case 'U':
830 {
831 PyObject *obj = va_arg(vargs, PyObject *);
Walter Dörwald5c2fab62007-05-24 19:51:02 +0000832 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
833 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
834 s += size;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000835 break;
836 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000837 case 'V':
838 {
839 PyObject *obj = va_arg(vargs, PyObject *);
840 const char *str = va_arg(vargs, const char *);
841 if (obj) {
842 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
843 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
844 s += size;
845 } else {
846 appendstring(str);
847 }
848 break;
849 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000850 case 'S':
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000851 case 'R':
852 {
Guido van Rossum755114a2007-06-13 01:04:27 +0000853 Py_UNICODE *ucopy;
854 Py_ssize_t usize;
855 Py_ssize_t upos;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000856 /* unused, since we already have the result */
857 (void) va_arg(vargs, PyObject *);
Guido van Rossum755114a2007-06-13 01:04:27 +0000858 ucopy = PyUnicode_AS_UNICODE(*callresult);
859 usize = PyUnicode_GET_SIZE(*callresult);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000860 for (upos = 0; upos<usize;)
861 *s++ = ucopy[upos++];
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000862 /* We're done with the unicode()/repr() => forget it */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000863 Py_DECREF(*callresult);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000864 /* switch to next unicode()/repr() result */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000865 ++callresult;
866 break;
867 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000868 case 'p':
869 sprintf(buffer, "%p", va_arg(vargs, void*));
870 /* %p is ill-defined: ensure leading 0x. */
871 if (buffer[1] == 'X')
872 buffer[1] = 'x';
873 else if (buffer[1] != 'x') {
874 memmove(buffer+2, buffer, strlen(buffer)+1);
875 buffer[0] = '0';
876 buffer[1] = 'x';
877 }
878 appendstring(buffer);
879 break;
880 case '%':
881 *s++ = '%';
882 break;
883 default:
884 appendstring(p);
885 goto end;
886 }
887 } else
888 *s++ = *f;
889 }
890
891 end:
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000892 if (callresults)
893 PyMem_Free(callresults);
Walter Dörwald346737f2007-05-31 10:44:43 +0000894 if (abuffer)
895 PyMem_Free(abuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000896 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
897 return string;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000898 fail:
899 if (callresults) {
900 PyObject **callresult2 = callresults;
Guido van Rossum307fa8c2007-07-16 20:46:27 +0000901 while (callresult2 < callresult) {
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000902 Py_DECREF(*callresult2);
903 ++callresult2;
904 }
905 PyMem_Free(callresults);
906 }
Walter Dörwald346737f2007-05-31 10:44:43 +0000907 if (abuffer)
908 PyMem_Free(abuffer);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000909 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000910}
911
912#undef appendstring
913
914PyObject *
915PyUnicode_FromFormat(const char *format, ...)
916{
917 PyObject* ret;
918 va_list vargs;
919
920#ifdef HAVE_STDARG_PROTOTYPES
921 va_start(vargs, format);
922#else
923 va_start(vargs);
924#endif
925 ret = PyUnicode_FromFormatV(format, vargs);
926 va_end(vargs);
927 return ret;
928}
929
Martin v. Löwis18e16552006-02-15 17:27:45 +0000930Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
931 wchar_t *w,
932 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000933{
934 if (unicode == NULL) {
935 PyErr_BadInternalCall();
936 return -1;
937 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000938
939 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000940 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000941 size = PyUnicode_GET_SIZE(unicode) + 1;
942
Guido van Rossumd57fd912000-03-10 22:53:23 +0000943#ifdef HAVE_USABLE_WCHAR_T
944 memcpy(w, unicode->str, size * sizeof(wchar_t));
945#else
946 {
947 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000948 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000949 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000950 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000951 *w++ = *u++;
952 }
953#endif
954
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000955 if (size > PyUnicode_GET_SIZE(unicode))
956 return PyUnicode_GET_SIZE(unicode);
957 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000958 return size;
959}
960
961#endif
962
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000963PyObject *PyUnicode_FromOrdinal(int ordinal)
964{
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000965 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000966
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000967 if (ordinal < 0 || ordinal > 0x10ffff) {
968 PyErr_SetString(PyExc_ValueError,
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000969 "chr() arg not in range(0x110000)");
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000970 return NULL;
971 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000972
973#ifndef Py_UNICODE_WIDE
974 if (ordinal > 0xffff) {
975 ordinal -= 0x10000;
976 s[0] = 0xD800 | (ordinal >> 10);
977 s[1] = 0xDC00 | (ordinal & 0x3FF);
978 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000979 }
980#endif
981
Hye-Shik Chang40574832004-04-06 07:24:51 +0000982 s[0] = (Py_UNICODE)ordinal;
983 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000984}
985
Guido van Rossumd57fd912000-03-10 22:53:23 +0000986PyObject *PyUnicode_FromObject(register PyObject *obj)
987{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000988 /* XXX Perhaps we should make this API an alias of
989 PyObject_Unicode() instead ?! */
990 if (PyUnicode_CheckExact(obj)) {
991 Py_INCREF(obj);
992 return obj;
993 }
994 if (PyUnicode_Check(obj)) {
995 /* For a Unicode subtype that's not a Unicode object,
996 return a true Unicode object with the same data. */
997 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
998 PyUnicode_GET_SIZE(obj));
999 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001000 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1001}
1002
1003PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1004 const char *encoding,
1005 const char *errors)
1006{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001007 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001008 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001009 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001010
Guido van Rossumd57fd912000-03-10 22:53:23 +00001011 if (obj == NULL) {
1012 PyErr_BadInternalCall();
1013 return NULL;
1014 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001015
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001016 if (PyUnicode_Check(obj)) {
1017 PyErr_SetString(PyExc_TypeError,
1018 "decoding Unicode is not supported");
1019 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001020 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001021
1022 /* Coerce object */
1023 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001024 s = PyString_AS_STRING(obj);
1025 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001026 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001027 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1028 /* Overwrite the error message with something more useful in
1029 case of a TypeError. */
1030 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001031 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001032 "coercing to Unicode: need string or buffer, "
1033 "%.80s found",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00001034 Py_Type(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001035 goto onError;
1036 }
Tim Petersced69f82003-09-16 20:30:58 +00001037
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001038 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001039 if (len == 0) {
1040 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001041 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001042 }
Tim Petersced69f82003-09-16 20:30:58 +00001043 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001044 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001045
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001046 return v;
1047
1048 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001049 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001050}
1051
1052PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001053 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001054 const char *encoding,
1055 const char *errors)
1056{
1057 PyObject *buffer = NULL, *unicode;
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001058 Py_buffer info;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001059
1060 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001061 encoding = PyUnicode_GetDefaultEncoding();
1062
1063 /* Shortcuts for common default encodings */
1064 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001065 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001066 else if (strcmp(encoding, "latin-1") == 0)
1067 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001068#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1069 else if (strcmp(encoding, "mbcs") == 0)
1070 return PyUnicode_DecodeMBCS(s, size, errors);
1071#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001072 else if (strcmp(encoding, "ascii") == 0)
1073 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001074
1075 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001076 buffer = NULL;
1077 if (PyBuffer_FillInfo(&info, (void *)s, size, 1, PyBUF_SIMPLE) < 0)
1078 goto onError;
1079 buffer = PyMemoryView_FromMemory(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001080 if (buffer == NULL)
1081 goto onError;
1082 unicode = PyCodec_Decode(buffer, encoding, errors);
1083 if (unicode == NULL)
1084 goto onError;
1085 if (!PyUnicode_Check(unicode)) {
1086 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001087 "decoder did not return an unicode object (type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00001088 Py_Type(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001089 Py_DECREF(unicode);
1090 goto onError;
1091 }
1092 Py_DECREF(buffer);
1093 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001094
Guido van Rossumd57fd912000-03-10 22:53:23 +00001095 onError:
1096 Py_XDECREF(buffer);
1097 return NULL;
1098}
1099
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001100PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1101 const char *encoding,
1102 const char *errors)
1103{
1104 PyObject *v;
1105
1106 if (!PyUnicode_Check(unicode)) {
1107 PyErr_BadArgument();
1108 goto onError;
1109 }
1110
1111 if (encoding == NULL)
1112 encoding = PyUnicode_GetDefaultEncoding();
1113
1114 /* Decode via the codec registry */
1115 v = PyCodec_Decode(unicode, encoding, errors);
1116 if (v == NULL)
1117 goto onError;
1118 return v;
1119
1120 onError:
1121 return NULL;
1122}
1123
Guido van Rossumd57fd912000-03-10 22:53:23 +00001124PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001125 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001126 const char *encoding,
1127 const char *errors)
1128{
1129 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001130
Guido van Rossumd57fd912000-03-10 22:53:23 +00001131 unicode = PyUnicode_FromUnicode(s, size);
1132 if (unicode == NULL)
1133 return NULL;
1134 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1135 Py_DECREF(unicode);
1136 return v;
1137}
1138
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001139PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1140 const char *encoding,
1141 const char *errors)
1142{
1143 PyObject *v;
1144
1145 if (!PyUnicode_Check(unicode)) {
1146 PyErr_BadArgument();
1147 goto onError;
1148 }
1149
1150 if (encoding == NULL)
1151 encoding = PyUnicode_GetDefaultEncoding();
1152
1153 /* Encode via the codec registry */
1154 v = PyCodec_Encode(unicode, encoding, errors);
1155 if (v == NULL)
1156 goto onError;
1157 return v;
1158
1159 onError:
1160 return NULL;
1161}
1162
Guido van Rossumd57fd912000-03-10 22:53:23 +00001163PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1164 const char *encoding,
1165 const char *errors)
1166{
1167 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001168
Guido van Rossumd57fd912000-03-10 22:53:23 +00001169 if (!PyUnicode_Check(unicode)) {
1170 PyErr_BadArgument();
1171 goto onError;
1172 }
Fred Drakee4315f52000-05-09 19:53:39 +00001173
Tim Petersced69f82003-09-16 20:30:58 +00001174 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001175 encoding = PyUnicode_GetDefaultEncoding();
1176
1177 /* Shortcuts for common default encodings */
1178 if (errors == NULL) {
1179 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001180 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001181 else if (strcmp(encoding, "latin-1") == 0)
1182 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001183#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1184 else if (strcmp(encoding, "mbcs") == 0)
1185 return PyUnicode_AsMBCSString(unicode);
1186#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001187 else if (strcmp(encoding, "ascii") == 0)
1188 return PyUnicode_AsASCIIString(unicode);
1189 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001190
1191 /* Encode via the codec registry */
1192 v = PyCodec_Encode(unicode, encoding, errors);
1193 if (v == NULL)
1194 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001195 if (!PyBytes_Check(v)) {
1196 if (PyString_Check(v)) {
1197 /* Old codec, turn it into bytes */
1198 PyObject *b = PyBytes_FromObject(v);
1199 Py_DECREF(v);
1200 return b;
1201 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001202 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001203 "encoder did not return a bytes object "
1204 "(type=%.400s, encoding=%.20s, errors=%.20s)",
1205 v->ob_type->tp_name,
1206 encoding ? encoding : "NULL",
1207 errors ? errors : "NULL");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001208 Py_DECREF(v);
1209 goto onError;
1210 }
1211 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001212
Guido van Rossumd57fd912000-03-10 22:53:23 +00001213 onError:
1214 return NULL;
1215}
1216
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001217PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1218 const char *errors)
1219{
1220 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001221 PyObject *b;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001222 if (v)
1223 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001224 if (errors != NULL)
1225 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum06610092007-08-16 21:02:22 +00001226 b = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1227 PyUnicode_GET_SIZE(unicode),
1228 NULL);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001229 if (!b)
1230 return NULL;
1231 v = PyString_FromStringAndSize(PyBytes_AsString(b),
1232 PyBytes_Size(b));
1233 Py_DECREF(b);
Guido van Rossume7a0d392007-07-12 07:53:00 +00001234 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001235 return v;
1236}
1237
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001238PyObject*
1239PyUnicode_DecodeFSDefault(const char *s)
1240{
1241 Py_ssize_t size = (Py_ssize_t)strlen(s);
1242
1243 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1244 can be undefined. If it is case, decode using UTF-8. The following assumes
1245 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1246 bootstrapping process where the codecs aren't ready yet.
1247 */
1248 if (Py_FileSystemDefaultEncoding) {
1249#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1250 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs")) {
1251 return PyUnicode_DecodeMBCS(s, size, "replace");
1252 }
1253#elif defined(__APPLE__)
1254 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8")) {
1255 return PyUnicode_DecodeUTF8(s, size, "replace");
1256 }
1257#endif
1258 return PyUnicode_Decode(s, size,
1259 Py_FileSystemDefaultEncoding,
1260 "replace");
1261 }
1262 else {
1263 return PyUnicode_DecodeUTF8(s, size, "replace");
1264 }
1265}
1266
Martin v. Löwis5b222132007-06-10 09:51:05 +00001267char*
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001268PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001269{
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001270 PyObject *str8;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001271 if (!PyUnicode_Check(unicode)) {
1272 PyErr_BadArgument();
1273 return NULL;
1274 }
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001275 str8 = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1276 if (str8 == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001277 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001278 if (psize != NULL)
1279 *psize = PyString_GET_SIZE(str8);
1280 return PyString_AS_STRING(str8);
1281}
1282
1283char*
1284PyUnicode_AsString(PyObject *unicode)
1285{
1286 return PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001287}
1288
Guido van Rossumd57fd912000-03-10 22:53:23 +00001289Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1290{
1291 if (!PyUnicode_Check(unicode)) {
1292 PyErr_BadArgument();
1293 goto onError;
1294 }
1295 return PyUnicode_AS_UNICODE(unicode);
1296
1297 onError:
1298 return NULL;
1299}
1300
Martin v. Löwis18e16552006-02-15 17:27:45 +00001301Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001302{
1303 if (!PyUnicode_Check(unicode)) {
1304 PyErr_BadArgument();
1305 goto onError;
1306 }
1307 return PyUnicode_GET_SIZE(unicode);
1308
1309 onError:
1310 return -1;
1311}
1312
Thomas Wouters78890102000-07-22 19:25:51 +00001313const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001314{
1315 return unicode_default_encoding;
1316}
1317
1318int PyUnicode_SetDefaultEncoding(const char *encoding)
1319{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001320 if (strcmp(encoding, unicode_default_encoding) != 0) {
1321 PyErr_Format(PyExc_ValueError,
1322 "Can only set default encoding to %s",
1323 unicode_default_encoding);
1324 return -1;
1325 }
Fred Drakee4315f52000-05-09 19:53:39 +00001326 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001327}
1328
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001329/* error handling callback helper:
1330 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001331 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001332 and adjust various state variables.
1333 return 0 on success, -1 on error
1334*/
1335
1336static
1337int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1338 const char *encoding, const char *reason,
Walter Dörwalda651d3d2007-08-30 15:29:21 +00001339 const char **input, const char **inend, Py_ssize_t *startinpos,
1340 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001341 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001342{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001343 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001344
1345 PyObject *restuple = NULL;
1346 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001347 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001348 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001349 Py_ssize_t requiredsize;
1350 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001351 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001352 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001353 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001354 int res = -1;
1355
1356 if (*errorHandler == NULL) {
1357 *errorHandler = PyCodec_LookupError(errors);
1358 if (*errorHandler == NULL)
1359 goto onError;
1360 }
1361
1362 if (*exceptionObject == NULL) {
1363 *exceptionObject = PyUnicodeDecodeError_Create(
Walter Dörwalde78178e2007-07-30 13:31:40 +00001364 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001365 if (*exceptionObject == NULL)
1366 goto onError;
1367 }
1368 else {
1369 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1370 goto onError;
1371 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1372 goto onError;
1373 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1374 goto onError;
1375 }
1376
1377 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1378 if (restuple == NULL)
1379 goto onError;
1380 if (!PyTuple_Check(restuple)) {
1381 PyErr_Format(PyExc_TypeError, &argparse[4]);
1382 goto onError;
1383 }
1384 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1385 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001386
1387 /* Copy back the bytes variables, which might have been modified by the
1388 callback */
1389 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1390 if (!inputobj)
1391 goto onError;
1392 if (!PyBytes_Check(inputobj)) {
1393 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
1394 }
1395 *input = PyBytes_AS_STRING(inputobj);
1396 insize = PyBytes_GET_SIZE(inputobj);
1397 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001398 /* we can DECREF safely, as the exception has another reference,
1399 so the object won't go away. */
1400 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001401
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001402 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001403 newpos = insize+newpos;
1404 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001405 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001406 goto onError;
1407 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001408
1409 /* need more space? (at least enough for what we
1410 have+the replacement+the rest of the string (starting
1411 at the new input position), so we won't have to check space
1412 when there are no errors in the rest of the string) */
1413 repptr = PyUnicode_AS_UNICODE(repunicode);
1414 repsize = PyUnicode_GET_SIZE(repunicode);
1415 requiredsize = *outpos + repsize + insize-newpos;
1416 if (requiredsize > outsize) {
1417 if (requiredsize<2*outsize)
1418 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001419 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001420 goto onError;
1421 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1422 }
1423 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001424 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001425 Py_UNICODE_COPY(*outptr, repptr, repsize);
1426 *outptr += repsize;
1427 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001428
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001429 /* we made it! */
1430 res = 0;
1431
1432 onError:
1433 Py_XDECREF(restuple);
1434 return res;
1435}
1436
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001437/* --- UTF-7 Codec -------------------------------------------------------- */
1438
1439/* see RFC2152 for details */
1440
/* Classification of the 128 ASCII characters for the UTF-7 codec,
   indexed by character code.  The table is only ever read, hence const. */
static
const char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
           0 - not special
           1 - special
           2 - whitespace (optional)
           3 - RFC2152 Set O (optional) */
    /* 0x00 - 0x0f: control characters; TAB, LF and CR are whitespace */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f: control characters */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f: space ! " # $ % & ' ( ) * + , - . / */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f: 0-9 : ; < = > ? */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f: @ A-O */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f: P-Z [ \ ] ^ _ */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f: ` a-o */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f: p-z { | } ~ DEL */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};
1459
/* SPECIAL(c, encodeO, encodeWS): true if character c cannot be written
   directly in UTF-7.  Characters outside 1..127 are always special;
   whitespace and RFC2152 Set O characters are special only when
   encodeWS / encodeO are set.

   Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* B64(n): base64 digit for the low six bits of n. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true if c is a base64 digit.  Explicit ASCII ranges are
   used instead of isalnum(): the argument may be a Py_UNICODE value
   outside the unsigned char range (undefined for <ctype.h> functions),
   and the answer must not depend on the current locale. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* UB64(c): numeric value (0..63) of the base64 digit c; only meaningful
   when B64CHAR(c) is true. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE(out, ch, bits): emit base64 digits through `out` for as long as
   at least six of the `bits` pending bits in `ch` remain; `out` and
   `bits` are updated in place. */
#define ENCODE(out, ch, bits)                   \
    while ((bits) >= 6) {                       \
        *(out)++ = B64((ch) >> ((bits)-6));     \
        (bits) -= 6;                            \
    }

/* DECODE(out, ch, bits, surrogate): store 16-bit units through `out` for
   as long as at least sixteen of the `bits` pending bits in `ch` remain.
   Expands to code that uses the variable `errmsg` and the label
   `utf7Error` of the enclosing function. */
#define DECODE(out, ch, bits, surrogate)                                \
    while ((bits) >= 16) {                                              \
        Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits)-16)) & 0xffff); \
        (bits) -= 16;                                                   \
        if (surrogate) {                                                \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            (surrogate) = 0;                                            \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */                              \
            (surrogate) = 1;                                            \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *(out)++ = outCh;                                           \
        }                                                               \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001502
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001503PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001504 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001505 const char *errors)
1506{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001507 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001508 Py_ssize_t startinpos;
1509 Py_ssize_t endinpos;
1510 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001511 const char *e;
1512 PyUnicodeObject *unicode;
1513 Py_UNICODE *p;
1514 const char *errmsg = "";
1515 int inShift = 0;
1516 unsigned int bitsleft = 0;
1517 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001518 int surrogate = 0;
1519 PyObject *errorHandler = NULL;
1520 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001521
1522 unicode = _PyUnicode_New(size);
1523 if (!unicode)
1524 return NULL;
1525 if (size == 0)
1526 return (PyObject *)unicode;
1527
1528 p = unicode->str;
1529 e = s + size;
1530
1531 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001532 Py_UNICODE ch;
1533 restart:
1534 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001535
1536 if (inShift) {
1537 if ((ch == '-') || !B64CHAR(ch)) {
1538 inShift = 0;
1539 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001540
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001541 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1542 if (bitsleft >= 6) {
1543 /* The shift sequence has a partial character in it. If
1544 bitsleft < 6 then we could just classify it as padding
1545 but that is not the case here */
1546
1547 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001548 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001549 }
1550 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001551 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001552 here so indicate the potential of a misencoded character. */
1553
1554 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1555 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1556 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001557 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001558 }
1559
1560 if (ch == '-') {
1561 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001562 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001563 inShift = 1;
1564 }
1565 } else if (SPECIAL(ch,0,0)) {
1566 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001567 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001568 } else {
1569 *p++ = ch;
1570 }
1571 } else {
1572 charsleft = (charsleft << 6) | UB64(ch);
1573 bitsleft += 6;
1574 s++;
1575 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1576 }
1577 }
1578 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001579 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001580 s++;
1581 if (s < e && *s == '-') {
1582 s++;
1583 *p++ = '+';
1584 } else
1585 {
1586 inShift = 1;
1587 bitsleft = 0;
1588 }
1589 }
1590 else if (SPECIAL(ch,0,0)) {
Walter Dörwald2b65c752007-08-30 15:35:26 +00001591 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001592 errmsg = "unexpected special character";
1593 s++;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001594 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001595 }
1596 else {
1597 *p++ = ch;
1598 s++;
1599 }
1600 continue;
1601 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001602 outpos = p-PyUnicode_AS_UNICODE(unicode);
1603 endinpos = s-starts;
1604 if (unicode_decode_call_errorhandler(
1605 errors, &errorHandler,
1606 "utf7", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001607 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001608 (PyObject **)&unicode, &outpos, &p))
1609 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001610 }
1611
1612 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001613 outpos = p-PyUnicode_AS_UNICODE(unicode);
1614 endinpos = size;
1615 if (unicode_decode_call_errorhandler(
1616 errors, &errorHandler,
1617 "utf7", "unterminated shift sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00001618 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001619 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001620 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001621 if (s < e)
1622 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001623 }
1624
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001625 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001626 goto onError;
1627
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001628 Py_XDECREF(errorHandler);
1629 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001630 return (PyObject *)unicode;
1631
1632onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001633 Py_XDECREF(errorHandler);
1634 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001635 Py_DECREF(unicode);
1636 return NULL;
1637}
1638
1639
1640PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001641 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001642 int encodeSetO,
1643 int encodeWhiteSpace,
1644 const char *errors)
1645{
1646 PyObject *v;
1647 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001648 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001649 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001650 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001651 unsigned int bitsleft = 0;
1652 unsigned long charsleft = 0;
1653 char * out;
1654 char * start;
1655
1656 if (size == 0)
Walter Dörwald51ab4142007-05-05 14:43:36 +00001657 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001658
Walter Dörwald51ab4142007-05-05 14:43:36 +00001659 v = PyBytes_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001660 if (v == NULL)
1661 return NULL;
1662
Walter Dörwald51ab4142007-05-05 14:43:36 +00001663 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001664 for (;i < size; ++i) {
1665 Py_UNICODE ch = s[i];
1666
1667 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001668 if (ch == '+') {
1669 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001670 *out++ = '-';
1671 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1672 charsleft = ch;
1673 bitsleft = 16;
1674 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001675 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001676 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001677 } else {
1678 *out++ = (char) ch;
1679 }
1680 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001681 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1682 *out++ = B64(charsleft << (6-bitsleft));
1683 charsleft = 0;
1684 bitsleft = 0;
1685 /* Characters not in the BASE64 set implicitly unshift the sequence
1686 so no '-' is required, except if the character is itself a '-' */
1687 if (B64CHAR(ch) || ch == '-') {
1688 *out++ = '-';
1689 }
1690 inShift = 0;
1691 *out++ = (char) ch;
1692 } else {
1693 bitsleft += 16;
1694 charsleft = (charsleft << 16) | ch;
1695 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1696
1697 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001698 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001699 or '-' then the shift sequence will be terminated implicitly and we
1700 don't have to insert a '-'. */
1701
1702 if (bitsleft == 0) {
1703 if (i + 1 < size) {
1704 Py_UNICODE ch2 = s[i+1];
1705
1706 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001707
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001708 } else if (B64CHAR(ch2) || ch2 == '-') {
1709 *out++ = '-';
1710 inShift = 0;
1711 } else {
1712 inShift = 0;
1713 }
1714
1715 }
1716 else {
1717 *out++ = '-';
1718 inShift = 0;
1719 }
1720 }
Tim Petersced69f82003-09-16 20:30:58 +00001721 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001722 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001723 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001724 if (bitsleft) {
1725 *out++= B64(charsleft << (6-bitsleft) );
1726 *out++ = '-';
1727 }
1728
Walter Dörwald51ab4142007-05-05 14:43:36 +00001729 if (PyBytes_Resize(v, out - start)) {
1730 Py_DECREF(v);
1731 return NULL;
1732 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001733 return v;
1734}
1735
1736#undef SPECIAL
1737#undef B64
1738#undef B64CHAR
1739#undef UB64
1740#undef ENCODE
1741#undef DECODE
1742
Guido van Rossumd57fd912000-03-10 22:53:23 +00001743/* --- UTF-8 Codec -------------------------------------------------------- */
1744
/* Map a UTF-8 lead (prefix) byte to the total length in bytes of the
   sequence it introduces.  Zero means the byte is not a legal prefix.
   See RFC 2279 for details.

   The table is only ever read, so it is declared const. */
static const char utf8_code_length[256] = {
    /* 0x00-0x7F: single-byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xBF: continuation bytes, illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xFD: four-, five- and six-byte sequences; 0xFE/0xFF illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1766
Guido van Rossumd57fd912000-03-10 22:53:23 +00001767PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001768 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001769 const char *errors)
1770{
Walter Dörwald69652032004-09-07 20:24:22 +00001771 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1772}
1773
1774PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001775 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001776 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001777 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001778{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001779 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001780 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001781 Py_ssize_t startinpos;
1782 Py_ssize_t endinpos;
1783 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001784 const char *e;
1785 PyUnicodeObject *unicode;
1786 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001787 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001788 PyObject *errorHandler = NULL;
1789 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001790
1791 /* Note: size will always be longer than the resulting Unicode
1792 character count */
1793 unicode = _PyUnicode_New(size);
1794 if (!unicode)
1795 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001796 if (size == 0) {
1797 if (consumed)
1798 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001799 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001800 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001801
1802 /* Unpack UTF-8 encoded data */
1803 p = unicode->str;
1804 e = s + size;
1805
1806 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001807 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001808
1809 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001810 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001811 s++;
1812 continue;
1813 }
1814
1815 n = utf8_code_length[ch];
1816
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001817 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001818 if (consumed)
1819 break;
1820 else {
1821 errmsg = "unexpected end of data";
1822 startinpos = s-starts;
1823 endinpos = size;
1824 goto utf8Error;
1825 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001826 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001827
1828 switch (n) {
1829
1830 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001831 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001832 startinpos = s-starts;
1833 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001834 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001835
1836 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001837 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001838 startinpos = s-starts;
1839 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001840 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001841
1842 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001843 if ((s[1] & 0xc0) != 0x80) {
1844 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001845 startinpos = s-starts;
1846 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001847 goto utf8Error;
1848 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001849 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001850 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001851 startinpos = s-starts;
1852 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001853 errmsg = "illegal encoding";
1854 goto utf8Error;
1855 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001856 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001857 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001858 break;
1859
1860 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001861 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001862 (s[2] & 0xc0) != 0x80) {
1863 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001864 startinpos = s-starts;
1865 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001866 goto utf8Error;
1867 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001868 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001869 if (ch < 0x0800) {
1870 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001871 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001872
1873 XXX For wide builds (UCS-4) we should probably try
1874 to recombine the surrogates into a single code
1875 unit.
1876 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001877 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001878 startinpos = s-starts;
1879 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001880 goto utf8Error;
1881 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001882 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001883 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001884 break;
1885
1886 case 4:
1887 if ((s[1] & 0xc0) != 0x80 ||
1888 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001889 (s[3] & 0xc0) != 0x80) {
1890 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001891 startinpos = s-starts;
1892 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001893 goto utf8Error;
1894 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001895 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1896 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1897 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001898 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001899 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001900 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001901 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001902 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001903 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001904 startinpos = s-starts;
1905 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001906 goto utf8Error;
1907 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001908#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001909 *p++ = (Py_UNICODE)ch;
1910#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001911 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001912
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001913 /* translate from 10000..10FFFF to 0..FFFF */
1914 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001915
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001916 /* high surrogate = top 10 bits added to D800 */
1917 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001918
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001919 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001920 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001921#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001922 break;
1923
1924 default:
1925 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001926 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001927 startinpos = s-starts;
1928 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001929 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001930 }
1931 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001932 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001933
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001934 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001935 outpos = p-PyUnicode_AS_UNICODE(unicode);
1936 if (unicode_decode_call_errorhandler(
1937 errors, &errorHandler,
1938 "utf8", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001939 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001940 (PyObject **)&unicode, &outpos, &p))
1941 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001942 }
Walter Dörwald69652032004-09-07 20:24:22 +00001943 if (consumed)
1944 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001945
1946 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001947 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001948 goto onError;
1949
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001950 Py_XDECREF(errorHandler);
1951 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001952 return (PyObject *)unicode;
1953
1954onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001955 Py_XDECREF(errorHandler);
1956 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001957 Py_DECREF(unicode);
1958 return NULL;
1959}
1960
Tim Peters602f7402002-04-27 18:03:26 +00001961/* Allocation strategy: if the string is short, convert into a stack buffer
1962 and allocate exactly as much space needed at the end. Else allocate the
1963 maximum possible needed (4 result bytes per Unicode character), and return
1964 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001965*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001966PyObject *
1967PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001968 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001969 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001970{
Tim Peters602f7402002-04-27 18:03:26 +00001971#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001972
Martin v. Löwis18e16552006-02-15 17:27:45 +00001973 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001974 PyObject *v; /* result string object */
1975 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001976 Py_ssize_t nallocated; /* number of result bytes allocated */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001977 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001978 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001979
Tim Peters602f7402002-04-27 18:03:26 +00001980 assert(s != NULL);
1981 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001982
Tim Peters602f7402002-04-27 18:03:26 +00001983 if (size <= MAX_SHORT_UNICHARS) {
1984 /* Write into the stack buffer; nallocated can't overflow.
1985 * At the end, we'll allocate exactly as much heap space as it
1986 * turns out we need.
1987 */
1988 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1989 v = NULL; /* will allocate after we're done */
1990 p = stackbuf;
1991 }
1992 else {
1993 /* Overallocate on the heap, and give the excess back at the end. */
1994 nallocated = size * 4;
1995 if (nallocated / 4 != size) /* overflow! */
1996 return PyErr_NoMemory();
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001997 v = PyBytes_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00001998 if (v == NULL)
1999 return NULL;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002000 p = PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002001 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002002
Tim Peters602f7402002-04-27 18:03:26 +00002003 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002004 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002005
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002006 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002007 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002008 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002009
Guido van Rossumd57fd912000-03-10 22:53:23 +00002010 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002011 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002012 *p++ = (char)(0xc0 | (ch >> 6));
2013 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002014 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002015 else {
Tim Peters602f7402002-04-27 18:03:26 +00002016 /* Encode UCS2 Unicode ordinals */
2017 if (ch < 0x10000) {
2018 /* Special case: check for high surrogate */
2019 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2020 Py_UCS4 ch2 = s[i];
2021 /* Check for low surrogate and combine the two to
2022 form a UCS4 value */
2023 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002024 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002025 i++;
2026 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002027 }
Tim Peters602f7402002-04-27 18:03:26 +00002028 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002029 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002030 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002031 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2032 *p++ = (char)(0x80 | (ch & 0x3f));
2033 continue;
2034 }
2035encodeUCS4:
2036 /* Encode UCS4 Unicode ordinals */
2037 *p++ = (char)(0xf0 | (ch >> 18));
2038 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2039 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2040 *p++ = (char)(0x80 | (ch & 0x3f));
2041 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002042 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002043
Tim Peters602f7402002-04-27 18:03:26 +00002044 if (v == NULL) {
2045 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002046 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002047 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002048 v = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002049 }
2050 else {
2051 /* Cut back to size actually needed. */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002052 nneeded = p - PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002053 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002054 PyBytes_Resize(v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002055 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002056 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002057
Tim Peters602f7402002-04-27 18:03:26 +00002058#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002059}
2060
Guido van Rossumd57fd912000-03-10 22:53:23 +00002061PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2062{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002063 if (!PyUnicode_Check(unicode)) {
2064 PyErr_BadArgument();
2065 return NULL;
2066 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002067 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2068 PyUnicode_GET_SIZE(unicode),
2069 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002070}
2071
Walter Dörwald41980ca2007-08-16 21:55:45 +00002072/* --- UTF-32 Codec ------------------------------------------------------- */
2073
2074PyObject *
2075PyUnicode_DecodeUTF32(const char *s,
2076 Py_ssize_t size,
2077 const char *errors,
2078 int *byteorder)
2079{
2080 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2081}
2082
2083PyObject *
2084PyUnicode_DecodeUTF32Stateful(const char *s,
2085 Py_ssize_t size,
2086 const char *errors,
2087 int *byteorder,
2088 Py_ssize_t *consumed)
2089{
2090 const char *starts = s;
2091 Py_ssize_t startinpos;
2092 Py_ssize_t endinpos;
2093 Py_ssize_t outpos;
2094 PyUnicodeObject *unicode;
2095 Py_UNICODE *p;
2096#ifndef Py_UNICODE_WIDE
2097 int i, pairs;
2098#else
2099 const int pairs = 0;
2100#endif
2101 const unsigned char *q, *e;
2102 int bo = 0; /* assume native ordering by default */
2103 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002104 /* Offsets from q for retrieving bytes in the right order. */
2105#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2106 int iorder[] = {0, 1, 2, 3};
2107#else
2108 int iorder[] = {3, 2, 1, 0};
2109#endif
2110 PyObject *errorHandler = NULL;
2111 PyObject *exc = NULL;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002112 /* On narrow builds we split characters outside the BMP into two
2113 codepoints => count how much extra space we need. */
2114#ifndef Py_UNICODE_WIDE
2115 for (i = pairs = 0; i < size/4; i++)
2116 if (((Py_UCS4 *)s)[i] >= 0x10000)
2117 pairs++;
2118#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00002119
2120 /* This might be one to much, because of a BOM */
2121 unicode = _PyUnicode_New((size+3)/4+pairs);
2122 if (!unicode)
2123 return NULL;
2124 if (size == 0)
2125 return (PyObject *)unicode;
2126
2127 /* Unpack UTF-32 encoded data */
2128 p = unicode->str;
2129 q = (unsigned char *)s;
2130 e = q + size;
2131
2132 if (byteorder)
2133 bo = *byteorder;
2134
2135 /* Check for BOM marks (U+FEFF) in the input and adjust current
2136 byte order setting accordingly. In native mode, the leading BOM
2137 mark is skipped, in all other modes, it is copied to the output
2138 stream as-is (giving a ZWNBSP character). */
2139 if (bo == 0) {
2140 if (size >= 4) {
2141 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2142 (q[iorder[1]] << 8) | q[iorder[0]];
2143#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2144 if (bom == 0x0000FEFF) {
2145 q += 4;
2146 bo = -1;
2147 }
2148 else if (bom == 0xFFFE0000) {
2149 q += 4;
2150 bo = 1;
2151 }
2152#else
2153 if (bom == 0x0000FEFF) {
2154 q += 4;
2155 bo = 1;
2156 }
2157 else if (bom == 0xFFFE0000) {
2158 q += 4;
2159 bo = -1;
2160 }
2161#endif
2162 }
2163 }
2164
2165 if (bo == -1) {
2166 /* force LE */
2167 iorder[0] = 0;
2168 iorder[1] = 1;
2169 iorder[2] = 2;
2170 iorder[3] = 3;
2171 }
2172 else if (bo == 1) {
2173 /* force BE */
2174 iorder[0] = 3;
2175 iorder[1] = 2;
2176 iorder[2] = 1;
2177 iorder[3] = 0;
2178 }
2179
2180 while (q < e) {
2181 Py_UCS4 ch;
2182 /* remaining bytes at the end? (size should be divisible by 4) */
2183 if (e-q<4) {
2184 if (consumed)
2185 break;
2186 errmsg = "truncated data";
2187 startinpos = ((const char *)q)-starts;
2188 endinpos = ((const char *)e)-starts;
2189 goto utf32Error;
2190 /* The remaining input chars are ignored if the callback
2191 chooses to skip the input */
2192 }
2193 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2194 (q[iorder[1]] << 8) | q[iorder[0]];
2195
2196 if (ch >= 0x110000)
2197 {
2198 errmsg = "codepoint not in range(0x110000)";
2199 startinpos = ((const char *)q)-starts;
2200 endinpos = startinpos+4;
2201 goto utf32Error;
2202 }
2203#ifndef Py_UNICODE_WIDE
2204 if (ch >= 0x10000)
2205 {
2206 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2207 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2208 }
2209 else
2210#endif
2211 *p++ = ch;
2212 q += 4;
2213 continue;
2214 utf32Error:
2215 outpos = p-PyUnicode_AS_UNICODE(unicode);
2216 if (unicode_decode_call_errorhandler(
2217 errors, &errorHandler,
2218 "utf32", errmsg,
2219 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2220 (PyObject **)&unicode, &outpos, &p))
2221 goto onError;
2222 }
2223
2224 if (byteorder)
2225 *byteorder = bo;
2226
2227 if (consumed)
2228 *consumed = (const char *)q-starts;
2229
2230 /* Adjust length */
2231 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2232 goto onError;
2233
2234 Py_XDECREF(errorHandler);
2235 Py_XDECREF(exc);
2236 return (PyObject *)unicode;
2237
2238onError:
2239 Py_DECREF(unicode);
2240 Py_XDECREF(errorHandler);
2241 Py_XDECREF(exc);
2242 return NULL;
2243}
2244
2245PyObject *
2246PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2247 Py_ssize_t size,
2248 const char *errors,
2249 int byteorder)
2250{
2251 PyObject *v;
2252 unsigned char *p;
2253#ifndef Py_UNICODE_WIDE
2254 int i, pairs;
2255#else
2256 const int pairs = 0;
2257#endif
2258 /* Offsets from p for storing byte pairs in the right order. */
2259#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2260 int iorder[] = {0, 1, 2, 3};
2261#else
2262 int iorder[] = {3, 2, 1, 0};
2263#endif
2264
2265#define STORECHAR(CH) \
2266 do { \
2267 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2268 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2269 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2270 p[iorder[0]] = (CH) & 0xff; \
2271 p += 4; \
2272 } while(0)
2273
2274 /* In narrow builds we can output surrogate pairs as one codepoint,
2275 so we need less space. */
2276#ifndef Py_UNICODE_WIDE
2277 for (i = pairs = 0; i < size-1; i++)
2278 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2279 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2280 pairs++;
2281#endif
2282 v = PyBytes_FromStringAndSize(NULL,
2283 4 * (size - pairs + (byteorder == 0)));
2284 if (v == NULL)
2285 return NULL;
2286
2287 p = (unsigned char *)PyBytes_AS_STRING(v);
2288 if (byteorder == 0)
2289 STORECHAR(0xFEFF);
2290 if (size == 0)
2291 return v;
2292
2293 if (byteorder == -1) {
2294 /* force LE */
2295 iorder[0] = 0;
2296 iorder[1] = 1;
2297 iorder[2] = 2;
2298 iorder[3] = 3;
2299 }
2300 else if (byteorder == 1) {
2301 /* force BE */
2302 iorder[0] = 3;
2303 iorder[1] = 2;
2304 iorder[2] = 1;
2305 iorder[3] = 0;
2306 }
2307
2308 while (size-- > 0) {
2309 Py_UCS4 ch = *s++;
2310#ifndef Py_UNICODE_WIDE
2311 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2312 Py_UCS4 ch2 = *s;
2313 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2314 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2315 s++;
2316 size--;
2317 }
2318 }
2319#endif
2320 STORECHAR(ch);
2321 }
2322 return v;
2323#undef STORECHAR
2324}
2325
2326PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2327{
2328 if (!PyUnicode_Check(unicode)) {
2329 PyErr_BadArgument();
2330 return NULL;
2331 }
2332 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2333 PyUnicode_GET_SIZE(unicode),
2334 NULL,
2335 0);
2336}
2337
Guido van Rossumd57fd912000-03-10 22:53:23 +00002338/* --- UTF-16 Codec ------------------------------------------------------- */
2339
Tim Peters772747b2001-08-09 22:21:55 +00002340PyObject *
2341PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002342 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002343 const char *errors,
2344 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002345{
Walter Dörwald69652032004-09-07 20:24:22 +00002346 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2347}
2348
2349PyObject *
2350PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002351 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002352 const char *errors,
2353 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002354 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002355{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002356 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002357 Py_ssize_t startinpos;
2358 Py_ssize_t endinpos;
2359 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002360 PyUnicodeObject *unicode;
2361 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002362 const unsigned char *q, *e;
2363 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002364 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002365 /* Offsets from q for retrieving byte pairs in the right order. */
2366#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2367 int ihi = 1, ilo = 0;
2368#else
2369 int ihi = 0, ilo = 1;
2370#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002371 PyObject *errorHandler = NULL;
2372 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002373
2374 /* Note: size will always be longer than the resulting Unicode
2375 character count */
2376 unicode = _PyUnicode_New(size);
2377 if (!unicode)
2378 return NULL;
2379 if (size == 0)
2380 return (PyObject *)unicode;
2381
2382 /* Unpack UTF-16 encoded data */
2383 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002384 q = (unsigned char *)s;
2385 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002386
2387 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002388 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002389
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002390 /* Check for BOM marks (U+FEFF) in the input and adjust current
2391 byte order setting accordingly. In native mode, the leading BOM
2392 mark is skipped, in all other modes, it is copied to the output
2393 stream as-is (giving a ZWNBSP character). */
2394 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002395 if (size >= 2) {
2396 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002397#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002398 if (bom == 0xFEFF) {
2399 q += 2;
2400 bo = -1;
2401 }
2402 else if (bom == 0xFFFE) {
2403 q += 2;
2404 bo = 1;
2405 }
Tim Petersced69f82003-09-16 20:30:58 +00002406#else
Walter Dörwald69652032004-09-07 20:24:22 +00002407 if (bom == 0xFEFF) {
2408 q += 2;
2409 bo = 1;
2410 }
2411 else if (bom == 0xFFFE) {
2412 q += 2;
2413 bo = -1;
2414 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002415#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002416 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002417 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002418
Tim Peters772747b2001-08-09 22:21:55 +00002419 if (bo == -1) {
2420 /* force LE */
2421 ihi = 1;
2422 ilo = 0;
2423 }
2424 else if (bo == 1) {
2425 /* force BE */
2426 ihi = 0;
2427 ilo = 1;
2428 }
2429
2430 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002431 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002432 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002433 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002434 if (consumed)
2435 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002436 errmsg = "truncated data";
2437 startinpos = ((const char *)q)-starts;
2438 endinpos = ((const char *)e)-starts;
2439 goto utf16Error;
2440 /* The remaining input chars are ignored if the callback
2441 chooses to skip the input */
2442 }
2443 ch = (q[ihi] << 8) | q[ilo];
2444
Tim Peters772747b2001-08-09 22:21:55 +00002445 q += 2;
2446
Guido van Rossumd57fd912000-03-10 22:53:23 +00002447 if (ch < 0xD800 || ch > 0xDFFF) {
2448 *p++ = ch;
2449 continue;
2450 }
2451
2452 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002453 if (q >= e) {
2454 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002455 startinpos = (((const char *)q)-2)-starts;
2456 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002457 goto utf16Error;
2458 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002459 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002460 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2461 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002462 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002463#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002464 *p++ = ch;
2465 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002466#else
2467 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002468#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002469 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002470 }
2471 else {
2472 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002473 startinpos = (((const char *)q)-4)-starts;
2474 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002475 goto utf16Error;
2476 }
2477
Guido van Rossumd57fd912000-03-10 22:53:23 +00002478 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002479 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002480 startinpos = (((const char *)q)-2)-starts;
2481 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002482 /* Fall through to report the error */
2483
2484 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002485 outpos = p-PyUnicode_AS_UNICODE(unicode);
2486 if (unicode_decode_call_errorhandler(
2487 errors, &errorHandler,
2488 "utf16", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002489 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002490 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002491 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002492 }
2493
2494 if (byteorder)
2495 *byteorder = bo;
2496
Walter Dörwald69652032004-09-07 20:24:22 +00002497 if (consumed)
2498 *consumed = (const char *)q-starts;
2499
Guido van Rossumd57fd912000-03-10 22:53:23 +00002500 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002501 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002502 goto onError;
2503
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002504 Py_XDECREF(errorHandler);
2505 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002506 return (PyObject *)unicode;
2507
2508onError:
2509 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002510 Py_XDECREF(errorHandler);
2511 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002512 return NULL;
2513}
2514
Tim Peters772747b2001-08-09 22:21:55 +00002515PyObject *
2516PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002517 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002518 const char *errors,
2519 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002520{
2521 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002522 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002523#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00002524 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002525#else
2526 const int pairs = 0;
2527#endif
Tim Peters772747b2001-08-09 22:21:55 +00002528 /* Offsets from p for storing byte pairs in the right order. */
2529#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2530 int ihi = 1, ilo = 0;
2531#else
2532 int ihi = 0, ilo = 1;
2533#endif
2534
2535#define STORECHAR(CH) \
2536 do { \
2537 p[ihi] = ((CH) >> 8) & 0xff; \
2538 p[ilo] = (CH) & 0xff; \
2539 p += 2; \
2540 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002541
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002542#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002543 for (i = pairs = 0; i < size; i++)
2544 if (s[i] >= 0x10000)
2545 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002546#endif
Walter Dörwald3cc34522007-05-04 10:48:27 +00002547 v = PyBytes_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002548 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002549 if (v == NULL)
2550 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002551
Walter Dörwald3cc34522007-05-04 10:48:27 +00002552 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002553 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002554 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002555 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002556 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002557
2558 if (byteorder == -1) {
2559 /* force LE */
2560 ihi = 1;
2561 ilo = 0;
2562 }
2563 else if (byteorder == 1) {
2564 /* force BE */
2565 ihi = 0;
2566 ilo = 1;
2567 }
2568
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002569 while (size-- > 0) {
2570 Py_UNICODE ch = *s++;
2571 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002572#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002573 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002574 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2575 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002576 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002577#endif
Tim Peters772747b2001-08-09 22:21:55 +00002578 STORECHAR(ch);
2579 if (ch2)
2580 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002581 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002582 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002583#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002584}
2585
2586PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2587{
2588 if (!PyUnicode_Check(unicode)) {
2589 PyErr_BadArgument();
2590 return NULL;
2591 }
2592 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2593 PyUnicode_GET_SIZE(unicode),
2594 NULL,
2595 0);
2596}
2597
2598/* --- Unicode Escape Codec ----------------------------------------------- */
2599
Fredrik Lundh06d12682001-01-24 07:59:11 +00002600static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002601
Guido van Rossumd57fd912000-03-10 22:53:23 +00002602PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002603 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002604 const char *errors)
2605{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002606 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002607 Py_ssize_t startinpos;
2608 Py_ssize_t endinpos;
2609 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002610 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002611 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002612 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002613 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002614 char* message;
2615 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002616 PyObject *errorHandler = NULL;
2617 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002618
Guido van Rossumd57fd912000-03-10 22:53:23 +00002619 /* Escaped strings will always be longer than the resulting
2620 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002621 length after conversion to the true value.
2622 (but if the error callback returns a long replacement string
2623 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002624 v = _PyUnicode_New(size);
2625 if (v == NULL)
2626 goto onError;
2627 if (size == 0)
2628 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002629
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002630 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002631 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002632
Guido van Rossumd57fd912000-03-10 22:53:23 +00002633 while (s < end) {
2634 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002635 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002636 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002637
2638 /* Non-escape characters are interpreted as Unicode ordinals */
2639 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002640 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002641 continue;
2642 }
2643
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002644 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002645 /* \ - Escapes */
2646 s++;
2647 switch (*s++) {
2648
2649 /* \x escapes */
2650 case '\n': break;
2651 case '\\': *p++ = '\\'; break;
2652 case '\'': *p++ = '\''; break;
2653 case '\"': *p++ = '\"'; break;
2654 case 'b': *p++ = '\b'; break;
2655 case 'f': *p++ = '\014'; break; /* FF */
2656 case 't': *p++ = '\t'; break;
2657 case 'n': *p++ = '\n'; break;
2658 case 'r': *p++ = '\r'; break;
2659 case 'v': *p++ = '\013'; break; /* VT */
2660 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2661
2662 /* \OOO (octal) escapes */
2663 case '0': case '1': case '2': case '3':
2664 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002665 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002666 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002667 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002668 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002669 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002670 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002671 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002672 break;
2673
Fredrik Lundhccc74732001-02-18 22:13:49 +00002674 /* hex escapes */
2675 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002676 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002677 digits = 2;
2678 message = "truncated \\xXX escape";
2679 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002680
Fredrik Lundhccc74732001-02-18 22:13:49 +00002681 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002682 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002683 digits = 4;
2684 message = "truncated \\uXXXX escape";
2685 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002686
Fredrik Lundhccc74732001-02-18 22:13:49 +00002687 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002688 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002689 digits = 8;
2690 message = "truncated \\UXXXXXXXX escape";
2691 hexescape:
2692 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002693 outpos = p-PyUnicode_AS_UNICODE(v);
2694 if (s+digits>end) {
2695 endinpos = size;
2696 if (unicode_decode_call_errorhandler(
2697 errors, &errorHandler,
2698 "unicodeescape", "end of string in escape sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002699 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002700 (PyObject **)&v, &outpos, &p))
2701 goto onError;
2702 goto nextByte;
2703 }
2704 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002705 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002706 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002707 endinpos = (s+i+1)-starts;
2708 if (unicode_decode_call_errorhandler(
2709 errors, &errorHandler,
2710 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002711 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002712 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002713 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002714 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002715 }
2716 chr = (chr<<4) & ~0xF;
2717 if (c >= '0' && c <= '9')
2718 chr += c - '0';
2719 else if (c >= 'a' && c <= 'f')
2720 chr += 10 + c - 'a';
2721 else
2722 chr += 10 + c - 'A';
2723 }
2724 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002725 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002726 /* _decoding_error will have already written into the
2727 target buffer. */
2728 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002729 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002730 /* when we get here, chr is a 32-bit unicode character */
2731 if (chr <= 0xffff)
2732 /* UCS-2 character */
2733 *p++ = (Py_UNICODE) chr;
2734 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002735 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002736 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002737#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002738 *p++ = chr;
2739#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002740 chr -= 0x10000L;
2741 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002742 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002743#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002744 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002745 endinpos = s-starts;
2746 outpos = p-PyUnicode_AS_UNICODE(v);
2747 if (unicode_decode_call_errorhandler(
2748 errors, &errorHandler,
2749 "unicodeescape", "illegal Unicode character",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002750 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002751 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002752 goto onError;
2753 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002754 break;
2755
2756 /* \N{name} */
2757 case 'N':
2758 message = "malformed \\N character escape";
2759 if (ucnhash_CAPI == NULL) {
2760 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002761 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002762 m = PyImport_ImportModule("unicodedata");
2763 if (m == NULL)
2764 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002765 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002766 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002767 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002768 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002769 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002770 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002771 if (ucnhash_CAPI == NULL)
2772 goto ucnhashError;
2773 }
2774 if (*s == '{') {
2775 const char *start = s+1;
2776 /* look for the closing brace */
2777 while (*s != '}' && s < end)
2778 s++;
2779 if (s > start && s < end && *s == '}') {
2780 /* found a name. look it up in the unicode database */
2781 message = "unknown Unicode character name";
2782 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002783 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002784 goto store;
2785 }
2786 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002787 endinpos = s-starts;
2788 outpos = p-PyUnicode_AS_UNICODE(v);
2789 if (unicode_decode_call_errorhandler(
2790 errors, &errorHandler,
2791 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002792 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002793 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002794 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002795 break;
2796
2797 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002798 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002799 message = "\\ at end of string";
2800 s--;
2801 endinpos = s-starts;
2802 outpos = p-PyUnicode_AS_UNICODE(v);
2803 if (unicode_decode_call_errorhandler(
2804 errors, &errorHandler,
2805 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002806 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002807 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002808 goto onError;
2809 }
2810 else {
2811 *p++ = '\\';
2812 *p++ = (unsigned char)s[-1];
2813 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002814 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002815 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002816 nextByte:
2817 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002818 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002819 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002820 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002821 Py_XDECREF(errorHandler);
2822 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002823 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002824
Fredrik Lundhccc74732001-02-18 22:13:49 +00002825ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002826 PyErr_SetString(
2827 PyExc_UnicodeError,
2828 "\\N escapes not supported (can't load unicodedata module)"
2829 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002830 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002831 Py_XDECREF(errorHandler);
2832 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002833 return NULL;
2834
Fredrik Lundhccc74732001-02-18 22:13:49 +00002835onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002836 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002837 Py_XDECREF(errorHandler);
2838 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002839 return NULL;
2840}
2841
/* Unicode-Escape encoding.

   PyUnicode_EncodeUnicodeEscape() below returns a Unicode-Escape encoded
   version of a Py_UNICODE buffer.  The result is not enclosed in quotes.

*/
2848
Thomas Wouters477c8d52006-05-27 19:21:47 +00002849Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2850 Py_ssize_t size,
2851 Py_UNICODE ch)
2852{
2853 /* like wcschr, but doesn't stop at NULL characters */
2854
2855 while (size-- > 0) {
2856 if (*s == ch)
2857 return s;
2858 s++;
2859 }
2860
2861 return NULL;
2862}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002863
Walter Dörwald79e913e2007-05-12 11:08:06 +00002864static const char *hexdigits = "0123456789abcdef";
2865
2866PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2867 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002868{
2869 PyObject *repr;
2870 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002871
Thomas Wouters89f507f2006-12-13 04:49:30 +00002872 /* XXX(nnorwitz): rather than over-allocating, it would be
2873 better to choose a different scheme. Perhaps scan the
2874 first N-chars of the string and allocate based on that size.
2875 */
2876 /* Initial allocation is based on the longest-possible unichr
2877 escape.
2878
2879 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2880 unichr, so in this case it's the longest unichr escape. In
2881 narrow (UTF-16) builds this is five chars per source unichr
2882 since there are two unichrs in the surrogate pair, so in narrow
2883 (UTF-16) builds it's not the longest unichr escape.
2884
2885 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2886 so in the narrow (UTF-16) build case it's the longest unichr
2887 escape.
2888 */
2889
Walter Dörwald79e913e2007-05-12 11:08:06 +00002890 repr = PyBytes_FromStringAndSize(NULL,
Thomas Wouters89f507f2006-12-13 04:49:30 +00002891#ifdef Py_UNICODE_WIDE
2892 + 10*size
2893#else
2894 + 6*size
2895#endif
2896 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002897 if (repr == NULL)
2898 return NULL;
2899
Walter Dörwald79e913e2007-05-12 11:08:06 +00002900 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002901
Guido van Rossumd57fd912000-03-10 22:53:23 +00002902 while (size-- > 0) {
2903 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002904
Walter Dörwald79e913e2007-05-12 11:08:06 +00002905 /* Escape backslashes */
2906 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002907 *p++ = '\\';
2908 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00002909 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002910 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002911
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002912#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002913 /* Map 21-bit characters to '\U00xxxxxx' */
2914 else if (ch >= 0x10000) {
2915 *p++ = '\\';
2916 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002917 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
2918 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
2919 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
2920 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
2921 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
2922 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
2923 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
2924 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002925 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002926 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002927#else
2928 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002929 else if (ch >= 0xD800 && ch < 0xDC00) {
2930 Py_UNICODE ch2;
2931 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002932
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002933 ch2 = *s++;
2934 size--;
2935 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2936 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2937 *p++ = '\\';
2938 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002939 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
2940 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
2941 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
2942 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
2943 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
2944 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
2945 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
2946 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002947 continue;
2948 }
2949 /* Fall through: isolated surrogates are copied as-is */
2950 s--;
2951 size++;
2952 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002953#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002954
Guido van Rossumd57fd912000-03-10 22:53:23 +00002955 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002956 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002957 *p++ = '\\';
2958 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002959 *p++ = hexdigits[(ch >> 12) & 0x000F];
2960 *p++ = hexdigits[(ch >> 8) & 0x000F];
2961 *p++ = hexdigits[(ch >> 4) & 0x000F];
2962 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002963 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002964
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002965 /* Map special whitespace to '\t', \n', '\r' */
2966 else if (ch == '\t') {
2967 *p++ = '\\';
2968 *p++ = 't';
2969 }
2970 else if (ch == '\n') {
2971 *p++ = '\\';
2972 *p++ = 'n';
2973 }
2974 else if (ch == '\r') {
2975 *p++ = '\\';
2976 *p++ = 'r';
2977 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002978
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002979 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002980 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002981 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002982 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002983 *p++ = hexdigits[(ch >> 4) & 0x000F];
2984 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002985 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002986
Guido van Rossumd57fd912000-03-10 22:53:23 +00002987 /* Copy everything else as-is */
2988 else
2989 *p++ = (char) ch;
2990 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002991
2992 *p = '\0';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002993 if (PyBytes_Resize(repr, p - PyBytes_AS_STRING(repr))) {
2994 Py_DECREF(repr);
2995 return NULL;
2996 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002997 return repr;
2998}
2999
Guido van Rossumd57fd912000-03-10 22:53:23 +00003000PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3001{
Walter Dörwald79e913e2007-05-12 11:08:06 +00003002 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003003 if (!PyUnicode_Check(unicode)) {
3004 PyErr_BadArgument();
3005 return NULL;
3006 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003007 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3008 PyUnicode_GET_SIZE(unicode));
3009
3010 if (!s)
3011 return NULL;
3012 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
3013 PyBytes_GET_SIZE(s));
3014 Py_DECREF(s);
3015 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003016}
3017
3018/* --- Raw Unicode Escape Codec ------------------------------------------- */
3019
3020PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003021 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003022 const char *errors)
3023{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003024 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003025 Py_ssize_t startinpos;
3026 Py_ssize_t endinpos;
3027 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003028 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003029 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003030 const char *end;
3031 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003032 PyObject *errorHandler = NULL;
3033 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003034
Guido van Rossumd57fd912000-03-10 22:53:23 +00003035 /* Escaped strings will always be longer than the resulting
3036 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003037 length after conversion to the true value. (But decoding error
3038 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003039 v = _PyUnicode_New(size);
3040 if (v == NULL)
3041 goto onError;
3042 if (size == 0)
3043 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003044 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003045 end = s + size;
3046 while (s < end) {
3047 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00003048 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003049 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003050 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003051
3052 /* Non-escape characters are interpreted as Unicode ordinals */
3053 if (*s != '\\') {
3054 *p++ = (unsigned char)*s++;
3055 continue;
3056 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003057 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003058
3059 /* \u-escapes are only interpreted iff the number of leading
3060 backslashes if odd */
3061 bs = s;
3062 for (;s < end;) {
3063 if (*s != '\\')
3064 break;
3065 *p++ = (unsigned char)*s++;
3066 }
3067 if (((s - bs) & 1) == 0 ||
3068 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003069 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003070 continue;
3071 }
3072 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003073 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003074 s++;
3075
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003076 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003077 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003078 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003079 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003080 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003081 endinpos = s-starts;
3082 if (unicode_decode_call_errorhandler(
3083 errors, &errorHandler,
3084 "rawunicodeescape", "truncated \\uXXXX",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003085 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003086 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003087 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003088 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003089 }
3090 x = (x<<4) & ~0xF;
3091 if (c >= '0' && c <= '9')
3092 x += c - '0';
3093 else if (c >= 'a' && c <= 'f')
3094 x += 10 + c - 'a';
3095 else
3096 x += 10 + c - 'A';
3097 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003098#ifndef Py_UNICODE_WIDE
3099 if (x > 0x10000) {
3100 if (unicode_decode_call_errorhandler(
3101 errors, &errorHandler,
3102 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003103 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003104 (PyObject **)&v, &outpos, &p))
3105 goto onError;
3106 }
3107#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003108 *p++ = x;
3109 nextByte:
3110 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003111 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003112 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003113 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003114 Py_XDECREF(errorHandler);
3115 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003116 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003117
Guido van Rossumd57fd912000-03-10 22:53:23 +00003118 onError:
3119 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003120 Py_XDECREF(errorHandler);
3121 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003122 return NULL;
3123}
3124
3125PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003126 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003127{
3128 PyObject *repr;
3129 char *p;
3130 char *q;
3131
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003132#ifdef Py_UNICODE_WIDE
Walter Dörwald711005d2007-05-12 12:03:26 +00003133 repr = PyBytes_FromStringAndSize(NULL, 10 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003134#else
Walter Dörwald711005d2007-05-12 12:03:26 +00003135 repr = PyBytes_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003136#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003137 if (repr == NULL)
3138 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003139 if (size == 0)
3140 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003141
Walter Dörwald711005d2007-05-12 12:03:26 +00003142 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003143 while (size-- > 0) {
3144 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003145#ifdef Py_UNICODE_WIDE
3146 /* Map 32-bit characters to '\Uxxxxxxxx' */
3147 if (ch >= 0x10000) {
3148 *p++ = '\\';
3149 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003150 *p++ = hexdigits[(ch >> 28) & 0xf];
3151 *p++ = hexdigits[(ch >> 24) & 0xf];
3152 *p++ = hexdigits[(ch >> 20) & 0xf];
3153 *p++ = hexdigits[(ch >> 16) & 0xf];
3154 *p++ = hexdigits[(ch >> 12) & 0xf];
3155 *p++ = hexdigits[(ch >> 8) & 0xf];
3156 *p++ = hexdigits[(ch >> 4) & 0xf];
3157 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003158 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003159 else
3160#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003161 /* Map 16-bit characters to '\uxxxx' */
3162 if (ch >= 256) {
3163 *p++ = '\\';
3164 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003165 *p++ = hexdigits[(ch >> 12) & 0xf];
3166 *p++ = hexdigits[(ch >> 8) & 0xf];
3167 *p++ = hexdigits[(ch >> 4) & 0xf];
3168 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003169 }
3170 /* Copy everything else as-is */
3171 else
3172 *p++ = (char) ch;
3173 }
3174 *p = '\0';
Walter Dörwald711005d2007-05-12 12:03:26 +00003175 if (PyBytes_Resize(repr, p - q)) {
3176 Py_DECREF(repr);
3177 return NULL;
3178 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003179 return repr;
3180}
3181
3182PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3183{
Walter Dörwald711005d2007-05-12 12:03:26 +00003184 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003185 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003186 PyErr_BadArgument();
3187 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003188 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003189 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3190 PyUnicode_GET_SIZE(unicode));
3191
3192 if (!s)
3193 return NULL;
3194 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
3195 PyBytes_GET_SIZE(s));
3196 Py_DECREF(s);
3197 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003198}
3199
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003200/* --- Unicode Internal Codec ------------------------------------------- */
3201
3202PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003203 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003204 const char *errors)
3205{
3206 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003207 Py_ssize_t startinpos;
3208 Py_ssize_t endinpos;
3209 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003210 PyUnicodeObject *v;
3211 Py_UNICODE *p;
3212 const char *end;
3213 const char *reason;
3214 PyObject *errorHandler = NULL;
3215 PyObject *exc = NULL;
3216
Neal Norwitzd43069c2006-01-08 01:12:10 +00003217#ifdef Py_UNICODE_WIDE
3218 Py_UNICODE unimax = PyUnicode_GetMax();
3219#endif
3220
Thomas Wouters89f507f2006-12-13 04:49:30 +00003221 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003222 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3223 if (v == NULL)
3224 goto onError;
3225 if (PyUnicode_GetSize((PyObject *)v) == 0)
3226 return (PyObject *)v;
3227 p = PyUnicode_AS_UNICODE(v);
3228 end = s + size;
3229
3230 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00003231 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003232 /* We have to sanity check the raw data, otherwise doom looms for
3233 some malformed UCS-4 data. */
3234 if (
3235 #ifdef Py_UNICODE_WIDE
3236 *p > unimax || *p < 0 ||
3237 #endif
3238 end-s < Py_UNICODE_SIZE
3239 )
3240 {
3241 startinpos = s - starts;
3242 if (end-s < Py_UNICODE_SIZE) {
3243 endinpos = end-starts;
3244 reason = "truncated input";
3245 }
3246 else {
3247 endinpos = s - starts + Py_UNICODE_SIZE;
3248 reason = "illegal code point (> 0x10FFFF)";
3249 }
3250 outpos = p - PyUnicode_AS_UNICODE(v);
3251 if (unicode_decode_call_errorhandler(
3252 errors, &errorHandler,
3253 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003254 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003255 (PyObject **)&v, &outpos, &p)) {
3256 goto onError;
3257 }
3258 }
3259 else {
3260 p++;
3261 s += Py_UNICODE_SIZE;
3262 }
3263 }
3264
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003265 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003266 goto onError;
3267 Py_XDECREF(errorHandler);
3268 Py_XDECREF(exc);
3269 return (PyObject *)v;
3270
3271 onError:
3272 Py_XDECREF(v);
3273 Py_XDECREF(errorHandler);
3274 Py_XDECREF(exc);
3275 return NULL;
3276}
3277
Guido van Rossumd57fd912000-03-10 22:53:23 +00003278/* --- Latin-1 Codec ------------------------------------------------------ */
3279
3280PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003281 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003282 const char *errors)
3283{
3284 PyUnicodeObject *v;
3285 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003286
Guido van Rossumd57fd912000-03-10 22:53:23 +00003287 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003288 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003289 Py_UNICODE r = *(unsigned char*)s;
3290 return PyUnicode_FromUnicode(&r, 1);
3291 }
3292
Guido van Rossumd57fd912000-03-10 22:53:23 +00003293 v = _PyUnicode_New(size);
3294 if (v == NULL)
3295 goto onError;
3296 if (size == 0)
3297 return (PyObject *)v;
3298 p = PyUnicode_AS_UNICODE(v);
3299 while (size-- > 0)
3300 *p++ = (unsigned char)*s++;
3301 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003302
Guido van Rossumd57fd912000-03-10 22:53:23 +00003303 onError:
3304 Py_XDECREF(v);
3305 return NULL;
3306}
3307
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003308/* create or adjust a UnicodeEncodeError */
3309static void make_encode_exception(PyObject **exceptionObject,
3310 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003311 const Py_UNICODE *unicode, Py_ssize_t size,
3312 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003313 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003314{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003315 if (*exceptionObject == NULL) {
3316 *exceptionObject = PyUnicodeEncodeError_Create(
3317 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003318 }
3319 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003320 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3321 goto onError;
3322 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3323 goto onError;
3324 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3325 goto onError;
3326 return;
3327 onError:
3328 Py_DECREF(*exceptionObject);
3329 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003330 }
3331}
3332
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003333/* raises a UnicodeEncodeError */
3334static void raise_encode_exception(PyObject **exceptionObject,
3335 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003336 const Py_UNICODE *unicode, Py_ssize_t size,
3337 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003338 const char *reason)
3339{
3340 make_encode_exception(exceptionObject,
3341 encoding, unicode, size, startpos, endpos, reason);
3342 if (*exceptionObject != NULL)
3343 PyCodec_StrictErrors(*exceptionObject);
3344}
3345
3346/* error handling callback helper:
3347 build arguments, call the callback and check the arguments,
3348 put the result into newpos and return the replacement string, which
3349 has to be freed by the caller */
3350static PyObject *unicode_encode_call_errorhandler(const char *errors,
3351 PyObject **errorHandler,
3352 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003353 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3354 Py_ssize_t startpos, Py_ssize_t endpos,
3355 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003356{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003357 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003358
3359 PyObject *restuple;
3360 PyObject *resunicode;
3361
3362 if (*errorHandler == NULL) {
3363 *errorHandler = PyCodec_LookupError(errors);
3364 if (*errorHandler == NULL)
3365 return NULL;
3366 }
3367
3368 make_encode_exception(exceptionObject,
3369 encoding, unicode, size, startpos, endpos, reason);
3370 if (*exceptionObject == NULL)
3371 return NULL;
3372
3373 restuple = PyObject_CallFunctionObjArgs(
3374 *errorHandler, *exceptionObject, NULL);
3375 if (restuple == NULL)
3376 return NULL;
3377 if (!PyTuple_Check(restuple)) {
3378 PyErr_Format(PyExc_TypeError, &argparse[4]);
3379 Py_DECREF(restuple);
3380 return NULL;
3381 }
3382 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3383 &resunicode, newpos)) {
3384 Py_DECREF(restuple);
3385 return NULL;
3386 }
3387 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003388 *newpos = size+*newpos;
3389 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003390 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003391 Py_DECREF(restuple);
3392 return NULL;
3393 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003394 Py_INCREF(resunicode);
3395 Py_DECREF(restuple);
3396 return resunicode;
3397}
3398
3399static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003400 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003401 const char *errors,
3402 int limit)
3403{
3404 /* output object */
3405 PyObject *res;
3406 /* pointers to the beginning and end+1 of input */
3407 const Py_UNICODE *startp = p;
3408 const Py_UNICODE *endp = p + size;
3409 /* pointer to the beginning of the unencodable characters */
3410 /* const Py_UNICODE *badp = NULL; */
3411 /* pointer into the output */
3412 char *str;
3413 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003414 Py_ssize_t respos = 0;
3415 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003416 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3417 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003418 PyObject *errorHandler = NULL;
3419 PyObject *exc = NULL;
3420 /* the following variable is used for caching string comparisons
3421 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3422 int known_errorHandler = -1;
3423
3424 /* allocate enough for a simple encoding without
3425 replacements, if we need more, we'll resize */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003426 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003427 if (res == NULL)
3428 goto onError;
3429 if (size == 0)
3430 return res;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003431 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003432 ressize = size;
3433
3434 while (p<endp) {
3435 Py_UNICODE c = *p;
3436
3437 /* can we encode this? */
3438 if (c<limit) {
3439 /* no overflow check, because we know that the space is enough */
3440 *str++ = (char)c;
3441 ++p;
3442 }
3443 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003444 Py_ssize_t unicodepos = p-startp;
3445 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003446 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003447 Py_ssize_t repsize;
3448 Py_ssize_t newpos;
3449 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003450 Py_UNICODE *uni2;
3451 /* startpos for collecting unencodable chars */
3452 const Py_UNICODE *collstart = p;
3453 const Py_UNICODE *collend = p;
3454 /* find all unecodable characters */
3455 while ((collend < endp) && ((*collend)>=limit))
3456 ++collend;
3457 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3458 if (known_errorHandler==-1) {
3459 if ((errors==NULL) || (!strcmp(errors, "strict")))
3460 known_errorHandler = 1;
3461 else if (!strcmp(errors, "replace"))
3462 known_errorHandler = 2;
3463 else if (!strcmp(errors, "ignore"))
3464 known_errorHandler = 3;
3465 else if (!strcmp(errors, "xmlcharrefreplace"))
3466 known_errorHandler = 4;
3467 else
3468 known_errorHandler = 0;
3469 }
3470 switch (known_errorHandler) {
3471 case 1: /* strict */
3472 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3473 goto onError;
3474 case 2: /* replace */
3475 while (collstart++<collend)
3476 *str++ = '?'; /* fall through */
3477 case 3: /* ignore */
3478 p = collend;
3479 break;
3480 case 4: /* xmlcharrefreplace */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003481 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003482 /* determine replacement size (temporarily (mis)uses p) */
3483 for (p = collstart, repsize = 0; p < collend; ++p) {
3484 if (*p<10)
3485 repsize += 2+1+1;
3486 else if (*p<100)
3487 repsize += 2+2+1;
3488 else if (*p<1000)
3489 repsize += 2+3+1;
3490 else if (*p<10000)
3491 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003492#ifndef Py_UNICODE_WIDE
3493 else
3494 repsize += 2+5+1;
3495#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003496 else if (*p<100000)
3497 repsize += 2+5+1;
3498 else if (*p<1000000)
3499 repsize += 2+6+1;
3500 else
3501 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003502#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003503 }
3504 requiredsize = respos+repsize+(endp-collend);
3505 if (requiredsize > ressize) {
3506 if (requiredsize<2*ressize)
3507 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003508 if (PyBytes_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003509 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003510 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003511 ressize = requiredsize;
3512 }
3513 /* generate replacement (temporarily (mis)uses p) */
3514 for (p = collstart; p < collend; ++p) {
3515 str += sprintf(str, "&#%d;", (int)*p);
3516 }
3517 p = collend;
3518 break;
3519 default:
3520 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3521 encoding, reason, startp, size, &exc,
3522 collstart-startp, collend-startp, &newpos);
3523 if (repunicode == NULL)
3524 goto onError;
3525 /* need more space? (at least enough for what we
3526 have+the replacement+the rest of the string, so
3527 we won't have to check space for encodable characters) */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003528 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003529 repsize = PyUnicode_GET_SIZE(repunicode);
3530 requiredsize = respos+repsize+(endp-collend);
3531 if (requiredsize > ressize) {
3532 if (requiredsize<2*ressize)
3533 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003534 if (PyBytes_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003535 Py_DECREF(repunicode);
3536 goto onError;
3537 }
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003538 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003539 ressize = requiredsize;
3540 }
3541 /* check if there is anything unencodable in the replacement
3542 and copy it to the output */
3543 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3544 c = *uni2;
3545 if (c >= limit) {
3546 raise_encode_exception(&exc, encoding, startp, size,
3547 unicodepos, unicodepos+1, reason);
3548 Py_DECREF(repunicode);
3549 goto onError;
3550 }
3551 *str = (char)c;
3552 }
3553 p = startp + newpos;
3554 Py_DECREF(repunicode);
3555 }
3556 }
3557 }
3558 /* Resize if we allocated to much */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003559 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003560 if (respos<ressize)
3561 /* If this falls res will be NULL */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003562 PyBytes_Resize(res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003563 Py_XDECREF(errorHandler);
3564 Py_XDECREF(exc);
3565 return res;
3566
3567 onError:
3568 Py_XDECREF(res);
3569 Py_XDECREF(errorHandler);
3570 Py_XDECREF(exc);
3571 return NULL;
3572}
3573
Guido van Rossumd57fd912000-03-10 22:53:23 +00003574PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003575 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003576 const char *errors)
3577{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003578 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003579}
3580
3581PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3582{
3583 if (!PyUnicode_Check(unicode)) {
3584 PyErr_BadArgument();
3585 return NULL;
3586 }
3587 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3588 PyUnicode_GET_SIZE(unicode),
3589 NULL);
3590}
3591
3592/* --- 7-bit ASCII Codec -------------------------------------------------- */
3593
Guido van Rossumd57fd912000-03-10 22:53:23 +00003594PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003595 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003596 const char *errors)
3597{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003598 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003599 PyUnicodeObject *v;
3600 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003601 Py_ssize_t startinpos;
3602 Py_ssize_t endinpos;
3603 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003604 const char *e;
3605 PyObject *errorHandler = NULL;
3606 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003607
Guido van Rossumd57fd912000-03-10 22:53:23 +00003608 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003609 if (size == 1 && *(unsigned char*)s < 128) {
3610 Py_UNICODE r = *(unsigned char*)s;
3611 return PyUnicode_FromUnicode(&r, 1);
3612 }
Tim Petersced69f82003-09-16 20:30:58 +00003613
Guido van Rossumd57fd912000-03-10 22:53:23 +00003614 v = _PyUnicode_New(size);
3615 if (v == NULL)
3616 goto onError;
3617 if (size == 0)
3618 return (PyObject *)v;
3619 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003620 e = s + size;
3621 while (s < e) {
3622 register unsigned char c = (unsigned char)*s;
3623 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003624 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003625 ++s;
3626 }
3627 else {
3628 startinpos = s-starts;
3629 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003630 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003631 if (unicode_decode_call_errorhandler(
3632 errors, &errorHandler,
3633 "ascii", "ordinal not in range(128)",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003634 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003635 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003636 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003637 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003638 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00003639 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003640 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003641 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003642 Py_XDECREF(errorHandler);
3643 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003644 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003645
Guido van Rossumd57fd912000-03-10 22:53:23 +00003646 onError:
3647 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003648 Py_XDECREF(errorHandler);
3649 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003650 return NULL;
3651}
3652
Guido van Rossumd57fd912000-03-10 22:53:23 +00003653PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003654 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003655 const char *errors)
3656{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003657 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003658}
3659
3660PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3661{
3662 if (!PyUnicode_Check(unicode)) {
3663 PyErr_BadArgument();
3664 return NULL;
3665 }
3666 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3667 PyUnicode_GET_SIZE(unicode),
3668 NULL);
3669}
3670
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003671#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003672
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003673/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003674
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003675#if SIZEOF_INT < SIZEOF_SSIZE_T
3676#define NEED_RETRY
3677#endif
3678
3679/* XXX This code is limited to "true" double-byte encodings, as
3680 a) it assumes an incomplete character consists of a single byte, and
3681 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3682 encodings, see IsDBCSLeadByteEx documentation. */
3683
/* Return 1 if the byte at s[offset] is treated as a DBCS lead byte,
   0 otherwise.

   A byte that IsDBCSLeadByte() accepts only counts when CharPrev() shows
   it is not the second half of a preceding character: either there is no
   previous character, the previous character does not start with a lead
   byte, or the previous character is a complete two-byte character. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
3694
3695/*
3696 * Decode MBCS string into unicode object. If 'final' is set, converts
3697 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3698 */
3699static int decode_mbcs(PyUnicodeObject **v,
3700 const char *s, /* MBCS string */
3701 int size, /* sizeof MBCS string */
3702 int final)
3703{
3704 Py_UNICODE *p;
3705 Py_ssize_t n = 0;
3706 int usize = 0;
3707
3708 assert(size >= 0);
3709
3710 /* Skip trailing lead-byte unless 'final' is set */
3711 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3712 --size;
3713
3714 /* First get the size of the result */
3715 if (size > 0) {
3716 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3717 if (usize == 0) {
3718 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3719 return -1;
3720 }
3721 }
3722
3723 if (*v == NULL) {
3724 /* Create unicode object */
3725 *v = _PyUnicode_New(usize);
3726 if (*v == NULL)
3727 return -1;
3728 }
3729 else {
3730 /* Extend unicode object */
3731 n = PyUnicode_GET_SIZE(*v);
3732 if (_PyUnicode_Resize(v, n + usize) < 0)
3733 return -1;
3734 }
3735
3736 /* Do the conversion */
3737 if (size > 0) {
3738 p = PyUnicode_AS_UNICODE(*v) + n;
3739 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3740 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3741 return -1;
3742 }
3743 }
3744
3745 return size;
3746}
3747
3748PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3749 Py_ssize_t size,
3750 const char *errors,
3751 Py_ssize_t *consumed)
3752{
3753 PyUnicodeObject *v = NULL;
3754 int done;
3755
3756 if (consumed)
3757 *consumed = 0;
3758
3759#ifdef NEED_RETRY
3760 retry:
3761 if (size > INT_MAX)
3762 done = decode_mbcs(&v, s, INT_MAX, 0);
3763 else
3764#endif
3765 done = decode_mbcs(&v, s, (int)size, !consumed);
3766
3767 if (done < 0) {
3768 Py_XDECREF(v);
3769 return NULL;
3770 }
3771
3772 if (consumed)
3773 *consumed += done;
3774
3775#ifdef NEED_RETRY
3776 if (size > INT_MAX) {
3777 s += done;
3778 size -= done;
3779 goto retry;
3780 }
3781#endif
3782
3783 return (PyObject *)v;
3784}
3785
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003786PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003787 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003788 const char *errors)
3789{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003790 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3791}
3792
3793/*
3794 * Convert unicode into string object (MBCS).
3795 * Returns 0 if succeed, -1 otherwise.
3796 */
3797static int encode_mbcs(PyObject **repr,
3798 const Py_UNICODE *p, /* unicode */
3799 int size) /* size of unicode */
3800{
3801 int mbcssize = 0;
3802 Py_ssize_t n = 0;
3803
3804 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003805
3806 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003807 if (size > 0) {
3808 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3809 if (mbcssize == 0) {
3810 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3811 return -1;
3812 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003813 }
3814
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003815 if (*repr == NULL) {
3816 /* Create string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003817 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003818 if (*repr == NULL)
3819 return -1;
3820 }
3821 else {
3822 /* Extend string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003823 n = PyBytes_Size(*repr);
3824 if (PyBytes_Resize(*repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003825 return -1;
3826 }
3827
3828 /* Do the conversion */
3829 if (size > 0) {
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003830 char *s = PyBytes_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003831 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3832 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3833 return -1;
3834 }
3835 }
3836
3837 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003838}
3839
3840PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003841 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003842 const char *errors)
3843{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003844 PyObject *repr = NULL;
3845 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003846
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003847#ifdef NEED_RETRY
3848 retry:
3849 if (size > INT_MAX)
3850 ret = encode_mbcs(&repr, p, INT_MAX);
3851 else
3852#endif
3853 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003854
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003855 if (ret < 0) {
3856 Py_XDECREF(repr);
3857 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003858 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003859
3860#ifdef NEED_RETRY
3861 if (size > INT_MAX) {
3862 p += INT_MAX;
3863 size -= INT_MAX;
3864 goto retry;
3865 }
3866#endif
3867
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003868 return repr;
3869}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003870
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003871PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3872{
3873 if (!PyUnicode_Check(unicode)) {
3874 PyErr_BadArgument();
3875 return NULL;
3876 }
3877 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3878 PyUnicode_GET_SIZE(unicode),
3879 NULL);
3880}
3881
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003882#undef NEED_RETRY
3883
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003884#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003885
Guido van Rossumd57fd912000-03-10 22:53:23 +00003886/* --- Character Mapping Codec -------------------------------------------- */
3887
Guido van Rossumd57fd912000-03-10 22:53:23 +00003888PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003889 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003890 PyObject *mapping,
3891 const char *errors)
3892{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003893 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003894 Py_ssize_t startinpos;
3895 Py_ssize_t endinpos;
3896 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003897 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003898 PyUnicodeObject *v;
3899 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003900 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003901 PyObject *errorHandler = NULL;
3902 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003903 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003904 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003905
Guido van Rossumd57fd912000-03-10 22:53:23 +00003906 /* Default to Latin-1 */
3907 if (mapping == NULL)
3908 return PyUnicode_DecodeLatin1(s, size, errors);
3909
3910 v = _PyUnicode_New(size);
3911 if (v == NULL)
3912 goto onError;
3913 if (size == 0)
3914 return (PyObject *)v;
3915 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003916 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003917 if (PyUnicode_CheckExact(mapping)) {
3918 mapstring = PyUnicode_AS_UNICODE(mapping);
3919 maplen = PyUnicode_GET_SIZE(mapping);
3920 while (s < e) {
3921 unsigned char ch = *s;
3922 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003923
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003924 if (ch < maplen)
3925 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003926
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003927 if (x == 0xfffe) {
3928 /* undefined mapping */
3929 outpos = p-PyUnicode_AS_UNICODE(v);
3930 startinpos = s-starts;
3931 endinpos = startinpos+1;
3932 if (unicode_decode_call_errorhandler(
3933 errors, &errorHandler,
3934 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003935 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003936 (PyObject **)&v, &outpos, &p)) {
3937 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003938 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003939 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003940 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003941 *p++ = x;
3942 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003943 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003944 }
3945 else {
3946 while (s < e) {
3947 unsigned char ch = *s;
3948 PyObject *w, *x;
3949
3950 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3951 w = PyInt_FromLong((long)ch);
3952 if (w == NULL)
3953 goto onError;
3954 x = PyObject_GetItem(mapping, w);
3955 Py_DECREF(w);
3956 if (x == NULL) {
3957 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3958 /* No mapping found means: mapping is undefined. */
3959 PyErr_Clear();
3960 x = Py_None;
3961 Py_INCREF(x);
3962 } else
3963 goto onError;
3964 }
3965
3966 /* Apply mapping */
3967 if (PyInt_Check(x)) {
3968 long value = PyInt_AS_LONG(x);
3969 if (value < 0 || value > 65535) {
3970 PyErr_SetString(PyExc_TypeError,
3971 "character mapping must be in range(65536)");
3972 Py_DECREF(x);
3973 goto onError;
3974 }
3975 *p++ = (Py_UNICODE)value;
3976 }
3977 else if (x == Py_None) {
3978 /* undefined mapping */
3979 outpos = p-PyUnicode_AS_UNICODE(v);
3980 startinpos = s-starts;
3981 endinpos = startinpos+1;
3982 if (unicode_decode_call_errorhandler(
3983 errors, &errorHandler,
3984 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003985 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003986 (PyObject **)&v, &outpos, &p)) {
3987 Py_DECREF(x);
3988 goto onError;
3989 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003990 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003991 continue;
3992 }
3993 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003994 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003995
3996 if (targetsize == 1)
3997 /* 1-1 mapping */
3998 *p++ = *PyUnicode_AS_UNICODE(x);
3999
4000 else if (targetsize > 1) {
4001 /* 1-n mapping */
4002 if (targetsize > extrachars) {
4003 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004004 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4005 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004006 (targetsize << 2);
4007 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00004008 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004009 if (_PyUnicode_Resize(&v,
4010 PyUnicode_GET_SIZE(v) + needed) < 0) {
4011 Py_DECREF(x);
4012 goto onError;
4013 }
4014 p = PyUnicode_AS_UNICODE(v) + oldpos;
4015 }
4016 Py_UNICODE_COPY(p,
4017 PyUnicode_AS_UNICODE(x),
4018 targetsize);
4019 p += targetsize;
4020 extrachars -= targetsize;
4021 }
4022 /* 1-0 mapping: skip the character */
4023 }
4024 else {
4025 /* wrong return value */
4026 PyErr_SetString(PyExc_TypeError,
4027 "character mapping must return integer, None or unicode");
4028 Py_DECREF(x);
4029 goto onError;
4030 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004031 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004032 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004033 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004034 }
4035 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004036 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004037 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004038 Py_XDECREF(errorHandler);
4039 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004040 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004041
Guido van Rossumd57fd912000-03-10 22:53:23 +00004042 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004043 Py_XDECREF(errorHandler);
4044 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004045 Py_XDECREF(v);
4046 return NULL;
4047}
4048
/* Charmap encoding: the lookup table */
4050
4051struct encoding_map{
4052 PyObject_HEAD
4053 unsigned char level1[32];
4054 int count2, count3;
4055 unsigned char level23[1];
4056};
4057
4058static PyObject*
4059encoding_map_size(PyObject *obj, PyObject* args)
4060{
4061 struct encoding_map *map = (struct encoding_map*)obj;
4062 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4063 128*map->count3);
4064}
4065
4066static PyMethodDef encoding_map_methods[] = {
4067 {"size", encoding_map_size, METH_NOARGS,
4068 PyDoc_STR("Return the size (in bytes) of this object") },
4069 { 0 }
4070};
4071
4072static void
4073encoding_map_dealloc(PyObject* o)
4074{
4075 PyObject_FREE(o);
4076}
4077
4078static PyTypeObject EncodingMapType = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004079 PyVarObject_HEAD_INIT(NULL, 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004080 "EncodingMap", /*tp_name*/
4081 sizeof(struct encoding_map), /*tp_basicsize*/
4082 0, /*tp_itemsize*/
4083 /* methods */
4084 encoding_map_dealloc, /*tp_dealloc*/
4085 0, /*tp_print*/
4086 0, /*tp_getattr*/
4087 0, /*tp_setattr*/
4088 0, /*tp_compare*/
4089 0, /*tp_repr*/
4090 0, /*tp_as_number*/
4091 0, /*tp_as_sequence*/
4092 0, /*tp_as_mapping*/
4093 0, /*tp_hash*/
4094 0, /*tp_call*/
4095 0, /*tp_str*/
4096 0, /*tp_getattro*/
4097 0, /*tp_setattro*/
4098 0, /*tp_as_buffer*/
4099 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4100 0, /*tp_doc*/
4101 0, /*tp_traverse*/
4102 0, /*tp_clear*/
4103 0, /*tp_richcompare*/
4104 0, /*tp_weaklistoffset*/
4105 0, /*tp_iter*/
4106 0, /*tp_iternext*/
4107 encoding_map_methods, /*tp_methods*/
4108 0, /*tp_members*/
4109 0, /*tp_getset*/
4110 0, /*tp_base*/
4111 0, /*tp_dict*/
4112 0, /*tp_descr_get*/
4113 0, /*tp_descr_set*/
4114 0, /*tp_dictoffset*/
4115 0, /*tp_init*/
4116 0, /*tp_alloc*/
4117 0, /*tp_new*/
4118 0, /*tp_free*/
4119 0, /*tp_is_gc*/
4120};
4121
4122PyObject*
4123PyUnicode_BuildEncodingMap(PyObject* string)
4124{
4125 Py_UNICODE *decode;
4126 PyObject *result;
4127 struct encoding_map *mresult;
4128 int i;
4129 int need_dict = 0;
4130 unsigned char level1[32];
4131 unsigned char level2[512];
4132 unsigned char *mlevel1, *mlevel2, *mlevel3;
4133 int count2 = 0, count3 = 0;
4134
4135 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4136 PyErr_BadArgument();
4137 return NULL;
4138 }
4139 decode = PyUnicode_AS_UNICODE(string);
4140 memset(level1, 0xFF, sizeof level1);
4141 memset(level2, 0xFF, sizeof level2);
4142
4143 /* If there isn't a one-to-one mapping of NULL to \0,
4144 or if there are non-BMP characters, we need to use
4145 a mapping dictionary. */
4146 if (decode[0] != 0)
4147 need_dict = 1;
4148 for (i = 1; i < 256; i++) {
4149 int l1, l2;
4150 if (decode[i] == 0
4151 #ifdef Py_UNICODE_WIDE
4152 || decode[i] > 0xFFFF
4153 #endif
4154 ) {
4155 need_dict = 1;
4156 break;
4157 }
4158 if (decode[i] == 0xFFFE)
4159 /* unmapped character */
4160 continue;
4161 l1 = decode[i] >> 11;
4162 l2 = decode[i] >> 7;
4163 if (level1[l1] == 0xFF)
4164 level1[l1] = count2++;
4165 if (level2[l2] == 0xFF)
4166 level2[l2] = count3++;
4167 }
4168
4169 if (count2 >= 0xFF || count3 >= 0xFF)
4170 need_dict = 1;
4171
4172 if (need_dict) {
4173 PyObject *result = PyDict_New();
4174 PyObject *key, *value;
4175 if (!result)
4176 return NULL;
4177 for (i = 0; i < 256; i++) {
4178 key = value = NULL;
4179 key = PyInt_FromLong(decode[i]);
4180 value = PyInt_FromLong(i);
4181 if (!key || !value)
4182 goto failed1;
4183 if (PyDict_SetItem(result, key, value) == -1)
4184 goto failed1;
4185 Py_DECREF(key);
4186 Py_DECREF(value);
4187 }
4188 return result;
4189 failed1:
4190 Py_XDECREF(key);
4191 Py_XDECREF(value);
4192 Py_DECREF(result);
4193 return NULL;
4194 }
4195
4196 /* Create a three-level trie */
4197 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4198 16*count2 + 128*count3 - 1);
4199 if (!result)
4200 return PyErr_NoMemory();
4201 PyObject_Init(result, &EncodingMapType);
4202 mresult = (struct encoding_map*)result;
4203 mresult->count2 = count2;
4204 mresult->count3 = count3;
4205 mlevel1 = mresult->level1;
4206 mlevel2 = mresult->level23;
4207 mlevel3 = mresult->level23 + 16*count2;
4208 memcpy(mlevel1, level1, 32);
4209 memset(mlevel2, 0xFF, 16*count2);
4210 memset(mlevel3, 0, 128*count3);
4211 count3 = 0;
4212 for (i = 1; i < 256; i++) {
4213 int o1, o2, o3, i2, i3;
4214 if (decode[i] == 0xFFFE)
4215 /* unmapped character */
4216 continue;
4217 o1 = decode[i]>>11;
4218 o2 = (decode[i]>>7) & 0xF;
4219 i2 = 16*mlevel1[o1] + o2;
4220 if (mlevel2[i2] == 0xFF)
4221 mlevel2[i2] = count3++;
4222 o3 = decode[i] & 0x7F;
4223 i3 = 128*mlevel2[i2] + o3;
4224 mlevel3[i3] = i;
4225 }
4226 return result;
4227}
4228
4229static int
4230encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4231{
4232 struct encoding_map *map = (struct encoding_map*)mapping;
4233 int l1 = c>>11;
4234 int l2 = (c>>7) & 0xF;
4235 int l3 = c & 0x7F;
4236 int i;
4237
4238#ifdef Py_UNICODE_WIDE
4239 if (c > 0xFFFF) {
4240 return -1;
4241 }
4242#endif
4243 if (c == 0)
4244 return 0;
4245 /* level 1*/
4246 i = map->level1[l1];
4247 if (i == 0xFF) {
4248 return -1;
4249 }
4250 /* level 2*/
4251 i = map->level23[16*i+l2];
4252 if (i == 0xFF) {
4253 return -1;
4254 }
4255 /* level 3 */
4256 i = map->level23[16*map->count2 + 128*i + l3];
4257 if (i == 0) {
4258 return -1;
4259 }
4260 return i;
4261}
4262
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004263/* Lookup the character ch in the mapping. If the character
4264 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004265 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004266static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004267{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004268 PyObject *w = PyInt_FromLong((long)c);
4269 PyObject *x;
4270
4271 if (w == NULL)
4272 return NULL;
4273 x = PyObject_GetItem(mapping, w);
4274 Py_DECREF(w);
4275 if (x == NULL) {
4276 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4277 /* No mapping found means: mapping is undefined. */
4278 PyErr_Clear();
4279 x = Py_None;
4280 Py_INCREF(x);
4281 return x;
4282 } else
4283 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004284 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004285 else if (x == Py_None)
4286 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004287 else if (PyInt_Check(x)) {
4288 long value = PyInt_AS_LONG(x);
4289 if (value < 0 || value > 255) {
4290 PyErr_SetString(PyExc_TypeError,
4291 "character mapping must be in range(256)");
4292 Py_DECREF(x);
4293 return NULL;
4294 }
4295 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004296 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004297 else if (PyString_Check(x))
4298 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004299 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004300 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00004301 PyErr_Format(PyExc_TypeError,
4302 "character mapping must return integer, None or str8, not %.400s",
4303 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004304 Py_DECREF(x);
4305 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004306 }
4307}
4308
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004309static int
Walter Dörwald827b0552007-05-12 13:23:53 +00004310charmapencode_resize(PyObject *outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004311{
Walter Dörwald827b0552007-05-12 13:23:53 +00004312 Py_ssize_t outsize = PyBytes_GET_SIZE( outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004313 /* exponentially overallocate to minimize reallocations */
4314 if (requiredsize < 2*outsize)
4315 requiredsize = 2*outsize;
Walter Dörwald827b0552007-05-12 13:23:53 +00004316 if (PyBytes_Resize(outobj, requiredsize)) {
4317 Py_DECREF(outobj);
4318 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004319 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004320 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004321}
4322
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the replacement was written to the output */
    enc_FAILED,         /* the character has no mapping */
    enc_EXCEPTION       /* an error occurred; an exception is set */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004326/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00004327 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004328 space is available. Return a new reference to the object that
4329 was put in the output buffer, or Py_None, if the mapping was undefined
4330 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004331 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004332static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004333charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Walter Dörwald827b0552007-05-12 13:23:53 +00004334 PyObject *outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004335{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004336 PyObject *rep;
4337 char *outstart;
Walter Dörwald827b0552007-05-12 13:23:53 +00004338 Py_ssize_t outsize = PyBytes_GET_SIZE(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004339
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004340 if (Py_Type(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004341 int res = encoding_map_lookup(c, mapping);
4342 Py_ssize_t requiredsize = *outpos+1;
4343 if (res == -1)
4344 return enc_FAILED;
4345 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004346 if (charmapencode_resize(outobj, outpos, requiredsize))
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004347 return enc_EXCEPTION;
Walter Dörwald827b0552007-05-12 13:23:53 +00004348 outstart = PyBytes_AS_STRING(outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004349 outstart[(*outpos)++] = (char)res;
4350 return enc_SUCCESS;
4351 }
4352
4353 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004354 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004355 return enc_EXCEPTION;
4356 else if (rep==Py_None) {
4357 Py_DECREF(rep);
4358 return enc_FAILED;
4359 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004360 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004361 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004362 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004363 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004364 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004365 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004366 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004367 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004368 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4369 }
4370 else {
4371 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004372 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4373 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004374 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004375 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004376 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004377 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004378 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004379 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004380 memcpy(outstart + *outpos, repchars, repsize);
4381 *outpos += repsize;
4382 }
4383 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004384 Py_DECREF(rep);
4385 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004386}
4387
4388/* handle an error in PyUnicode_EncodeCharmap
4389 Return 0 on success, -1 on error */
4390static
4391int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004392 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004393 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004394 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004395 PyObject *res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004396{
4397 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004398 Py_ssize_t repsize;
4399 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004400 Py_UNICODE *uni2;
4401 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004402 Py_ssize_t collstartpos = *inpos;
4403 Py_ssize_t collendpos = *inpos+1;
4404 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004405 char *encoding = "charmap";
4406 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004407 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004408
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004409 /* find all unencodable characters */
4410 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004411 PyObject *rep;
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004412 if (Py_Type(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004413 int res = encoding_map_lookup(p[collendpos], mapping);
4414 if (res != -1)
4415 break;
4416 ++collendpos;
4417 continue;
4418 }
4419
4420 rep = charmapencode_lookup(p[collendpos], mapping);
4421 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004422 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004423 else if (rep!=Py_None) {
4424 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004425 break;
4426 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004427 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004428 ++collendpos;
4429 }
4430 /* cache callback name lookup
4431 * (if not done yet, i.e. it's the first error) */
4432 if (*known_errorHandler==-1) {
4433 if ((errors==NULL) || (!strcmp(errors, "strict")))
4434 *known_errorHandler = 1;
4435 else if (!strcmp(errors, "replace"))
4436 *known_errorHandler = 2;
4437 else if (!strcmp(errors, "ignore"))
4438 *known_errorHandler = 3;
4439 else if (!strcmp(errors, "xmlcharrefreplace"))
4440 *known_errorHandler = 4;
4441 else
4442 *known_errorHandler = 0;
4443 }
4444 switch (*known_errorHandler) {
4445 case 1: /* strict */
4446 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4447 return -1;
4448 case 2: /* replace */
4449 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4450 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004451 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004452 return -1;
4453 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004454 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004455 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4456 return -1;
4457 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004458 }
4459 /* fall through */
4460 case 3: /* ignore */
4461 *inpos = collendpos;
4462 break;
4463 case 4: /* xmlcharrefreplace */
4464 /* generate replacement (temporarily (mis)uses p) */
4465 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4466 char buffer[2+29+1+1];
4467 char *cp;
4468 sprintf(buffer, "&#%d;", (int)p[collpos]);
4469 for (cp = buffer; *cp; ++cp) {
4470 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004471 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004472 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004473 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004474 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4475 return -1;
4476 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004477 }
4478 }
4479 *inpos = collendpos;
4480 break;
4481 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004482 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004483 encoding, reason, p, size, exceptionObject,
4484 collstartpos, collendpos, &newpos);
4485 if (repunicode == NULL)
4486 return -1;
4487 /* generate replacement */
4488 repsize = PyUnicode_GET_SIZE(repunicode);
4489 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4490 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004491 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004492 return -1;
4493 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004494 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004495 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004496 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4497 return -1;
4498 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004499 }
4500 *inpos = newpos;
4501 Py_DECREF(repunicode);
4502 }
4503 return 0;
4504}
4505
Guido van Rossumd57fd912000-03-10 22:53:23 +00004506PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004507 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004508 PyObject *mapping,
4509 const char *errors)
4510{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004511 /* output object */
4512 PyObject *res = NULL;
4513 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004514 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004515 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004516 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004517 PyObject *errorHandler = NULL;
4518 PyObject *exc = NULL;
4519 /* the following variable is used for caching string comparisons
4520 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4521 * 3=ignore, 4=xmlcharrefreplace */
4522 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004523
4524 /* Default to Latin-1 */
4525 if (mapping == NULL)
4526 return PyUnicode_EncodeLatin1(p, size, errors);
4527
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004528 /* allocate enough for a simple encoding without
4529 replacements, if we need more, we'll resize */
Walter Dörwald827b0552007-05-12 13:23:53 +00004530 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004531 if (res == NULL)
4532 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004533 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004534 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004535
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004536 while (inpos<size) {
4537 /* try to encode it */
Walter Dörwald827b0552007-05-12 13:23:53 +00004538 charmapencode_result x = charmapencode_output(p[inpos], mapping, res, &respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004539 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004540 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004541 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004542 if (charmap_encoding_error(p, size, &inpos, mapping,
4543 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004544 &known_errorHandler, &errorHandler, errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004545 res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004546 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004547 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004548 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004549 else
4550 /* done with this character => adjust input position */
4551 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004552 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004553
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004554 /* Resize if we allocated to much */
Walter Dörwald827b0552007-05-12 13:23:53 +00004555 if (respos<PyBytes_GET_SIZE(res)) {
4556 if (PyBytes_Resize(res, respos))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004557 goto onError;
4558 }
4559 Py_XDECREF(exc);
4560 Py_XDECREF(errorHandler);
4561 return res;
4562
4563 onError:
4564 Py_XDECREF(res);
4565 Py_XDECREF(exc);
4566 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004567 return NULL;
4568}
4569
4570PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4571 PyObject *mapping)
4572{
4573 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4574 PyErr_BadArgument();
4575 return NULL;
4576 }
4577 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4578 PyUnicode_GET_SIZE(unicode),
4579 mapping,
4580 NULL);
4581}
4582
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004583/* create or adjust a UnicodeTranslateError */
4584static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004585 const Py_UNICODE *unicode, Py_ssize_t size,
4586 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004587 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004588{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004589 if (*exceptionObject == NULL) {
4590 *exceptionObject = PyUnicodeTranslateError_Create(
4591 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004592 }
4593 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004594 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4595 goto onError;
4596 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4597 goto onError;
4598 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4599 goto onError;
4600 return;
4601 onError:
4602 Py_DECREF(*exceptionObject);
4603 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004604 }
4605}
4606
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004607/* raises a UnicodeTranslateError */
4608static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004609 const Py_UNICODE *unicode, Py_ssize_t size,
4610 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004611 const char *reason)
4612{
4613 make_translate_exception(exceptionObject,
4614 unicode, size, startpos, endpos, reason);
4615 if (*exceptionObject != NULL)
4616 PyCodec_StrictErrors(*exceptionObject);
4617}
4618
4619/* error handling callback helper:
4620 build arguments, call the callback and check the arguments,
4621 put the result into newpos and return the replacement string, which
4622 has to be freed by the caller */
4623static PyObject *unicode_translate_call_errorhandler(const char *errors,
4624 PyObject **errorHandler,
4625 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004626 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4627 Py_ssize_t startpos, Py_ssize_t endpos,
4628 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004629{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004630 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004631
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004632 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004633 PyObject *restuple;
4634 PyObject *resunicode;
4635
4636 if (*errorHandler == NULL) {
4637 *errorHandler = PyCodec_LookupError(errors);
4638 if (*errorHandler == NULL)
4639 return NULL;
4640 }
4641
4642 make_translate_exception(exceptionObject,
4643 unicode, size, startpos, endpos, reason);
4644 if (*exceptionObject == NULL)
4645 return NULL;
4646
4647 restuple = PyObject_CallFunctionObjArgs(
4648 *errorHandler, *exceptionObject, NULL);
4649 if (restuple == NULL)
4650 return NULL;
4651 if (!PyTuple_Check(restuple)) {
4652 PyErr_Format(PyExc_TypeError, &argparse[4]);
4653 Py_DECREF(restuple);
4654 return NULL;
4655 }
4656 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004657 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004658 Py_DECREF(restuple);
4659 return NULL;
4660 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004661 if (i_newpos<0)
4662 *newpos = size+i_newpos;
4663 else
4664 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004665 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004666 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004667 Py_DECREF(restuple);
4668 return NULL;
4669 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004670 Py_INCREF(resunicode);
4671 Py_DECREF(restuple);
4672 return resunicode;
4673}
4674
4675/* Lookup the character ch in the mapping and put the result in result,
4676 which must be decrefed by the caller.
4677 Return 0 on success, -1 on error */
4678static
4679int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4680{
4681 PyObject *w = PyInt_FromLong((long)c);
4682 PyObject *x;
4683
4684 if (w == NULL)
4685 return -1;
4686 x = PyObject_GetItem(mapping, w);
4687 Py_DECREF(w);
4688 if (x == NULL) {
4689 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4690 /* No mapping found means: use 1:1 mapping. */
4691 PyErr_Clear();
4692 *result = NULL;
4693 return 0;
4694 } else
4695 return -1;
4696 }
4697 else if (x == Py_None) {
4698 *result = x;
4699 return 0;
4700 }
4701 else if (PyInt_Check(x)) {
4702 long value = PyInt_AS_LONG(x);
4703 long max = PyUnicode_GetMax();
4704 if (value < 0 || value > max) {
4705 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00004706 "character mapping must be in range(0x%x)", max+1);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004707 Py_DECREF(x);
4708 return -1;
4709 }
4710 *result = x;
4711 return 0;
4712 }
4713 else if (PyUnicode_Check(x)) {
4714 *result = x;
4715 return 0;
4716 }
4717 else {
4718 /* wrong return value */
4719 PyErr_SetString(PyExc_TypeError,
4720 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004721 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004722 return -1;
4723 }
4724}
4725/* ensure that *outobj is at least requiredsize characters long,
4726if not reallocate and adjust various state variables.
4727Return 0 on success, -1 on error */
4728static
Walter Dörwald4894c302003-10-24 14:25:28 +00004729int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004730 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004731{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004732 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004733 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004734 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004735 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004736 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004737 if (requiredsize < 2 * oldsize)
4738 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004739 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004740 return -1;
4741 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004742 }
4743 return 0;
4744}
4745/* lookup the character, put the result in the output string and adjust
4746 various state variables. Return a new reference to the object that
4747 was put in the output buffer in *result, or Py_None, if the mapping was
4748 undefined (in which case no character was written).
4749 The called must decref result.
4750 Return 0 on success, -1 on error. */
4751static
Walter Dörwald4894c302003-10-24 14:25:28 +00004752int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004753 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004754 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004755{
Walter Dörwald4894c302003-10-24 14:25:28 +00004756 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004757 return -1;
4758 if (*res==NULL) {
4759 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004760 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004761 }
4762 else if (*res==Py_None)
4763 ;
4764 else if (PyInt_Check(*res)) {
4765 /* no overflow check, because we know that the space is enough */
4766 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4767 }
4768 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004769 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004770 if (repsize==1) {
4771 /* no overflow check, because we know that the space is enough */
4772 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4773 }
4774 else if (repsize!=0) {
4775 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004776 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004777 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004778 repsize - 1;
4779 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004780 return -1;
4781 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4782 *outp += repsize;
4783 }
4784 }
4785 else
4786 return -1;
4787 return 0;
4788}
4789
4790PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004791 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004792 PyObject *mapping,
4793 const char *errors)
4794{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004795 /* output object */
4796 PyObject *res = NULL;
4797 /* pointers to the beginning and end+1 of input */
4798 const Py_UNICODE *startp = p;
4799 const Py_UNICODE *endp = p + size;
4800 /* pointer into the output */
4801 Py_UNICODE *str;
4802 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004803 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004804 char *reason = "character maps to <undefined>";
4805 PyObject *errorHandler = NULL;
4806 PyObject *exc = NULL;
4807 /* the following variable is used for caching string comparisons
4808 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4809 * 3=ignore, 4=xmlcharrefreplace */
4810 int known_errorHandler = -1;
4811
Guido van Rossumd57fd912000-03-10 22:53:23 +00004812 if (mapping == NULL) {
4813 PyErr_BadArgument();
4814 return NULL;
4815 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004816
4817 /* allocate enough for a simple 1:1 translation without
4818 replacements, if we need more, we'll resize */
4819 res = PyUnicode_FromUnicode(NULL, size);
4820 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004821 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004822 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004823 return res;
4824 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004825
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004826 while (p<endp) {
4827 /* try to encode it */
4828 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004829 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004830 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004831 goto onError;
4832 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004833 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004834 if (x!=Py_None) /* it worked => adjust input pointer */
4835 ++p;
4836 else { /* untranslatable character */
4837 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004838 Py_ssize_t repsize;
4839 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004840 Py_UNICODE *uni2;
4841 /* startpos for collecting untranslatable chars */
4842 const Py_UNICODE *collstart = p;
4843 const Py_UNICODE *collend = p+1;
4844 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004845
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004846 /* find all untranslatable characters */
4847 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004848 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004849 goto onError;
4850 Py_XDECREF(x);
4851 if (x!=Py_None)
4852 break;
4853 ++collend;
4854 }
4855 /* cache callback name lookup
4856 * (if not done yet, i.e. it's the first error) */
4857 if (known_errorHandler==-1) {
4858 if ((errors==NULL) || (!strcmp(errors, "strict")))
4859 known_errorHandler = 1;
4860 else if (!strcmp(errors, "replace"))
4861 known_errorHandler = 2;
4862 else if (!strcmp(errors, "ignore"))
4863 known_errorHandler = 3;
4864 else if (!strcmp(errors, "xmlcharrefreplace"))
4865 known_errorHandler = 4;
4866 else
4867 known_errorHandler = 0;
4868 }
4869 switch (known_errorHandler) {
4870 case 1: /* strict */
4871 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4872 goto onError;
4873 case 2: /* replace */
4874 /* No need to check for space, this is a 1:1 replacement */
4875 for (coll = collstart; coll<collend; ++coll)
4876 *str++ = '?';
4877 /* fall through */
4878 case 3: /* ignore */
4879 p = collend;
4880 break;
4881 case 4: /* xmlcharrefreplace */
4882 /* generate replacement (temporarily (mis)uses p) */
4883 for (p = collstart; p < collend; ++p) {
4884 char buffer[2+29+1+1];
4885 char *cp;
4886 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004887 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004888 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4889 goto onError;
4890 for (cp = buffer; *cp; ++cp)
4891 *str++ = *cp;
4892 }
4893 p = collend;
4894 break;
4895 default:
4896 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4897 reason, startp, size, &exc,
4898 collstart-startp, collend-startp, &newpos);
4899 if (repunicode == NULL)
4900 goto onError;
4901 /* generate replacement */
4902 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004903 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004904 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4905 Py_DECREF(repunicode);
4906 goto onError;
4907 }
4908 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4909 *str++ = *uni2;
4910 p = startp + newpos;
4911 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004912 }
4913 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004914 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004915 /* Resize if we allocated to much */
4916 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004917 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004918 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004919 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004920 }
4921 Py_XDECREF(exc);
4922 Py_XDECREF(errorHandler);
4923 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004924
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004925 onError:
4926 Py_XDECREF(res);
4927 Py_XDECREF(exc);
4928 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004929 return NULL;
4930}
4931
4932PyObject *PyUnicode_Translate(PyObject *str,
4933 PyObject *mapping,
4934 const char *errors)
4935{
4936 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004937
Guido van Rossumd57fd912000-03-10 22:53:23 +00004938 str = PyUnicode_FromObject(str);
4939 if (str == NULL)
4940 goto onError;
4941 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4942 PyUnicode_GET_SIZE(str),
4943 mapping,
4944 errors);
4945 Py_DECREF(str);
4946 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004947
Guido van Rossumd57fd912000-03-10 22:53:23 +00004948 onError:
4949 Py_XDECREF(str);
4950 return NULL;
4951}
Tim Petersced69f82003-09-16 20:30:58 +00004952
Guido van Rossum9e896b32000-04-05 20:11:21 +00004953/* --- Decimal Encoder ---------------------------------------------------- */
4954
4955int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004956 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004957 char *output,
4958 const char *errors)
4959{
4960 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004961 PyObject *errorHandler = NULL;
4962 PyObject *exc = NULL;
4963 const char *encoding = "decimal";
4964 const char *reason = "invalid decimal Unicode string";
4965 /* the following variable is used for caching string comparisons
4966 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4967 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004968
4969 if (output == NULL) {
4970 PyErr_BadArgument();
4971 return -1;
4972 }
4973
4974 p = s;
4975 end = s + length;
4976 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004977 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004978 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004979 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004980 Py_ssize_t repsize;
4981 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004982 Py_UNICODE *uni2;
4983 Py_UNICODE *collstart;
4984 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004985
Guido van Rossum9e896b32000-04-05 20:11:21 +00004986 if (Py_UNICODE_ISSPACE(ch)) {
4987 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004988 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004989 continue;
4990 }
4991 decimal = Py_UNICODE_TODECIMAL(ch);
4992 if (decimal >= 0) {
4993 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004994 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004995 continue;
4996 }
Guido van Rossumba477042000-04-06 18:18:10 +00004997 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004998 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004999 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005000 continue;
5001 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005002 /* All other characters are considered unencodable */
5003 collstart = p;
5004 collend = p+1;
5005 while (collend < end) {
5006 if ((0 < *collend && *collend < 256) ||
5007 !Py_UNICODE_ISSPACE(*collend) ||
5008 Py_UNICODE_TODECIMAL(*collend))
5009 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005010 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005011 /* cache callback name lookup
5012 * (if not done yet, i.e. it's the first error) */
5013 if (known_errorHandler==-1) {
5014 if ((errors==NULL) || (!strcmp(errors, "strict")))
5015 known_errorHandler = 1;
5016 else if (!strcmp(errors, "replace"))
5017 known_errorHandler = 2;
5018 else if (!strcmp(errors, "ignore"))
5019 known_errorHandler = 3;
5020 else if (!strcmp(errors, "xmlcharrefreplace"))
5021 known_errorHandler = 4;
5022 else
5023 known_errorHandler = 0;
5024 }
5025 switch (known_errorHandler) {
5026 case 1: /* strict */
5027 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5028 goto onError;
5029 case 2: /* replace */
5030 for (p = collstart; p < collend; ++p)
5031 *output++ = '?';
5032 /* fall through */
5033 case 3: /* ignore */
5034 p = collend;
5035 break;
5036 case 4: /* xmlcharrefreplace */
5037 /* generate replacement (temporarily (mis)uses p) */
5038 for (p = collstart; p < collend; ++p)
5039 output += sprintf(output, "&#%d;", (int)*p);
5040 p = collend;
5041 break;
5042 default:
5043 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5044 encoding, reason, s, length, &exc,
5045 collstart-s, collend-s, &newpos);
5046 if (repunicode == NULL)
5047 goto onError;
5048 /* generate replacement */
5049 repsize = PyUnicode_GET_SIZE(repunicode);
5050 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5051 Py_UNICODE ch = *uni2;
5052 if (Py_UNICODE_ISSPACE(ch))
5053 *output++ = ' ';
5054 else {
5055 decimal = Py_UNICODE_TODECIMAL(ch);
5056 if (decimal >= 0)
5057 *output++ = '0' + decimal;
5058 else if (0 < ch && ch < 256)
5059 *output++ = (char)ch;
5060 else {
5061 Py_DECREF(repunicode);
5062 raise_encode_exception(&exc, encoding,
5063 s, length, collstart-s, collend-s, reason);
5064 goto onError;
5065 }
5066 }
5067 }
5068 p = s + newpos;
5069 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005070 }
5071 }
5072 /* 0-terminate the output string */
5073 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005074 Py_XDECREF(exc);
5075 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005076 return 0;
5077
5078 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005079 Py_XDECREF(exc);
5080 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005081 return -1;
5082}
5083
Guido van Rossumd57fd912000-03-10 22:53:23 +00005084/* --- Helpers ------------------------------------------------------------ */
5085
Eric Smith8c663262007-08-25 02:26:07 +00005086#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005087
5088#include "stringlib/fastsearch.h"
5089
5090#include "stringlib/count.h"
5091#include "stringlib/find.h"
5092#include "stringlib/partition.h"
5093
/* Helper macro to fix up start/end slice values.

   Operates on the variables `start` and `end` that must be in scope at
   the point of use; obj is a pointer to a struct with a `length` member.
   A negative start is taken relative to the length and then clamped to 0;
   an end beyond the length is clamped to the length, a negative end is
   taken relative to the length and then clamped to 0.

   Wrapped in do { } while (0) so that `FIX_START_END(x);` is a single
   statement and stays correct inside an unbraced if/else. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
5106
Martin v. Löwis18e16552006-02-15 17:27:45 +00005107Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005108 PyObject *substr,
5109 Py_ssize_t start,
5110 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005111{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005112 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005113 PyUnicodeObject* str_obj;
5114 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005115
Thomas Wouters477c8d52006-05-27 19:21:47 +00005116 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5117 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005118 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005119 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5120 if (!sub_obj) {
5121 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005122 return -1;
5123 }
Tim Petersced69f82003-09-16 20:30:58 +00005124
Thomas Wouters477c8d52006-05-27 19:21:47 +00005125 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005126
Thomas Wouters477c8d52006-05-27 19:21:47 +00005127 result = stringlib_count(
5128 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5129 );
5130
5131 Py_DECREF(sub_obj);
5132 Py_DECREF(str_obj);
5133
Guido van Rossumd57fd912000-03-10 22:53:23 +00005134 return result;
5135}
5136
Martin v. Löwis18e16552006-02-15 17:27:45 +00005137Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005138 PyObject *sub,
5139 Py_ssize_t start,
5140 Py_ssize_t end,
5141 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005142{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005143 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005144
Guido van Rossumd57fd912000-03-10 22:53:23 +00005145 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005146 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005147 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005148 sub = PyUnicode_FromObject(sub);
5149 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00005150 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005151 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005152 }
Tim Petersced69f82003-09-16 20:30:58 +00005153
Thomas Wouters477c8d52006-05-27 19:21:47 +00005154 if (direction > 0)
5155 result = stringlib_find_slice(
5156 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5157 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5158 start, end
5159 );
5160 else
5161 result = stringlib_rfind_slice(
5162 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5163 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5164 start, end
5165 );
5166
Guido van Rossumd57fd912000-03-10 22:53:23 +00005167 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005168 Py_DECREF(sub);
5169
Guido van Rossumd57fd912000-03-10 22:53:23 +00005170 return result;
5171}
5172
Tim Petersced69f82003-09-16 20:30:58 +00005173static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005174int tailmatch(PyUnicodeObject *self,
5175 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005176 Py_ssize_t start,
5177 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005178 int direction)
5179{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005180 if (substring->length == 0)
5181 return 1;
5182
Thomas Wouters477c8d52006-05-27 19:21:47 +00005183 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005184
5185 end -= substring->length;
5186 if (end < start)
5187 return 0;
5188
5189 if (direction > 0) {
5190 if (Py_UNICODE_MATCH(self, end, substring))
5191 return 1;
5192 } else {
5193 if (Py_UNICODE_MATCH(self, start, substring))
5194 return 1;
5195 }
5196
5197 return 0;
5198}
5199
Martin v. Löwis18e16552006-02-15 17:27:45 +00005200Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005201 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005202 Py_ssize_t start,
5203 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005204 int direction)
5205{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005206 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005207
Guido van Rossumd57fd912000-03-10 22:53:23 +00005208 str = PyUnicode_FromObject(str);
5209 if (str == NULL)
5210 return -1;
5211 substr = PyUnicode_FromObject(substr);
5212 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005213 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005214 return -1;
5215 }
Tim Petersced69f82003-09-16 20:30:58 +00005216
Guido van Rossumd57fd912000-03-10 22:53:23 +00005217 result = tailmatch((PyUnicodeObject *)str,
5218 (PyUnicodeObject *)substr,
5219 start, end, direction);
5220 Py_DECREF(str);
5221 Py_DECREF(substr);
5222 return result;
5223}
5224
Guido van Rossumd57fd912000-03-10 22:53:23 +00005225/* Apply fixfct filter to the Unicode object self and return a
5226 reference to the modified object */
5227
Tim Petersced69f82003-09-16 20:30:58 +00005228static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005229PyObject *fixup(PyUnicodeObject *self,
5230 int (*fixfct)(PyUnicodeObject *s))
5231{
5232
5233 PyUnicodeObject *u;
5234
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005235 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005236 if (u == NULL)
5237 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005238
5239 Py_UNICODE_COPY(u->str, self->str, self->length);
5240
Tim Peters7a29bd52001-09-12 03:03:31 +00005241 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005242 /* fixfct should return TRUE if it modified the buffer. If
5243 FALSE, return a reference to the original buffer instead
5244 (to save space, not time) */
5245 Py_INCREF(self);
5246 Py_DECREF(u);
5247 return (PyObject*) self;
5248 }
5249 return (PyObject*) u;
5250}
5251
Tim Petersced69f82003-09-16 20:30:58 +00005252static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005253int fixupper(PyUnicodeObject *self)
5254{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005255 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005256 Py_UNICODE *s = self->str;
5257 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005258
Guido van Rossumd57fd912000-03-10 22:53:23 +00005259 while (len-- > 0) {
5260 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005261
Guido van Rossumd57fd912000-03-10 22:53:23 +00005262 ch = Py_UNICODE_TOUPPER(*s);
5263 if (ch != *s) {
5264 status = 1;
5265 *s = ch;
5266 }
5267 s++;
5268 }
5269
5270 return status;
5271}
5272
Tim Petersced69f82003-09-16 20:30:58 +00005273static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005274int fixlower(PyUnicodeObject *self)
5275{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005276 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005277 Py_UNICODE *s = self->str;
5278 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005279
Guido van Rossumd57fd912000-03-10 22:53:23 +00005280 while (len-- > 0) {
5281 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005282
Guido van Rossumd57fd912000-03-10 22:53:23 +00005283 ch = Py_UNICODE_TOLOWER(*s);
5284 if (ch != *s) {
5285 status = 1;
5286 *s = ch;
5287 }
5288 s++;
5289 }
5290
5291 return status;
5292}
5293
Tim Petersced69f82003-09-16 20:30:58 +00005294static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005295int fixswapcase(PyUnicodeObject *self)
5296{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005297 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005298 Py_UNICODE *s = self->str;
5299 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005300
Guido van Rossumd57fd912000-03-10 22:53:23 +00005301 while (len-- > 0) {
5302 if (Py_UNICODE_ISUPPER(*s)) {
5303 *s = Py_UNICODE_TOLOWER(*s);
5304 status = 1;
5305 } else if (Py_UNICODE_ISLOWER(*s)) {
5306 *s = Py_UNICODE_TOUPPER(*s);
5307 status = 1;
5308 }
5309 s++;
5310 }
5311
5312 return status;
5313}
5314
Tim Petersced69f82003-09-16 20:30:58 +00005315static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005316int fixcapitalize(PyUnicodeObject *self)
5317{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005318 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005319 Py_UNICODE *s = self->str;
5320 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005321
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005322 if (len == 0)
5323 return 0;
5324 if (Py_UNICODE_ISLOWER(*s)) {
5325 *s = Py_UNICODE_TOUPPER(*s);
5326 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005327 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005328 s++;
5329 while (--len > 0) {
5330 if (Py_UNICODE_ISUPPER(*s)) {
5331 *s = Py_UNICODE_TOLOWER(*s);
5332 status = 1;
5333 }
5334 s++;
5335 }
5336 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005337}
5338
5339static
5340int fixtitle(PyUnicodeObject *self)
5341{
5342 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5343 register Py_UNICODE *e;
5344 int previous_is_cased;
5345
5346 /* Shortcut for single character strings */
5347 if (PyUnicode_GET_SIZE(self) == 1) {
5348 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5349 if (*p != ch) {
5350 *p = ch;
5351 return 1;
5352 }
5353 else
5354 return 0;
5355 }
Tim Petersced69f82003-09-16 20:30:58 +00005356
Guido van Rossumd57fd912000-03-10 22:53:23 +00005357 e = p + PyUnicode_GET_SIZE(self);
5358 previous_is_cased = 0;
5359 for (; p < e; p++) {
5360 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005361
Guido van Rossumd57fd912000-03-10 22:53:23 +00005362 if (previous_is_cased)
5363 *p = Py_UNICODE_TOLOWER(ch);
5364 else
5365 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005366
5367 if (Py_UNICODE_ISLOWER(ch) ||
5368 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005369 Py_UNICODE_ISTITLE(ch))
5370 previous_is_cased = 1;
5371 else
5372 previous_is_cased = 0;
5373 }
5374 return 1;
5375}
5376
Tim Peters8ce9f162004-08-27 01:49:32 +00005377PyObject *
5378PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005379{
Tim Peters8ce9f162004-08-27 01:49:32 +00005380 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005381 const Py_UNICODE blank = ' ';
5382 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005383 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005384 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005385 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5386 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005387 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5388 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005389 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005390 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005391 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005392
Tim Peters05eba1f2004-08-27 21:32:02 +00005393 fseq = PySequence_Fast(seq, "");
5394 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005395 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005396 }
5397
Tim Peters91879ab2004-08-27 22:35:44 +00005398 /* Grrrr. A codec may be invoked to convert str objects to
5399 * Unicode, and so it's possible to call back into Python code
5400 * during PyUnicode_FromObject(), and so it's possible for a sick
5401 * codec to change the size of fseq (if seq is a list). Therefore
5402 * we have to keep refetching the size -- can't assume seqlen
5403 * is invariant.
5404 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005405 seqlen = PySequence_Fast_GET_SIZE(fseq);
5406 /* If empty sequence, return u"". */
5407 if (seqlen == 0) {
5408 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5409 goto Done;
5410 }
5411 /* If singleton sequence with an exact Unicode, return that. */
5412 if (seqlen == 1) {
5413 item = PySequence_Fast_GET_ITEM(fseq, 0);
5414 if (PyUnicode_CheckExact(item)) {
5415 Py_INCREF(item);
5416 res = (PyUnicodeObject *)item;
5417 goto Done;
5418 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005419 }
5420
Tim Peters05eba1f2004-08-27 21:32:02 +00005421 /* At least two items to join, or one that isn't exact Unicode. */
5422 if (seqlen > 1) {
5423 /* Set up sep and seplen -- they're needed. */
5424 if (separator == NULL) {
5425 sep = &blank;
5426 seplen = 1;
5427 }
5428 else {
5429 internal_separator = PyUnicode_FromObject(separator);
5430 if (internal_separator == NULL)
5431 goto onError;
5432 sep = PyUnicode_AS_UNICODE(internal_separator);
5433 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00005434 /* In case PyUnicode_FromObject() mutated seq. */
5435 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005436 }
5437 }
5438
5439 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005440 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005441 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005442 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005443 res_p = PyUnicode_AS_UNICODE(res);
5444 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005445
Tim Peters05eba1f2004-08-27 21:32:02 +00005446 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005447 Py_ssize_t itemlen;
5448 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005449
5450 item = PySequence_Fast_GET_ITEM(fseq, i);
5451 /* Convert item to Unicode. */
Guido van Rossumf1044292007-09-27 18:01:22 +00005452 if (!PyString_Check(item) && !PyUnicode_Check(item))
5453 {
5454 if (PyBytes_Check(item))
5455 {
5456 PyErr_Format(PyExc_TypeError,
5457 "sequence item %d: join() will not operate on "
5458 "bytes objects", i);
5459 goto onError;
5460 }
5461 item = PyObject_Unicode(item);
Tim Peters8ce9f162004-08-27 01:49:32 +00005462 }
Guido van Rossumf1044292007-09-27 18:01:22 +00005463 else
5464 item = PyUnicode_FromObject(item);
5465
Tim Peters05eba1f2004-08-27 21:32:02 +00005466 if (item == NULL)
5467 goto onError;
5468 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005469
Tim Peters91879ab2004-08-27 22:35:44 +00005470 /* In case PyUnicode_FromObject() mutated seq. */
5471 seqlen = PySequence_Fast_GET_SIZE(fseq);
5472
Tim Peters8ce9f162004-08-27 01:49:32 +00005473 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005474 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005475 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005476 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005477 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005478 if (i < seqlen - 1) {
5479 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005480 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00005481 goto Overflow;
5482 }
5483 if (new_res_used > res_alloc) {
5484 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00005485 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00005486 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005487 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005488 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005489 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005490 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005491 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005492 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005493 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005494 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005495 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005496
5497 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005498 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005499 res_p += itemlen;
5500 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005501 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005502 res_p += seplen;
5503 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005504 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005505 res_used = new_res_used;
5506 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005507
Tim Peters05eba1f2004-08-27 21:32:02 +00005508 /* Shrink res to match the used area; this probably can't fail,
5509 * but it's cheap to check.
5510 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005511 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005512 goto onError;
5513
5514 Done:
5515 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005516 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005517 return (PyObject *)res;
5518
Tim Peters8ce9f162004-08-27 01:49:32 +00005519 Overflow:
5520 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005521 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005522 Py_DECREF(item);
5523 /* fall through */
5524
Guido van Rossumd57fd912000-03-10 22:53:23 +00005525 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005526 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005527 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005528 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005529 return NULL;
5530}
5531
Tim Petersced69f82003-09-16 20:30:58 +00005532static
5533PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005534 Py_ssize_t left,
5535 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005536 Py_UNICODE fill)
5537{
5538 PyUnicodeObject *u;
5539
5540 if (left < 0)
5541 left = 0;
5542 if (right < 0)
5543 right = 0;
5544
Tim Peters7a29bd52001-09-12 03:03:31 +00005545 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005546 Py_INCREF(self);
5547 return self;
5548 }
5549
5550 u = _PyUnicode_New(left + self->length + right);
5551 if (u) {
5552 if (left)
5553 Py_UNICODE_FILL(u->str, fill, left);
5554 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5555 if (right)
5556 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5557 }
5558
5559 return u;
5560}
5561
/* Append the slice data[left:right] to `list` as a new Unicode object.

   The expanding function must provide a `PyObject *str` scratch variable,
   a `list` variable and an `onError` label; control jumps to onError when
   either creating the slice or appending it fails.  The reference to the
   slice is always released here (the list holds its own on success).

   The body is wrapped in do { } while (0) so that the macro expands to a
   single statement and composes safely with if/else at the call site. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
5572
5573static
5574PyObject *split_whitespace(PyUnicodeObject *self,
5575 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005576 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005577{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005578 register Py_ssize_t i;
5579 register Py_ssize_t j;
5580 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005581 PyObject *str;
5582
5583 for (i = j = 0; i < len; ) {
5584 /* find a token */
5585 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5586 i++;
5587 j = i;
5588 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
5589 i++;
5590 if (j < i) {
5591 if (maxcount-- <= 0)
5592 break;
5593 SPLIT_APPEND(self->str, j, i);
5594 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5595 i++;
5596 j = i;
5597 }
5598 }
5599 if (j < len) {
5600 SPLIT_APPEND(self->str, j, len);
5601 }
5602 return list;
5603
5604 onError:
5605 Py_DECREF(list);
5606 return NULL;
5607}
5608
5609PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005610 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005611{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005612 register Py_ssize_t i;
5613 register Py_ssize_t j;
5614 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005615 PyObject *list;
5616 PyObject *str;
5617 Py_UNICODE *data;
5618
5619 string = PyUnicode_FromObject(string);
5620 if (string == NULL)
5621 return NULL;
5622 data = PyUnicode_AS_UNICODE(string);
5623 len = PyUnicode_GET_SIZE(string);
5624
Guido van Rossumd57fd912000-03-10 22:53:23 +00005625 list = PyList_New(0);
5626 if (!list)
5627 goto onError;
5628
5629 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005630 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005631
Guido van Rossumd57fd912000-03-10 22:53:23 +00005632 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005633 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005634 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005635
5636 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005637 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005638 if (i < len) {
5639 if (data[i] == '\r' && i + 1 < len &&
5640 data[i+1] == '\n')
5641 i += 2;
5642 else
5643 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005644 if (keepends)
5645 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005646 }
Guido van Rossum86662912000-04-11 15:38:46 +00005647 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005648 j = i;
5649 }
5650 if (j < len) {
5651 SPLIT_APPEND(data, j, len);
5652 }
5653
5654 Py_DECREF(string);
5655 return list;
5656
5657 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005658 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005659 Py_DECREF(string);
5660 return NULL;
5661}
5662
Tim Petersced69f82003-09-16 20:30:58 +00005663static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005664PyObject *split_char(PyUnicodeObject *self,
5665 PyObject *list,
5666 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005667 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005668{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005669 register Py_ssize_t i;
5670 register Py_ssize_t j;
5671 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005672 PyObject *str;
5673
5674 for (i = j = 0; i < len; ) {
5675 if (self->str[i] == ch) {
5676 if (maxcount-- <= 0)
5677 break;
5678 SPLIT_APPEND(self->str, j, i);
5679 i = j = i + 1;
5680 } else
5681 i++;
5682 }
5683 if (j <= len) {
5684 SPLIT_APPEND(self->str, j, len);
5685 }
5686 return list;
5687
5688 onError:
5689 Py_DECREF(list);
5690 return NULL;
5691}
5692
Tim Petersced69f82003-09-16 20:30:58 +00005693static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005694PyObject *split_substring(PyUnicodeObject *self,
5695 PyObject *list,
5696 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005697 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005698{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005699 register Py_ssize_t i;
5700 register Py_ssize_t j;
5701 Py_ssize_t len = self->length;
5702 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005703 PyObject *str;
5704
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005705 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005706 if (Py_UNICODE_MATCH(self, i, substring)) {
5707 if (maxcount-- <= 0)
5708 break;
5709 SPLIT_APPEND(self->str, j, i);
5710 i = j = i + sublen;
5711 } else
5712 i++;
5713 }
5714 if (j <= len) {
5715 SPLIT_APPEND(self->str, j, len);
5716 }
5717 return list;
5718
5719 onError:
5720 Py_DECREF(list);
5721 return NULL;
5722}
5723
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005724static
5725PyObject *rsplit_whitespace(PyUnicodeObject *self,
5726 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005727 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005728{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005729 register Py_ssize_t i;
5730 register Py_ssize_t j;
5731 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005732 PyObject *str;
5733
5734 for (i = j = len - 1; i >= 0; ) {
5735 /* find a token */
5736 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5737 i--;
5738 j = i;
5739 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
5740 i--;
5741 if (j > i) {
5742 if (maxcount-- <= 0)
5743 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005744 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005745 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5746 i--;
5747 j = i;
5748 }
5749 }
5750 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005751 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005752 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005753 if (PyList_Reverse(list) < 0)
5754 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005755 return list;
5756
5757 onError:
5758 Py_DECREF(list);
5759 return NULL;
5760}
5761
5762static
5763PyObject *rsplit_char(PyUnicodeObject *self,
5764 PyObject *list,
5765 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005766 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005767{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005768 register Py_ssize_t i;
5769 register Py_ssize_t j;
5770 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005771 PyObject *str;
5772
5773 for (i = j = len - 1; i >= 0; ) {
5774 if (self->str[i] == ch) {
5775 if (maxcount-- <= 0)
5776 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005777 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005778 j = i = i - 1;
5779 } else
5780 i--;
5781 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005782 if (j >= -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005783 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005784 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005785 if (PyList_Reverse(list) < 0)
5786 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005787 return list;
5788
5789 onError:
5790 Py_DECREF(list);
5791 return NULL;
5792}
5793
5794static
5795PyObject *rsplit_substring(PyUnicodeObject *self,
5796 PyObject *list,
5797 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005798 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005799{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005800 register Py_ssize_t i;
5801 register Py_ssize_t j;
5802 Py_ssize_t len = self->length;
5803 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005804 PyObject *str;
5805
5806 for (i = len - sublen, j = len; i >= 0; ) {
5807 if (Py_UNICODE_MATCH(self, i, substring)) {
5808 if (maxcount-- <= 0)
5809 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005810 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005811 j = i;
5812 i -= sublen;
5813 } else
5814 i--;
5815 }
5816 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005817 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005818 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005819 if (PyList_Reverse(list) < 0)
5820 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005821 return list;
5822
5823 onError:
5824 Py_DECREF(list);
5825 return NULL;
5826}
5827
Guido van Rossumd57fd912000-03-10 22:53:23 +00005828#undef SPLIT_APPEND
5829
5830static
5831PyObject *split(PyUnicodeObject *self,
5832 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005833 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005834{
5835 PyObject *list;
5836
5837 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005838 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005839
5840 list = PyList_New(0);
5841 if (!list)
5842 return NULL;
5843
5844 if (substring == NULL)
5845 return split_whitespace(self,list,maxcount);
5846
5847 else if (substring->length == 1)
5848 return split_char(self,list,substring->str[0],maxcount);
5849
5850 else if (substring->length == 0) {
5851 Py_DECREF(list);
5852 PyErr_SetString(PyExc_ValueError, "empty separator");
5853 return NULL;
5854 }
5855 else
5856 return split_substring(self,list,substring,maxcount);
5857}
5858
Tim Petersced69f82003-09-16 20:30:58 +00005859static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005860PyObject *rsplit(PyUnicodeObject *self,
5861 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005862 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005863{
5864 PyObject *list;
5865
5866 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005867 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005868
5869 list = PyList_New(0);
5870 if (!list)
5871 return NULL;
5872
5873 if (substring == NULL)
5874 return rsplit_whitespace(self,list,maxcount);
5875
5876 else if (substring->length == 1)
5877 return rsplit_char(self,list,substring->str[0],maxcount);
5878
5879 else if (substring->length == 0) {
5880 Py_DECREF(list);
5881 PyErr_SetString(PyExc_ValueError, "empty separator");
5882 return NULL;
5883 }
5884 else
5885 return rsplit_substring(self,list,substring,maxcount);
5886}
5887
5888static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005889PyObject *replace(PyUnicodeObject *self,
5890 PyUnicodeObject *str1,
5891 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005892 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005893{
5894 PyUnicodeObject *u;
5895
5896 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005897 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005898
Thomas Wouters477c8d52006-05-27 19:21:47 +00005899 if (str1->length == str2->length) {
5900 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005901 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005902 if (str1->length == 1) {
5903 /* replace characters */
5904 Py_UNICODE u1, u2;
5905 if (!findchar(self->str, self->length, str1->str[0]))
5906 goto nothing;
5907 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5908 if (!u)
5909 return NULL;
5910 Py_UNICODE_COPY(u->str, self->str, self->length);
5911 u1 = str1->str[0];
5912 u2 = str2->str[0];
5913 for (i = 0; i < u->length; i++)
5914 if (u->str[i] == u1) {
5915 if (--maxcount < 0)
5916 break;
5917 u->str[i] = u2;
5918 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005919 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005920 i = fastsearch(
5921 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005922 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00005923 if (i < 0)
5924 goto nothing;
5925 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5926 if (!u)
5927 return NULL;
5928 Py_UNICODE_COPY(u->str, self->str, self->length);
5929 while (i <= self->length - str1->length)
5930 if (Py_UNICODE_MATCH(self, i, str1)) {
5931 if (--maxcount < 0)
5932 break;
5933 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5934 i += str1->length;
5935 } else
5936 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005937 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005938 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005939
5940 Py_ssize_t n, i, j, e;
5941 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005942 Py_UNICODE *p;
5943
5944 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005945 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005946 if (n > maxcount)
5947 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005948 if (n == 0)
5949 goto nothing;
5950 /* new_size = self->length + n * (str2->length - str1->length)); */
5951 delta = (str2->length - str1->length);
5952 if (delta == 0) {
5953 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005954 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005955 product = n * (str2->length - str1->length);
5956 if ((product / (str2->length - str1->length)) != n) {
5957 PyErr_SetString(PyExc_OverflowError,
5958 "replace string is too long");
5959 return NULL;
5960 }
5961 new_size = self->length + product;
5962 if (new_size < 0) {
5963 PyErr_SetString(PyExc_OverflowError,
5964 "replace string is too long");
5965 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005966 }
5967 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005968 u = _PyUnicode_New(new_size);
5969 if (!u)
5970 return NULL;
5971 i = 0;
5972 p = u->str;
5973 e = self->length - str1->length;
5974 if (str1->length > 0) {
5975 while (n-- > 0) {
5976 /* look for next match */
5977 j = i;
5978 while (j <= e) {
5979 if (Py_UNICODE_MATCH(self, j, str1))
5980 break;
5981 j++;
5982 }
5983 if (j > i) {
5984 if (j > e)
5985 break;
5986 /* copy unchanged part [i:j] */
5987 Py_UNICODE_COPY(p, self->str+i, j-i);
5988 p += j - i;
5989 }
5990 /* copy substitution string */
5991 if (str2->length > 0) {
5992 Py_UNICODE_COPY(p, str2->str, str2->length);
5993 p += str2->length;
5994 }
5995 i = j + str1->length;
5996 }
5997 if (i < self->length)
5998 /* copy tail [i:] */
5999 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6000 } else {
6001 /* interleave */
6002 while (n > 0) {
6003 Py_UNICODE_COPY(p, str2->str, str2->length);
6004 p += str2->length;
6005 if (--n <= 0)
6006 break;
6007 *p++ = self->str[i++];
6008 }
6009 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6010 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006011 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006012 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006013
6014nothing:
6015 /* nothing to replace; return original string (when possible) */
6016 if (PyUnicode_CheckExact(self)) {
6017 Py_INCREF(self);
6018 return (PyObject *) self;
6019 }
6020 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006021}
6022
6023/* --- Unicode Object Methods --------------------------------------------- */
6024
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006025PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006026"S.title() -> unicode\n\
6027\n\
6028Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006029characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006030
6031static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006032unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006033{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006034 return fixup(self, fixtitle);
6035}
6036
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006037PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006038"S.capitalize() -> unicode\n\
6039\n\
6040Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006041have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006042
6043static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006044unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006045{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006046 return fixup(self, fixcapitalize);
6047}
6048
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split on whitespace,
   capitalize every word in place in the temporary list, then join the
   words with the default separator.  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list slot */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyUnicodeObject *word = (PyUnicodeObject *)PyList_GET_ITEM(words, idx);

        result = fixup(word, fixcapitalize);
        if (result == NULL)
            goto onError;
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, result);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

  onError:
    /* result is NULL here when capitalizing a word failed */
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
6086
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006087/* Argument converter. Coerces to a single unicode character */
6088
6089static int
6090convert_uc(PyObject *obj, void *addr)
6091{
6092 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6093 PyObject *uniobj;
6094 Py_UNICODE *unistr;
6095
6096 uniobj = PyUnicode_FromObject(obj);
6097 if (uniobj == NULL) {
6098 PyErr_SetString(PyExc_TypeError,
6099 "The fill character cannot be converted to Unicode");
6100 return 0;
6101 }
6102 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6103 PyErr_SetString(PyExc_TypeError,
6104 "The fill character must be exactly one character long");
6105 Py_DECREF(uniobj);
6106 return 0;
6107 }
6108 unistr = PyUnicode_AS_UNICODE(uniobj);
6109 *fillcharloc = unistr[0];
6110 Py_DECREF(uniobj);
6111 return 1;
6112}
6113
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006114PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006115"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006116\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006117Return S centered in a Unicode string of length width. Padding is\n\
6118done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006119
6120static PyObject *
6121unicode_center(PyUnicodeObject *self, PyObject *args)
6122{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006123 Py_ssize_t marg, left;
6124 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006125 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006126
Thomas Woutersde017742006-02-16 19:34:37 +00006127 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006128 return NULL;
6129
Tim Peters7a29bd52001-09-12 03:03:31 +00006130 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006131 Py_INCREF(self);
6132 return (PyObject*) self;
6133 }
6134
6135 marg = width - self->length;
6136 left = marg / 2 + (marg & width & 1);
6137
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006138 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006139}
6140
Marc-André Lemburge5034372000-08-08 08:04:29 +00006141#if 0
6142
6143/* This code should go into some future Unicode collation support
6144 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006145 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006146
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006147/* speedy UTF-16 code point order comparison */
6148/* gleaned from: */
6149/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6150
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006151static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006152{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006153 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006154 0, 0, 0, 0, 0, 0, 0, 0,
6155 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006156 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006157};
6158
Guido van Rossumd57fd912000-03-10 22:53:23 +00006159static int
6160unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6161{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006162 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006163
Guido van Rossumd57fd912000-03-10 22:53:23 +00006164 Py_UNICODE *s1 = str1->str;
6165 Py_UNICODE *s2 = str2->str;
6166
6167 len1 = str1->length;
6168 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006169
Guido van Rossumd57fd912000-03-10 22:53:23 +00006170 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006171 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006172
6173 c1 = *s1++;
6174 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006175
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006176 if (c1 > (1<<11) * 26)
6177 c1 += utf16Fixup[c1>>11];
6178 if (c2 > (1<<11) * 26)
6179 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006180 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006181
6182 if (c1 != c2)
6183 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006184
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006185 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006186 }
6187
6188 return (len1 < len2) ? -1 : (len1 != len2);
6189}
6190
Marc-André Lemburge5034372000-08-08 08:04:29 +00006191#else
6192
6193static int
6194unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6195{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006196 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006197
6198 Py_UNICODE *s1 = str1->str;
6199 Py_UNICODE *s2 = str2->str;
6200
6201 len1 = str1->length;
6202 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006203
Marc-André Lemburge5034372000-08-08 08:04:29 +00006204 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006205 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006206
Fredrik Lundh45714e92001-06-26 16:39:36 +00006207 c1 = *s1++;
6208 c2 = *s2++;
6209
6210 if (c1 != c2)
6211 return (c1 < c2) ? -1 : 1;
6212
Marc-André Lemburge5034372000-08-08 08:04:29 +00006213 len1--; len2--;
6214 }
6215
6216 return (len1 < len2) ? -1 : (len1 != len2);
6217}
6218
6219#endif
6220
Guido van Rossumd57fd912000-03-10 22:53:23 +00006221int PyUnicode_Compare(PyObject *left,
6222 PyObject *right)
6223{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006224 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6225 return unicode_compare((PyUnicodeObject *)left,
6226 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006227 PyErr_Format(PyExc_TypeError,
6228 "Can't compare %.100s and %.100s",
6229 left->ob_type->tp_name,
6230 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006231 return -1;
6232}
6233
Martin v. Löwis5b222132007-06-10 09:51:05 +00006234int
6235PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6236{
6237 int i;
6238 Py_UNICODE *id;
6239 assert(PyUnicode_Check(uni));
6240 id = PyUnicode_AS_UNICODE(uni);
6241 /* Compare Unicode string and source character set string */
6242 for (i = 0; id[i] && str[i]; i++)
6243 if (id[i] != str[i])
6244 return ((int)id[i] < (int)str[i]) ? -1 : 1;
6245 if (id[i])
6246 return 1; /* uni is longer */
6247 if (str[i])
6248 return -1; /* str is longer */
6249 return 0;
6250}
6251
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006252PyObject *PyUnicode_RichCompare(PyObject *left,
6253 PyObject *right,
6254 int op)
6255{
6256 int result;
6257
6258 result = PyUnicode_Compare(left, right);
6259 if (result == -1 && PyErr_Occurred())
6260 goto onError;
6261
6262 /* Convert the return value to a Boolean */
6263 switch (op) {
6264 case Py_EQ:
6265 result = (result == 0);
6266 break;
6267 case Py_NE:
6268 result = (result != 0);
6269 break;
6270 case Py_LE:
6271 result = (result <= 0);
6272 break;
6273 case Py_GE:
6274 result = (result >= 0);
6275 break;
6276 case Py_LT:
6277 result = (result == -1);
6278 break;
6279 case Py_GT:
6280 result = (result == 1);
6281 break;
6282 }
6283 return PyBool_FromLong(result);
6284
6285 onError:
6286
6287 /* Standard case
6288
6289 Type errors mean that PyUnicode_FromObject() could not convert
6290 one of the arguments (usually the right hand side) to Unicode,
6291 ie. we can't handle the comparison request. However, it is
6292 possible that the other object knows a comparison method, which
6293 is why we return Py_NotImplemented to give the other object a
6294 chance.
6295
6296 */
6297 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6298 PyErr_Clear();
6299 Py_INCREF(Py_NotImplemented);
6300 return Py_NotImplemented;
6301 }
6302 if (op != Py_EQ && op != Py_NE)
6303 return NULL;
6304
6305 /* Equality comparison.
6306
6307 This is a special case: we silence any PyExc_UnicodeDecodeError
6308 and instead turn it into a PyErr_UnicodeWarning.
6309
6310 */
6311 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6312 return NULL;
6313 PyErr_Clear();
Skip Montanaro46fc3372007-08-12 11:44:53 +00006314 if (PyErr_WarnEx(PyExc_UnicodeWarning,
6315 (op == Py_EQ) ?
6316 "Unicode equal comparison "
6317 "failed to convert both arguments to Unicode - "
6318 "interpreting them as being unequal"
6319 :
6320 "Unicode unequal comparison "
6321 "failed to convert both arguments to Unicode - "
6322 "interpreting them as being unequal",
6323 1) < 0)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006324 return NULL;
6325 result = (op == Py_NE);
6326 return PyBool_FromLong(result);
6327}
6328
Guido van Rossum403d68b2000-03-13 15:55:09 +00006329int PyUnicode_Contains(PyObject *container,
6330 PyObject *element)
6331{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006332 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006333 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006334
6335 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006336 sub = PyUnicode_FromObject(element);
6337 if (!sub) {
Walter Dörwald26e0f512007-06-12 16:51:31 +00006338 PyErr_Format(PyExc_TypeError,
6339 "'in <string>' requires string as left operand, not %s",
6340 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006341 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006342 }
6343
Thomas Wouters477c8d52006-05-27 19:21:47 +00006344 str = PyUnicode_FromObject(container);
6345 if (!str) {
6346 Py_DECREF(sub);
6347 return -1;
6348 }
6349
6350 result = stringlib_contains_obj(str, sub);
6351
6352 Py_DECREF(str);
6353 Py_DECREF(sub);
6354
Guido van Rossum403d68b2000-03-13 15:55:09 +00006355 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006356}
6357
Guido van Rossumd57fd912000-03-10 22:53:23 +00006358/* Concat to string or Unicode object giving a new Unicode object. */
6359
6360PyObject *PyUnicode_Concat(PyObject *left,
6361 PyObject *right)
6362{
6363 PyUnicodeObject *u = NULL, *v = NULL, *w;
6364
Guido van Rossum84d79dd2007-04-13 02:23:57 +00006365 if (PyBytes_Check(left) || PyBytes_Check(right))
6366 return PyBytes_Concat(left, right);
6367
Guido van Rossumd57fd912000-03-10 22:53:23 +00006368 /* Coerce the two arguments */
6369 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6370 if (u == NULL)
6371 goto onError;
6372 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6373 if (v == NULL)
6374 goto onError;
6375
6376 /* Shortcuts */
6377 if (v == unicode_empty) {
6378 Py_DECREF(v);
6379 return (PyObject *)u;
6380 }
6381 if (u == unicode_empty) {
6382 Py_DECREF(u);
6383 return (PyObject *)v;
6384 }
6385
6386 /* Concat the two Unicode strings */
6387 w = _PyUnicode_New(u->length + v->length);
6388 if (w == NULL)
6389 goto onError;
6390 Py_UNICODE_COPY(w->str, u->str, u->length);
6391 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6392
6393 Py_DECREF(u);
6394 Py_DECREF(v);
6395 return (PyObject *)w;
6396
6397onError:
6398 Py_XDECREF(u);
6399 Py_XDECREF(v);
6400 return NULL;
6401}
6402
Walter Dörwald1ab83302007-05-18 17:15:44 +00006403void
6404PyUnicode_Append(PyObject **pleft, PyObject *right)
6405{
6406 PyObject *new;
6407 if (*pleft == NULL)
6408 return;
6409 if (right == NULL || !PyUnicode_Check(*pleft)) {
6410 Py_DECREF(*pleft);
6411 *pleft = NULL;
6412 return;
6413 }
6414 new = PyUnicode_Concat(*pleft, right);
6415 Py_DECREF(*pleft);
6416 *pleft = new;
6417}
6418
6419void
6420PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6421{
6422 PyUnicode_Append(pleft, right);
6423 Py_XDECREF(right);
6424}
6425
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006426PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006427"S.count(sub[, start[, end]]) -> int\n\
6428\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006429Return the number of non-overlapping occurrences of substring sub in\n\
6430Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006431interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006432
6433static PyObject *
6434unicode_count(PyUnicodeObject *self, PyObject *args)
6435{
6436 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006437 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006438 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006439 PyObject *result;
6440
Guido van Rossumb8872e62000-05-09 14:14:27 +00006441 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6442 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006443 return NULL;
6444
6445 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006446 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006447 if (substring == NULL)
6448 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006449
Thomas Wouters477c8d52006-05-27 19:21:47 +00006450 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006451
Thomas Wouters477c8d52006-05-27 19:21:47 +00006452 result = PyInt_FromSsize_t(
6453 stringlib_count(self->str + start, end - start,
6454 substring->str, substring->length)
6455 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006456
6457 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006458
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459 return result;
6460}
6461
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006462PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006463"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006464\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006465Encodes S using the codec registered for encoding. encoding defaults\n\
6466to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006467handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006468a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6469'xmlcharrefreplace' as well as any other name registered with\n\
6470codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006471
6472static PyObject *
6473unicode_encode(PyUnicodeObject *self, PyObject *args)
6474{
6475 char *encoding = NULL;
6476 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006477 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00006478
Guido van Rossumd57fd912000-03-10 22:53:23 +00006479 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6480 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006481 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006482 if (v == NULL)
6483 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006484 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006485 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006486 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006487 "(type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00006488 Py_Type(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006489 Py_DECREF(v);
6490 return NULL;
6491 }
6492 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006493
6494 onError:
6495 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006496}
6497
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006498PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006499"S.expandtabs([tabsize]) -> unicode\n\
6500\n\
6501Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006502If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006503
6504static PyObject*
6505unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6506{
6507 Py_UNICODE *e;
6508 Py_UNICODE *p;
6509 Py_UNICODE *q;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006510 Py_ssize_t i, j, old_j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006511 PyUnicodeObject *u;
6512 int tabsize = 8;
6513
6514 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6515 return NULL;
6516
Thomas Wouters7e474022000-07-16 12:04:32 +00006517 /* First pass: determine size of output string */
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006518 i = j = old_j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006519 e = self->str + self->length;
6520 for (p = self->str; p < e; p++)
6521 if (*p == '\t') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006522 if (tabsize > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006523 j += tabsize - (j % tabsize);
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006524 if (old_j > j) {
6525 PyErr_SetString(PyExc_OverflowError,
6526 "new string is too long");
6527 return NULL;
6528 }
6529 old_j = j;
6530 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006531 }
6532 else {
6533 j++;
6534 if (*p == '\n' || *p == '\r') {
6535 i += j;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006536 old_j = j = 0;
6537 if (i < 0) {
6538 PyErr_SetString(PyExc_OverflowError,
6539 "new string is too long");
6540 return NULL;
6541 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006542 }
6543 }
6544
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006545 if ((i + j) < 0) {
6546 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6547 return NULL;
6548 }
6549
Guido van Rossumd57fd912000-03-10 22:53:23 +00006550 /* Second pass: create output string and fill it */
6551 u = _PyUnicode_New(i + j);
6552 if (!u)
6553 return NULL;
6554
6555 j = 0;
6556 q = u->str;
6557
6558 for (p = self->str; p < e; p++)
6559 if (*p == '\t') {
6560 if (tabsize > 0) {
6561 i = tabsize - (j % tabsize);
6562 j += i;
6563 while (i--)
6564 *q++ = ' ';
6565 }
6566 }
6567 else {
6568 j++;
6569 *q++ = *p;
6570 if (*p == '\n' || *p == '\r')
6571 j = 0;
6572 }
6573
6574 return (PyObject*) u;
6575}
6576
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006577PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006578"S.find(sub [,start [,end]]) -> int\n\
6579\n\
6580Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00006581such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006582arguments start and end are interpreted as in slice notation.\n\
6583\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006584Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006585
6586static PyObject *
6587unicode_find(PyUnicodeObject *self, PyObject *args)
6588{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006589 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006590 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006591 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006592 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006593
Guido van Rossumb8872e62000-05-09 14:14:27 +00006594 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
6595 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006596 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006597 substring = PyUnicode_FromObject(substring);
6598 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006599 return NULL;
6600
Thomas Wouters477c8d52006-05-27 19:21:47 +00006601 result = stringlib_find_slice(
6602 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6603 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6604 start, end
6605 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006606
6607 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006608
6609 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006610}
6611
6612static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006613unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006614{
6615 if (index < 0 || index >= self->length) {
6616 PyErr_SetString(PyExc_IndexError, "string index out of range");
6617 return NULL;
6618 }
6619
6620 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6621}
6622
Guido van Rossumc2504932007-09-18 19:42:40 +00006623/* Believe it or not, this produces the same value for ASCII strings
6624 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006625static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00006626unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006627{
Guido van Rossumc2504932007-09-18 19:42:40 +00006628 Py_ssize_t len;
6629 Py_UNICODE *p;
6630 long x;
6631
6632 if (self->hash != -1)
6633 return self->hash;
6634 len = Py_Size(self);
6635 p = self->str;
6636 x = *p << 7;
6637 while (--len >= 0)
6638 x = (1000003*x) ^ *p++;
6639 x ^= Py_Size(self);
6640 if (x == -1)
6641 x = -2;
6642 self->hash = x;
6643 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006644}
6645
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006646PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006647"S.index(sub [,start [,end]]) -> int\n\
6648\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006649Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006650
6651static PyObject *
6652unicode_index(PyUnicodeObject *self, PyObject *args)
6653{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006654 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006655 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006656 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006657 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006658
Guido van Rossumb8872e62000-05-09 14:14:27 +00006659 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
6660 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006661 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006662 substring = PyUnicode_FromObject(substring);
6663 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006664 return NULL;
6665
Thomas Wouters477c8d52006-05-27 19:21:47 +00006666 result = stringlib_find_slice(
6667 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6668 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6669 start, end
6670 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006671
6672 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006673
Guido van Rossumd57fd912000-03-10 22:53:23 +00006674 if (result < 0) {
6675 PyErr_SetString(PyExc_ValueError, "substring not found");
6676 return NULL;
6677 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006678
Martin v. Löwis18e16552006-02-15 17:27:45 +00006679 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006680}
6681
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006682PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006683"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006684\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006685Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006686at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006687
6688static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006689unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006690{
6691 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6692 register const Py_UNICODE *e;
6693 int cased;
6694
Guido van Rossumd57fd912000-03-10 22:53:23 +00006695 /* Shortcut for single character strings */
6696 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006697 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006698
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006699 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006700 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006701 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006702
Guido van Rossumd57fd912000-03-10 22:53:23 +00006703 e = p + PyUnicode_GET_SIZE(self);
6704 cased = 0;
6705 for (; p < e; p++) {
6706 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006707
Guido van Rossumd57fd912000-03-10 22:53:23 +00006708 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006709 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006710 else if (!cased && Py_UNICODE_ISLOWER(ch))
6711 cased = 1;
6712 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006713 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006714}
6715
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006716PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006717"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006718\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006719Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006720at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006721
6722static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006723unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006724{
6725 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6726 register const Py_UNICODE *e;
6727 int cased;
6728
Guido van Rossumd57fd912000-03-10 22:53:23 +00006729 /* Shortcut for single character strings */
6730 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006731 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006732
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006733 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006734 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006735 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006736
Guido van Rossumd57fd912000-03-10 22:53:23 +00006737 e = p + PyUnicode_GET_SIZE(self);
6738 cased = 0;
6739 for (; p < e; p++) {
6740 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006741
Guido van Rossumd57fd912000-03-10 22:53:23 +00006742 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006743 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006744 else if (!cased && Py_UNICODE_ISUPPER(ch))
6745 cased = 1;
6746 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006747 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006748}
6749
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006750PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006751"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006752\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006753Return True if S is a titlecased string and there is at least one\n\
6754character in S, i.e. upper- and titlecase characters may only\n\
6755follow uncased characters and lowercase characters only cased ones.\n\
6756Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006757
6758static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006759unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006760{
6761 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6762 register const Py_UNICODE *e;
6763 int cased, previous_is_cased;
6764
Guido van Rossumd57fd912000-03-10 22:53:23 +00006765 /* Shortcut for single character strings */
6766 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006767 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6768 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006769
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006770 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006771 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006772 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006773
Guido van Rossumd57fd912000-03-10 22:53:23 +00006774 e = p + PyUnicode_GET_SIZE(self);
6775 cased = 0;
6776 previous_is_cased = 0;
6777 for (; p < e; p++) {
6778 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006779
Guido van Rossumd57fd912000-03-10 22:53:23 +00006780 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6781 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006782 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006783 previous_is_cased = 1;
6784 cased = 1;
6785 }
6786 else if (Py_UNICODE_ISLOWER(ch)) {
6787 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006788 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006789 previous_is_cased = 1;
6790 cased = 1;
6791 }
6792 else
6793 previous_is_cased = 0;
6794 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006795 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006796}
6797
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006798PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006799"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006800\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006801Return True if all characters in S are whitespace\n\
6802and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006803
6804static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006805unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006806{
6807 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6808 register const Py_UNICODE *e;
6809
Guido van Rossumd57fd912000-03-10 22:53:23 +00006810 /* Shortcut for single character strings */
6811 if (PyUnicode_GET_SIZE(self) == 1 &&
6812 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006813 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006814
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006815 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006816 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006817 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006818
Guido van Rossumd57fd912000-03-10 22:53:23 +00006819 e = p + PyUnicode_GET_SIZE(self);
6820 for (; p < e; p++) {
6821 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006822 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006823 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006824 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006825}
6826
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006827PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006828"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006829\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006830Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006831and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006832
6833static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006834unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006835{
6836 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6837 register const Py_UNICODE *e;
6838
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006839 /* Shortcut for single character strings */
6840 if (PyUnicode_GET_SIZE(self) == 1 &&
6841 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006842 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006843
6844 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006845 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006846 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006847
6848 e = p + PyUnicode_GET_SIZE(self);
6849 for (; p < e; p++) {
6850 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006851 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006852 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006853 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006854}
6855
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006856PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006857"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006858\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006859Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006860and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006861
6862static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006863unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006864{
6865 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6866 register const Py_UNICODE *e;
6867
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006868 /* Shortcut for single character strings */
6869 if (PyUnicode_GET_SIZE(self) == 1 &&
6870 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006871 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006872
6873 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006874 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006875 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006876
6877 e = p + PyUnicode_GET_SIZE(self);
6878 for (; p < e; p++) {
6879 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006880 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006881 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006882 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006883}
6884
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006885PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006886"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006887\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006888Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006889False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006890
6891static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006892unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006893{
6894 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6895 register const Py_UNICODE *e;
6896
Guido van Rossumd57fd912000-03-10 22:53:23 +00006897 /* Shortcut for single character strings */
6898 if (PyUnicode_GET_SIZE(self) == 1 &&
6899 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006900 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006901
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006902 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006903 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006904 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006905
Guido van Rossumd57fd912000-03-10 22:53:23 +00006906 e = p + PyUnicode_GET_SIZE(self);
6907 for (; p < e; p++) {
6908 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006909 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006910 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006911 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006912}
6913
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006914PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006915"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006916\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006917Return True if all characters in S are digits\n\
6918and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006919
6920static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006921unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006922{
6923 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6924 register const Py_UNICODE *e;
6925
Guido van Rossumd57fd912000-03-10 22:53:23 +00006926 /* Shortcut for single character strings */
6927 if (PyUnicode_GET_SIZE(self) == 1 &&
6928 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006929 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006930
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006931 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006932 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006933 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006934
Guido van Rossumd57fd912000-03-10 22:53:23 +00006935 e = p + PyUnicode_GET_SIZE(self);
6936 for (; p < e; p++) {
6937 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006938 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006939 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006940 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006941}
6942
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006943PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006944"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006945\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006946Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006947False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006948
6949static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006950unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006951{
6952 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6953 register const Py_UNICODE *e;
6954
Guido van Rossumd57fd912000-03-10 22:53:23 +00006955 /* Shortcut for single character strings */
6956 if (PyUnicode_GET_SIZE(self) == 1 &&
6957 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006958 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006959
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006960 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006961 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006962 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006963
Guido van Rossumd57fd912000-03-10 22:53:23 +00006964 e = p + PyUnicode_GET_SIZE(self);
6965 for (; p < e; p++) {
6966 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006967 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006968 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006969 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006970}
6971
Martin v. Löwis47383402007-08-15 07:32:56 +00006972int
6973PyUnicode_IsIdentifier(PyObject *self)
6974{
6975 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
6976 register const Py_UNICODE *e;
6977
6978 /* Special case for empty strings */
6979 if (PyUnicode_GET_SIZE(self) == 0)
6980 return 0;
6981
6982 /* PEP 3131 says that the first character must be in
6983 XID_Start and subsequent characters in XID_Continue,
6984 and for the ASCII range, the 2.x rules apply (i.e
6985 start with letters and underscore, continue with
6986 letters, digits, underscore). However, given the current
6987 definition of XID_Start and XID_Continue, it is sufficient
6988 to check just for these, except that _ must be allowed
6989 as starting an identifier. */
6990 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
6991 return 0;
6992
6993 e = p + PyUnicode_GET_SIZE(self);
6994 for (p++; p < e; p++) {
6995 if (!_PyUnicode_IsXidContinue(*p))
6996 return 0;
6997 }
6998 return 1;
6999}
7000
7001PyDoc_STRVAR(isidentifier__doc__,
7002"S.isidentifier() -> bool\n\
7003\n\
7004Return True if S is a valid identifier according\n\
7005to the language definition.");
7006
7007static PyObject*
7008unicode_isidentifier(PyObject *self)
7009{
7010 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7011}
7012
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007013PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007014"S.join(sequence) -> unicode\n\
7015\n\
7016Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007017sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007018
7019static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007020unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007021{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007022 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007023}
7024
Martin v. Löwis18e16552006-02-15 17:27:45 +00007025static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007026unicode_length(PyUnicodeObject *self)
7027{
7028 return self->length;
7029}
7030
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007031PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00007032"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007033\n\
7034Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007035done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007036
7037static PyObject *
7038unicode_ljust(PyUnicodeObject *self, PyObject *args)
7039{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007040 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007041 Py_UNICODE fillchar = ' ';
7042
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007043 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007044 return NULL;
7045
Tim Peters7a29bd52001-09-12 03:03:31 +00007046 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007047 Py_INCREF(self);
7048 return (PyObject*) self;
7049 }
7050
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007051 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007052}
7053
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007054PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007055"S.lower() -> unicode\n\
7056\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007057Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007058
7059static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007060unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007061{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007062 return fixup(self, fixlower);
7063}
7064
/* Strip modes accepted by the strip helpers */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by the strip modes above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip mode i: the format string with its
   three-character "|O:" prefix skipped */
#define STRIPNAME(i) (stripformat[i]+3)
7073
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007074/* externally visible for str.strip(unicode) */
7075PyObject *
7076_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7077{
7078 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007079 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007080 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007081 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7082 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007083
Thomas Wouters477c8d52006-05-27 19:21:47 +00007084 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7085
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007086 i = 0;
7087 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007088 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7089 i++;
7090 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007091 }
7092
7093 j = len;
7094 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007095 do {
7096 j--;
7097 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7098 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007099 }
7100
7101 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007102 Py_INCREF(self);
7103 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007104 }
7105 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00007106 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007107}
7108
Guido van Rossumd57fd912000-03-10 22:53:23 +00007109
7110static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007111do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007112{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007113 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007114 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007115
7116 i = 0;
7117 if (striptype != RIGHTSTRIP) {
7118 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7119 i++;
7120 }
7121 }
7122
7123 j = len;
7124 if (striptype != LEFTSTRIP) {
7125 do {
7126 j--;
7127 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7128 j++;
7129 }
7130
7131 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7132 Py_INCREF(self);
7133 return (PyObject*)self;
7134 }
7135 else
7136 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007137}
7138
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007139
7140static PyObject *
7141do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7142{
7143 PyObject *sep = NULL;
7144
7145 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7146 return NULL;
7147
7148 if (sep != NULL && sep != Py_None) {
7149 if (PyUnicode_Check(sep))
7150 return _PyUnicode_XStrip(self, striptype, sep);
7151 else if (PyString_Check(sep)) {
7152 PyObject *res;
7153 sep = PyUnicode_FromObject(sep);
7154 if (sep==NULL)
7155 return NULL;
7156 res = _PyUnicode_XStrip(self, striptype, sep);
7157 Py_DECREF(sep);
7158 return res;
7159 }
7160 else {
7161 PyErr_Format(PyExc_TypeError,
7162 "%s arg must be None, unicode or str",
7163 STRIPNAME(striptype));
7164 return NULL;
7165 }
7166 }
7167
7168 return do_strip(self, striptype);
7169}
7170
7171
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007172PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007173"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007174\n\
7175Return a copy of the string S with leading and trailing\n\
7176whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007177If chars is given and not None, remove characters in chars instead.\n\
7178If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007179
7180static PyObject *
7181unicode_strip(PyUnicodeObject *self, PyObject *args)
7182{
7183 if (PyTuple_GET_SIZE(args) == 0)
7184 return do_strip(self, BOTHSTRIP); /* Common case */
7185 else
7186 return do_argstrip(self, BOTHSTRIP, args);
7187}
7188
7189
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007190PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007191"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007192\n\
7193Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007194If chars is given and not None, remove characters in chars instead.\n\
7195If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007196
7197static PyObject *
7198unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7199{
7200 if (PyTuple_GET_SIZE(args) == 0)
7201 return do_strip(self, LEFTSTRIP); /* Common case */
7202 else
7203 return do_argstrip(self, LEFTSTRIP, args);
7204}
7205
7206
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007207PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007208"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007209\n\
7210Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007211If chars is given and not None, remove characters in chars instead.\n\
7212If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007213
7214static PyObject *
7215unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7216{
7217 if (PyTuple_GET_SIZE(args) == 0)
7218 return do_strip(self, RIGHTSTRIP); /* Common case */
7219 else
7220 return do_argstrip(self, RIGHTSTRIP, args);
7221}
7222
7223
Guido van Rossumd57fd912000-03-10 22:53:23 +00007224static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007225unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007226{
7227 PyUnicodeObject *u;
7228 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007229 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007230 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007231
7232 if (len < 0)
7233 len = 0;
7234
Tim Peters7a29bd52001-09-12 03:03:31 +00007235 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007236 /* no repeat, return original string */
7237 Py_INCREF(str);
7238 return (PyObject*) str;
7239 }
Tim Peters8f422462000-09-09 06:13:41 +00007240
7241 /* ensure # of chars needed doesn't overflow int and # of bytes
7242 * needed doesn't overflow size_t
7243 */
7244 nchars = len * str->length;
7245 if (len && nchars / len != str->length) {
7246 PyErr_SetString(PyExc_OverflowError,
7247 "repeated string is too long");
7248 return NULL;
7249 }
7250 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7251 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7252 PyErr_SetString(PyExc_OverflowError,
7253 "repeated string is too long");
7254 return NULL;
7255 }
7256 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007257 if (!u)
7258 return NULL;
7259
7260 p = u->str;
7261
Thomas Wouters477c8d52006-05-27 19:21:47 +00007262 if (str->length == 1 && len > 0) {
7263 Py_UNICODE_FILL(p, str->str[0], len);
7264 } else {
7265 Py_ssize_t done = 0; /* number of characters copied this far */
7266 if (done < nchars) {
7267 Py_UNICODE_COPY(p, str->str, str->length);
7268 done = str->length;
7269 }
7270 while (done < nchars) {
7271 int n = (done <= nchars-done) ? done : nchars-done;
7272 Py_UNICODE_COPY(p+done, p, n);
7273 done += n;
7274 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007275 }
7276
7277 return (PyObject*) u;
7278}
7279
7280PyObject *PyUnicode_Replace(PyObject *obj,
7281 PyObject *subobj,
7282 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007283 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007284{
7285 PyObject *self;
7286 PyObject *str1;
7287 PyObject *str2;
7288 PyObject *result;
7289
7290 self = PyUnicode_FromObject(obj);
7291 if (self == NULL)
7292 return NULL;
7293 str1 = PyUnicode_FromObject(subobj);
7294 if (str1 == NULL) {
7295 Py_DECREF(self);
7296 return NULL;
7297 }
7298 str2 = PyUnicode_FromObject(replobj);
7299 if (str2 == NULL) {
7300 Py_DECREF(self);
7301 Py_DECREF(str1);
7302 return NULL;
7303 }
Tim Petersced69f82003-09-16 20:30:58 +00007304 result = replace((PyUnicodeObject *)self,
7305 (PyUnicodeObject *)str1,
7306 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007307 maxcount);
7308 Py_DECREF(self);
7309 Py_DECREF(str1);
7310 Py_DECREF(str2);
7311 return result;
7312}
7313
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007314PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007315"S.replace (old, new[, maxsplit]) -> unicode\n\
7316\n\
7317Return a copy of S with all occurrences of substring\n\
7318old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007319given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007320
7321static PyObject*
7322unicode_replace(PyUnicodeObject *self, PyObject *args)
7323{
7324 PyUnicodeObject *str1;
7325 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007326 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007327 PyObject *result;
7328
Martin v. Löwis18e16552006-02-15 17:27:45 +00007329 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007330 return NULL;
7331 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7332 if (str1 == NULL)
7333 return NULL;
7334 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007335 if (str2 == NULL) {
7336 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007337 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007338 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007339
7340 result = replace(self, str1, str2, maxcount);
7341
7342 Py_DECREF(str1);
7343 Py_DECREF(str2);
7344 return result;
7345}
7346
7347static
7348PyObject *unicode_repr(PyObject *unicode)
7349{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007350 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007351 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007352 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7353 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7354
7355 /* XXX(nnorwitz): rather than over-allocating, it would be
7356 better to choose a different scheme. Perhaps scan the
7357 first N-chars of the string and allocate based on that size.
7358 */
7359 /* Initial allocation is based on the longest-possible unichr
7360 escape.
7361
7362 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7363 unichr, so in this case it's the longest unichr escape. In
7364 narrow (UTF-16) builds this is five chars per source unichr
7365 since there are two unichrs in the surrogate pair, so in narrow
7366 (UTF-16) builds it's not the longest unichr escape.
7367
7368 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7369 so in the narrow (UTF-16) build case it's the longest unichr
7370 escape.
7371 */
7372
Walter Dörwald1ab83302007-05-18 17:15:44 +00007373 repr = PyUnicode_FromUnicode(NULL,
Walter Dörwald79e913e2007-05-12 11:08:06 +00007374 2 /* quotes */
7375#ifdef Py_UNICODE_WIDE
7376 + 10*size
7377#else
7378 + 6*size
7379#endif
7380 + 1);
7381 if (repr == NULL)
7382 return NULL;
7383
Walter Dörwald1ab83302007-05-18 17:15:44 +00007384 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007385
7386 /* Add quote */
7387 *p++ = (findchar(s, size, '\'') &&
7388 !findchar(s, size, '"')) ? '"' : '\'';
7389 while (size-- > 0) {
7390 Py_UNICODE ch = *s++;
7391
7392 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007393 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007394 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007395 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007396 continue;
7397 }
7398
7399#ifdef Py_UNICODE_WIDE
7400 /* Map 21-bit characters to '\U00xxxxxx' */
7401 else if (ch >= 0x10000) {
7402 *p++ = '\\';
7403 *p++ = 'U';
7404 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
7405 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
7406 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
7407 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
7408 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
7409 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
7410 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
7411 *p++ = hexdigits[ch & 0x0000000F];
7412 continue;
7413 }
7414#else
7415 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
7416 else if (ch >= 0xD800 && ch < 0xDC00) {
7417 Py_UNICODE ch2;
7418 Py_UCS4 ucs;
7419
7420 ch2 = *s++;
7421 size--;
7422 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
7423 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
7424 *p++ = '\\';
7425 *p++ = 'U';
7426 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7427 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7428 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7429 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7430 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7431 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7432 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7433 *p++ = hexdigits[ucs & 0x0000000F];
7434 continue;
7435 }
7436 /* Fall through: isolated surrogates are copied as-is */
7437 s--;
7438 size++;
7439 }
7440#endif
7441
7442 /* Map 16-bit characters to '\uxxxx' */
7443 if (ch >= 256) {
7444 *p++ = '\\';
7445 *p++ = 'u';
7446 *p++ = hexdigits[(ch >> 12) & 0x000F];
7447 *p++ = hexdigits[(ch >> 8) & 0x000F];
7448 *p++ = hexdigits[(ch >> 4) & 0x000F];
7449 *p++ = hexdigits[ch & 0x000F];
7450 }
7451
7452 /* Map special whitespace to '\t', \n', '\r' */
7453 else if (ch == '\t') {
7454 *p++ = '\\';
7455 *p++ = 't';
7456 }
7457 else if (ch == '\n') {
7458 *p++ = '\\';
7459 *p++ = 'n';
7460 }
7461 else if (ch == '\r') {
7462 *p++ = '\\';
7463 *p++ = 'r';
7464 }
7465
7466 /* Map non-printable US ASCII to '\xhh' */
7467 else if (ch < ' ' || ch >= 0x7F) {
7468 *p++ = '\\';
7469 *p++ = 'x';
7470 *p++ = hexdigits[(ch >> 4) & 0x000F];
7471 *p++ = hexdigits[ch & 0x000F];
7472 }
7473
7474 /* Copy everything else as-is */
7475 else
7476 *p++ = (char) ch;
7477 }
7478 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007479 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00007480
7481 *p = '\0';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007482 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00007483 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007484}
7485
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007486PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007487"S.rfind(sub [,start [,end]]) -> int\n\
7488\n\
7489Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007490such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007491arguments start and end are interpreted as in slice notation.\n\
7492\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007493Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007494
7495static PyObject *
7496unicode_rfind(PyUnicodeObject *self, PyObject *args)
7497{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007498 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007499 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007500 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007501 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007502
Guido van Rossumb8872e62000-05-09 14:14:27 +00007503 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
7504 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007505 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007506 substring = PyUnicode_FromObject(substring);
7507 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007508 return NULL;
7509
Thomas Wouters477c8d52006-05-27 19:21:47 +00007510 result = stringlib_rfind_slice(
7511 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7512 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7513 start, end
7514 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007515
7516 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007517
7518 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007519}
7520
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007521PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007522"S.rindex(sub [,start [,end]]) -> int\n\
7523\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007524Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007525
7526static PyObject *
7527unicode_rindex(PyUnicodeObject *self, PyObject *args)
7528{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007529 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007530 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007531 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007532 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007533
Guido van Rossumb8872e62000-05-09 14:14:27 +00007534 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
7535 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007536 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007537 substring = PyUnicode_FromObject(substring);
7538 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007539 return NULL;
7540
Thomas Wouters477c8d52006-05-27 19:21:47 +00007541 result = stringlib_rfind_slice(
7542 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7543 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7544 start, end
7545 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007546
7547 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007548
Guido van Rossumd57fd912000-03-10 22:53:23 +00007549 if (result < 0) {
7550 PyErr_SetString(PyExc_ValueError, "substring not found");
7551 return NULL;
7552 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007553 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007554}
7555
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007556PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007557"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007558\n\
7559Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007560done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007561
7562static PyObject *
7563unicode_rjust(PyUnicodeObject *self, PyObject *args)
7564{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007565 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007566 Py_UNICODE fillchar = ' ';
7567
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007568 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007569 return NULL;
7570
Tim Peters7a29bd52001-09-12 03:03:31 +00007571 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007572 Py_INCREF(self);
7573 return (PyObject*) self;
7574 }
7575
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007576 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007577}
7578
Guido van Rossumd57fd912000-03-10 22:53:23 +00007579PyObject *PyUnicode_Split(PyObject *s,
7580 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007581 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007582{
7583 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007584
Guido van Rossumd57fd912000-03-10 22:53:23 +00007585 s = PyUnicode_FromObject(s);
7586 if (s == NULL)
7587 return NULL;
7588 if (sep != NULL) {
7589 sep = PyUnicode_FromObject(sep);
7590 if (sep == NULL) {
7591 Py_DECREF(s);
7592 return NULL;
7593 }
7594 }
7595
7596 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7597
7598 Py_DECREF(s);
7599 Py_XDECREF(sep);
7600 return result;
7601}
7602
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007603PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007604"S.split([sep [,maxsplit]]) -> list of strings\n\
7605\n\
7606Return a list of the words in S, using sep as the\n\
7607delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00007608splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00007609any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007610
7611static PyObject*
7612unicode_split(PyUnicodeObject *self, PyObject *args)
7613{
7614 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007615 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007616
Martin v. Löwis18e16552006-02-15 17:27:45 +00007617 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007618 return NULL;
7619
7620 if (substring == Py_None)
7621 return split(self, NULL, maxcount);
7622 else if (PyUnicode_Check(substring))
7623 return split(self, (PyUnicodeObject *)substring, maxcount);
7624 else
7625 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7626}
7627
Thomas Wouters477c8d52006-05-27 19:21:47 +00007628PyObject *
7629PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7630{
7631 PyObject* str_obj;
7632 PyObject* sep_obj;
7633 PyObject* out;
7634
7635 str_obj = PyUnicode_FromObject(str_in);
7636 if (!str_obj)
7637 return NULL;
7638 sep_obj = PyUnicode_FromObject(sep_in);
7639 if (!sep_obj) {
7640 Py_DECREF(str_obj);
7641 return NULL;
7642 }
7643
7644 out = stringlib_partition(
7645 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7646 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7647 );
7648
7649 Py_DECREF(sep_obj);
7650 Py_DECREF(str_obj);
7651
7652 return out;
7653}
7654
7655
7656PyObject *
7657PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7658{
7659 PyObject* str_obj;
7660 PyObject* sep_obj;
7661 PyObject* out;
7662
7663 str_obj = PyUnicode_FromObject(str_in);
7664 if (!str_obj)
7665 return NULL;
7666 sep_obj = PyUnicode_FromObject(sep_in);
7667 if (!sep_obj) {
7668 Py_DECREF(str_obj);
7669 return NULL;
7670 }
7671
7672 out = stringlib_rpartition(
7673 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7674 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7675 );
7676
7677 Py_DECREF(sep_obj);
7678 Py_DECREF(str_obj);
7679
7680 return out;
7681}
7682
7683PyDoc_STRVAR(partition__doc__,
7684"S.partition(sep) -> (head, sep, tail)\n\
7685\n\
7686Searches for the separator sep in S, and returns the part before it,\n\
7687the separator itself, and the part after it. If the separator is not\n\
7688found, returns S and two empty strings.");
7689
7690static PyObject*
7691unicode_partition(PyUnicodeObject *self, PyObject *separator)
7692{
7693 return PyUnicode_Partition((PyObject *)self, separator);
7694}
7695
7696PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007697"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007698\n\
7699Searches for the separator sep in S, starting at the end of S, and returns\n\
7700the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00007701separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007702
7703static PyObject*
7704unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7705{
7706 return PyUnicode_RPartition((PyObject *)self, separator);
7707}
7708
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007709PyObject *PyUnicode_RSplit(PyObject *s,
7710 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007711 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007712{
7713 PyObject *result;
7714
7715 s = PyUnicode_FromObject(s);
7716 if (s == NULL)
7717 return NULL;
7718 if (sep != NULL) {
7719 sep = PyUnicode_FromObject(sep);
7720 if (sep == NULL) {
7721 Py_DECREF(s);
7722 return NULL;
7723 }
7724 }
7725
7726 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7727
7728 Py_DECREF(s);
7729 Py_XDECREF(sep);
7730 return result;
7731}
7732
7733PyDoc_STRVAR(rsplit__doc__,
7734"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7735\n\
7736Return a list of the words in S, using sep as the\n\
7737delimiter string, starting at the end of the string and\n\
7738working to the front. If maxsplit is given, at most maxsplit\n\
7739splits are done. If sep is not specified, any whitespace string\n\
7740is a separator.");
7741
7742static PyObject*
7743unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7744{
7745 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007746 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007747
Martin v. Löwis18e16552006-02-15 17:27:45 +00007748 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007749 return NULL;
7750
7751 if (substring == Py_None)
7752 return rsplit(self, NULL, maxcount);
7753 else if (PyUnicode_Check(substring))
7754 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7755 else
7756 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7757}
7758
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007759PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007760"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007761\n\
7762Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007763Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007764is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007765
7766static PyObject*
7767unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7768{
Guido van Rossum86662912000-04-11 15:38:46 +00007769 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007770
Guido van Rossum86662912000-04-11 15:38:46 +00007771 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007772 return NULL;
7773
Guido van Rossum86662912000-04-11 15:38:46 +00007774 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007775}
7776
7777static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007778PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007779{
Walter Dörwald346737f2007-05-31 10:44:43 +00007780 if (PyUnicode_CheckExact(self)) {
7781 Py_INCREF(self);
7782 return self;
7783 } else
7784 /* Subtype -- return genuine unicode string with the same value. */
7785 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
7786 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007787}
7788
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007789PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007790"S.swapcase() -> unicode\n\
7791\n\
7792Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007793and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007794
7795static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007796unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007797{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007798 return fixup(self, fixswapcase);
7799}
7800
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007801PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007802"S.translate(table) -> unicode\n\
7803\n\
7804Return a copy of the string S, where all characters have been mapped\n\
7805through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007806Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7807Unmapped characters are left untouched. Characters mapped to None\n\
7808are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007809
7810static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007811unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007812{
Georg Brandl94c2c752007-10-23 06:52:59 +00007813 PyObject *newtable = NULL;
7814 Py_ssize_t i = 0;
7815 PyObject *key, *value, *result;
7816
7817 if (!PyDict_Check(table)) {
7818 PyErr_SetString(PyExc_TypeError, "translate argument must be a dict");
7819 return NULL;
7820 }
7821 /* fixup the table -- allow size-1 string keys instead of only int keys */
7822 newtable = PyDict_Copy(table);
7823 if (!newtable) return NULL;
7824 while (PyDict_Next(table, &i, &key, &value)) {
7825 if (PyUnicode_Check(key)) {
7826 /* convert string keys to integer keys */
7827 PyObject *newkey;
7828 int res;
7829 if (PyUnicode_GET_SIZE(key) != 1) {
7830 PyErr_SetString(PyExc_ValueError, "string items in translate "
7831 "table must be 1 element long");
7832 goto err;
7833 }
7834 newkey = PyInt_FromLong(PyUnicode_AS_UNICODE(key)[0]);
7835 if (!newkey)
7836 goto err;
7837 res = PyDict_SetItem(newtable, newkey, value);
7838 Py_DECREF(newkey);
7839 if (res < 0)
7840 goto err;
7841 } else if (PyInt_Check(key)) {
7842 /* just keep integer keys */
7843 if (PyDict_SetItem(newtable, key, value) < 0)
7844 goto err;
7845 } else {
7846 PyErr_SetString(PyExc_TypeError, "items in translate table must be "
7847 "strings or integers");
7848 goto err;
7849 }
7850 }
7851
7852 result = PyUnicode_TranslateCharmap(self->str,
7853 self->length,
7854 newtable,
7855 "ignore");
7856 Py_DECREF(newtable);
7857 return result;
7858 err:
7859 Py_DECREF(newtable);
7860 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007861}
7862
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007863PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007864"S.upper() -> unicode\n\
7865\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007866Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007867
7868static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007869unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007870{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007871 return fixup(self, fixupper);
7872}
7873
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007874PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007875"S.zfill(width) -> unicode\n\
7876\n\
7877Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007878of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007879
7880static PyObject *
7881unicode_zfill(PyUnicodeObject *self, PyObject *args)
7882{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007883 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007884 PyUnicodeObject *u;
7885
Martin v. Löwis18e16552006-02-15 17:27:45 +00007886 Py_ssize_t width;
7887 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007888 return NULL;
7889
7890 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007891 if (PyUnicode_CheckExact(self)) {
7892 Py_INCREF(self);
7893 return (PyObject*) self;
7894 }
7895 else
7896 return PyUnicode_FromUnicode(
7897 PyUnicode_AS_UNICODE(self),
7898 PyUnicode_GET_SIZE(self)
7899 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007900 }
7901
7902 fill = width - self->length;
7903
7904 u = pad(self, fill, 0, '0');
7905
Walter Dörwald068325e2002-04-15 13:36:47 +00007906 if (u == NULL)
7907 return NULL;
7908
Guido van Rossumd57fd912000-03-10 22:53:23 +00007909 if (u->str[fill] == '+' || u->str[fill] == '-') {
7910 /* move sign to beginning of string */
7911 u->str[0] = u->str[fill];
7912 u->str[fill] = '0';
7913 }
7914
7915 return (PyObject*) u;
7916}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007917
#if 0
/* Compiled out.  Returns the current value of unicode_freelist_size as a
   Python integer. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    return PyInt_FromLong(unicode_freelist_size);
}
#endif
7925
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007926PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007927"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007928\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007929Return True if S starts with the specified prefix, False otherwise.\n\
7930With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007931With optional end, stop comparing S at that position.\n\
7932prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007933
7934static PyObject *
7935unicode_startswith(PyUnicodeObject *self,
7936 PyObject *args)
7937{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007938 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007939 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007940 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007941 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007942 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007943
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007944 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007945 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007946 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007947 if (PyTuple_Check(subobj)) {
7948 Py_ssize_t i;
7949 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7950 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7951 PyTuple_GET_ITEM(subobj, i));
7952 if (substring == NULL)
7953 return NULL;
7954 result = tailmatch(self, substring, start, end, -1);
7955 Py_DECREF(substring);
7956 if (result) {
7957 Py_RETURN_TRUE;
7958 }
7959 }
7960 /* nothing matched */
7961 Py_RETURN_FALSE;
7962 }
7963 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007964 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007965 return NULL;
7966 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007967 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007968 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007969}
7970
7971
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007972PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007973"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007974\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007975Return True if S ends with the specified suffix, False otherwise.\n\
7976With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007977With optional end, stop comparing S at that position.\n\
7978suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007979
7980static PyObject *
7981unicode_endswith(PyUnicodeObject *self,
7982 PyObject *args)
7983{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007984 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007985 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007986 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007987 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007988 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007989
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007990 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7991 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007992 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007993 if (PyTuple_Check(subobj)) {
7994 Py_ssize_t i;
7995 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7996 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7997 PyTuple_GET_ITEM(subobj, i));
7998 if (substring == NULL)
7999 return NULL;
8000 result = tailmatch(self, substring, start, end, +1);
8001 Py_DECREF(substring);
8002 if (result) {
8003 Py_RETURN_TRUE;
8004 }
8005 }
8006 Py_RETURN_FALSE;
8007 }
8008 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008009 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008010 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008011
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008012 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008013 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008014 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008015}
8016
Eric Smith8c663262007-08-25 02:26:07 +00008017#include "stringlib/string_format.h"
8018
8019PyDoc_STRVAR(format__doc__,
8020"S.format(*args, **kwargs) -> unicode\n\
8021\n\
8022");
8023
Eric Smith8c663262007-08-25 02:26:07 +00008024PyDoc_STRVAR(p_format__doc__,
8025"S.__format__(format_spec) -> unicode\n\
8026\n\
8027");
8028
8029static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008030unicode_getnewargs(PyUnicodeObject *v)
8031{
8032 return Py_BuildValue("(u#)", v->str, v->length);
8033}
8034
8035
Guido van Rossumd57fd912000-03-10 22:53:23 +00008036static PyMethodDef unicode_methods[] = {
8037
8038 /* Order is according to common usage: often used methods should
8039 appear first, since lookup is done sequentially. */
8040
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008041 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
8042 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8043 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008044 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008045 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8046 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8047 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8048 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8049 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8050 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8051 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008052 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008053 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8054 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8055 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008056 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008057 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8058 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8059 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008060 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008061 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008062 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008063 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008064 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8065 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8066 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8067 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8068 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8069 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8070 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8071 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8072 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8073 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8074 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8075 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8076 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8077 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008078 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008079 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008080 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
8081 {"__format__", (PyCFunction) unicode_unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008082 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8083 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Walter Dörwald068325e2002-04-15 13:36:47 +00008084#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008085 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008086#endif
8087
8088#if 0
8089 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008090 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008091#endif
8092
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008093 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008094 {NULL, NULL}
8095};
8096
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008097static PyObject *
8098unicode_mod(PyObject *v, PyObject *w)
8099{
8100 if (!PyUnicode_Check(v)) {
8101 Py_INCREF(Py_NotImplemented);
8102 return Py_NotImplemented;
8103 }
8104 return PyUnicode_Format(v, w);
8105}
8106
8107static PyNumberMethods unicode_as_number = {
8108 0, /*nb_add*/
8109 0, /*nb_subtract*/
8110 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008111 unicode_mod, /*nb_remainder*/
8112};
8113
Guido van Rossumd57fd912000-03-10 22:53:23 +00008114static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008115 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008116 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008117 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8118 (ssizeargfunc) unicode_getitem, /* sq_item */
Thomas Woutersd2cf20e2007-08-30 22:57:53 +00008119 0, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008120 0, /* sq_ass_item */
8121 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008122 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008123};
8124
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008125static PyObject*
8126unicode_subscript(PyUnicodeObject* self, PyObject* item)
8127{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008128 if (PyIndex_Check(item)) {
8129 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008130 if (i == -1 && PyErr_Occurred())
8131 return NULL;
8132 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008133 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008134 return unicode_getitem(self, i);
8135 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008136 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008137 Py_UNICODE* source_buf;
8138 Py_UNICODE* result_buf;
8139 PyObject* result;
8140
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008141 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008142 &start, &stop, &step, &slicelength) < 0) {
8143 return NULL;
8144 }
8145
8146 if (slicelength <= 0) {
8147 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008148 } else if (start == 0 && step == 1 && slicelength == self->length &&
8149 PyUnicode_CheckExact(self)) {
8150 Py_INCREF(self);
8151 return (PyObject *)self;
8152 } else if (step == 1) {
8153 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008154 } else {
8155 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008156 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
8157 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008158
8159 if (result_buf == NULL)
8160 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008161
8162 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8163 result_buf[i] = source_buf[cur];
8164 }
Tim Petersced69f82003-09-16 20:30:58 +00008165
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008166 result = PyUnicode_FromUnicode(result_buf, slicelength);
8167 PyMem_FREE(result_buf);
8168 return result;
8169 }
8170 } else {
8171 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8172 return NULL;
8173 }
8174}
8175
8176static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008177 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008178 (binaryfunc)unicode_subscript, /* mp_subscript */
8179 (objobjargproc)0, /* mp_ass_subscript */
8180};
8181
Guido van Rossumd57fd912000-03-10 22:53:23 +00008182
Guido van Rossumd57fd912000-03-10 22:53:23 +00008183/* Helpers for PyUnicode_Format() */
8184
8185static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008186getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008187{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008188 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008189 if (argidx < arglen) {
8190 (*p_argidx)++;
8191 if (arglen < 0)
8192 return args;
8193 else
8194 return PyTuple_GetItem(args, argidx);
8195 }
8196 PyErr_SetString(PyExc_TypeError,
8197 "not enough arguments for format string");
8198 return NULL;
8199}
8200
/* Bit flags passed as `flags` to the format helpers.  F_ALT selects the
   alternate ('#') form in formatfloat()/formatint(); the remaining names
   presumably mirror the printf '-', '+', ' ' and '0' flags -- confirm
   against the parser in PyUnicode_Format(). */
#define F_LJUST (1 << 0)
#define F_SIGN  (1 << 1)
#define F_BLANK (1 << 2)
#define F_ALT   (1 << 3)
#define F_ZERO  (1 << 4)
8206
Martin v. Löwis18e16552006-02-15 17:27:45 +00008207static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008208strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008209{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008210 register Py_ssize_t i;
8211 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008212 for (i = len - 1; i >= 0; i--)
8213 buffer[i] = (Py_UNICODE) charbuffer[i];
8214
Guido van Rossumd57fd912000-03-10 22:53:23 +00008215 return len;
8216}
8217
Neal Norwitzfc76d632006-01-10 06:03:13 +00008218static int
8219doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8220{
Tim Peters15231542006-02-16 01:08:01 +00008221 Py_ssize_t result;
8222
Neal Norwitzfc76d632006-01-10 06:03:13 +00008223 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008224 result = strtounicode(buffer, (char *)buffer);
8225 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008226}
8227
8228static int
8229longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8230{
Tim Peters15231542006-02-16 01:08:01 +00008231 Py_ssize_t result;
8232
Neal Norwitzfc76d632006-01-10 06:03:13 +00008233 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008234 result = strtounicode(buffer, (char *)buffer);
8235 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008236}
8237
Guido van Rossum078151d2002-08-11 04:24:12 +00008238/* XXX To save some code duplication, formatfloat/long/int could have been
8239 shared with stringobject.c, converting from 8-bit to Unicode after the
8240 formatting is done. */
8241
Guido van Rossumd57fd912000-03-10 22:53:23 +00008242static int
8243formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008244 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008245 int flags,
8246 int prec,
8247 int type,
8248 PyObject *v)
8249{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008250 /* fmt = '%#.' + `prec` + `type`
8251 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00008252 char fmt[20];
8253 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008254
Guido van Rossumd57fd912000-03-10 22:53:23 +00008255 x = PyFloat_AsDouble(v);
8256 if (x == -1.0 && PyErr_Occurred())
8257 return -1;
8258 if (prec < 0)
8259 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008260 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8261 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008262 /* Worst case length calc to ensure no buffer overrun:
8263
8264 'g' formats:
8265 fmt = %#.<prec>g
8266 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8267 for any double rep.)
8268 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8269
8270 'f' formats:
8271 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8272 len = 1 + 50 + 1 + prec = 52 + prec
8273
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008274 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008275 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008276
8277 */
Guido van Rossumb5a755e2007-07-18 18:15:48 +00008278 if (((type == 'g' || type == 'G') &&
8279 buflen <= (size_t)10 + (size_t)prec) ||
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008280 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008281 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008282 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008283 return -1;
8284 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008285 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8286 (flags&F_ALT) ? "#" : "",
8287 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008288 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008289}
8290
Tim Peters38fd5b62000-09-21 05:43:11 +00008291static PyObject*
8292formatlong(PyObject *val, int flags, int prec, int type)
8293{
8294 char *buf;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008295 int len;
Tim Peters38fd5b62000-09-21 05:43:11 +00008296 PyObject *str; /* temporary string object. */
Walter Dörwald63a28be2007-06-20 15:11:12 +00008297 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008298
8299 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8300 if (!str)
8301 return NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008302 result = PyUnicode_FromStringAndSize(buf, len);
Tim Peters38fd5b62000-09-21 05:43:11 +00008303 Py_DECREF(str);
Walter Dörwald63a28be2007-06-20 15:11:12 +00008304 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008305}
8306
Guido van Rossumd57fd912000-03-10 22:53:23 +00008307static int
8308formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008309 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008310 int flags,
8311 int prec,
8312 int type,
8313 PyObject *v)
8314{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008315 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008316 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8317 * + 1 + 1
8318 * = 24
8319 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008320 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008321 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008322 long x;
8323
8324 x = PyInt_AsLong(v);
8325 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008326 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008327 if (x < 0 && type == 'u') {
8328 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008329 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008330 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8331 sign = "-";
8332 else
8333 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008334 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008335 prec = 1;
8336
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008337 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8338 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008339 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008340 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008341 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008342 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008343 return -1;
8344 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008345
8346 if ((flags & F_ALT) &&
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008347 (type == 'x' || type == 'X' || type == 'o')) {
8348 /* When converting under %#o, %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008349 * of issues that cause pain:
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008350 * - for %#o, we want a different base marker than C
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008351 * - when 0 is being converted, the C standard leaves off
8352 * the '0x' or '0X', which is inconsistent with other
8353 * %#x/%#X conversions and inconsistent with Python's
8354 * hex() function
8355 * - there are platforms that violate the standard and
8356 * convert 0 with the '0x' or '0X'
8357 * (Metrowerks, Compaq Tru64)
8358 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008359 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008360 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008361 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008362 * We can achieve the desired consistency by inserting our
8363 * own '0x' or '0X' prefix, and substituting %x/%X in place
8364 * of %#x/%#X.
8365 *
8366 * Note that this is the same approach as used in
8367 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008368 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008369 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8370 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008371 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008372 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008373 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8374 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008375 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008376 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008377 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008378 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008379 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008380 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008381}
8382
8383static int
8384formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008385 size_t buflen,
8386 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008387{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008388 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008389 if (PyUnicode_Check(v)) {
8390 if (PyUnicode_GET_SIZE(v) != 1)
8391 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008392 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008393 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008394
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008395 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00008396 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008397 goto onError;
8398 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
8399 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008400
8401 else {
8402 /* Integer input truncated to a character */
8403 long x;
8404 x = PyInt_AsLong(v);
8405 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008406 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008407#ifdef Py_UNICODE_WIDE
8408 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008409 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008410 "%c arg not in range(0x110000) "
8411 "(wide Python build)");
8412 return -1;
8413 }
8414#else
8415 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008416 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008417 "%c arg not in range(0x10000) "
8418 "(narrow Python build)");
8419 return -1;
8420 }
8421#endif
8422 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008423 }
8424 buf[1] = '\0';
8425 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008426
8427 onError:
8428 PyErr_SetString(PyExc_TypeError,
8429 "%c requires int or char");
8430 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008431}
8432
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesised so that the cast binds as one
   unit wherever the macro is used (e.g. as the operand of sizeof).
*/
#define FORMATBUFLEN ((size_t)120)
8442
Guido van Rossumd57fd912000-03-10 22:53:23 +00008443PyObject *PyUnicode_Format(PyObject *format,
8444 PyObject *args)
8445{
8446 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008447 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008448 int args_owned = 0;
8449 PyUnicodeObject *result = NULL;
8450 PyObject *dict = NULL;
8451 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008452
Guido van Rossumd57fd912000-03-10 22:53:23 +00008453 if (format == NULL || args == NULL) {
8454 PyErr_BadInternalCall();
8455 return NULL;
8456 }
8457 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008458 if (uformat == NULL)
8459 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008460 fmt = PyUnicode_AS_UNICODE(uformat);
8461 fmtcnt = PyUnicode_GET_SIZE(uformat);
8462
8463 reslen = rescnt = fmtcnt + 100;
8464 result = _PyUnicode_New(reslen);
8465 if (result == NULL)
8466 goto onError;
8467 res = PyUnicode_AS_UNICODE(result);
8468
8469 if (PyTuple_Check(args)) {
8470 arglen = PyTuple_Size(args);
8471 argidx = 0;
8472 }
8473 else {
8474 arglen = -1;
8475 argidx = -2;
8476 }
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00008477 if (Py_Type(args)->tp_as_mapping && !PyTuple_Check(args) &&
Guido van Rossum3172c5d2007-10-16 18:12:55 +00008478 !PyString_Check(args) && !PyUnicode_Check(args))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008479 dict = args;
8480
8481 while (--fmtcnt >= 0) {
8482 if (*fmt != '%') {
8483 if (--rescnt < 0) {
8484 rescnt = fmtcnt + 100;
8485 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008486 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008487 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008488 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8489 --rescnt;
8490 }
8491 *res++ = *fmt++;
8492 }
8493 else {
8494 /* Got a format specifier */
8495 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008496 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008497 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008498 Py_UNICODE c = '\0';
8499 Py_UNICODE fill;
8500 PyObject *v = NULL;
8501 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008502 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008503 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008504 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008505 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008506
8507 fmt++;
8508 if (*fmt == '(') {
8509 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008510 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008511 PyObject *key;
8512 int pcount = 1;
8513
8514 if (dict == NULL) {
8515 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008516 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008517 goto onError;
8518 }
8519 ++fmt;
8520 --fmtcnt;
8521 keystart = fmt;
8522 /* Skip over balanced parentheses */
8523 while (pcount > 0 && --fmtcnt >= 0) {
8524 if (*fmt == ')')
8525 --pcount;
8526 else if (*fmt == '(')
8527 ++pcount;
8528 fmt++;
8529 }
8530 keylen = fmt - keystart - 1;
8531 if (fmtcnt < 0 || pcount > 0) {
8532 PyErr_SetString(PyExc_ValueError,
8533 "incomplete format key");
8534 goto onError;
8535 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008536#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008537 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008538 then looked up since Python uses strings to hold
8539 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008540 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008541 key = PyUnicode_EncodeUTF8(keystart,
8542 keylen,
8543 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008544#else
8545 key = PyUnicode_FromUnicode(keystart, keylen);
8546#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008547 if (key == NULL)
8548 goto onError;
8549 if (args_owned) {
8550 Py_DECREF(args);
8551 args_owned = 0;
8552 }
8553 args = PyObject_GetItem(dict, key);
8554 Py_DECREF(key);
8555 if (args == NULL) {
8556 goto onError;
8557 }
8558 args_owned = 1;
8559 arglen = -1;
8560 argidx = -2;
8561 }
8562 while (--fmtcnt >= 0) {
8563 switch (c = *fmt++) {
8564 case '-': flags |= F_LJUST; continue;
8565 case '+': flags |= F_SIGN; continue;
8566 case ' ': flags |= F_BLANK; continue;
8567 case '#': flags |= F_ALT; continue;
8568 case '0': flags |= F_ZERO; continue;
8569 }
8570 break;
8571 }
8572 if (c == '*') {
8573 v = getnextarg(args, arglen, &argidx);
8574 if (v == NULL)
8575 goto onError;
8576 if (!PyInt_Check(v)) {
8577 PyErr_SetString(PyExc_TypeError,
8578 "* wants int");
8579 goto onError;
8580 }
8581 width = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008582 if (width == -1 && PyErr_Occurred())
8583 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008584 if (width < 0) {
8585 flags |= F_LJUST;
8586 width = -width;
8587 }
8588 if (--fmtcnt >= 0)
8589 c = *fmt++;
8590 }
8591 else if (c >= '0' && c <= '9') {
8592 width = c - '0';
8593 while (--fmtcnt >= 0) {
8594 c = *fmt++;
8595 if (c < '0' || c > '9')
8596 break;
8597 if ((width*10) / 10 != width) {
8598 PyErr_SetString(PyExc_ValueError,
8599 "width too big");
8600 goto onError;
8601 }
8602 width = width*10 + (c - '0');
8603 }
8604 }
8605 if (c == '.') {
8606 prec = 0;
8607 if (--fmtcnt >= 0)
8608 c = *fmt++;
8609 if (c == '*') {
8610 v = getnextarg(args, arglen, &argidx);
8611 if (v == NULL)
8612 goto onError;
8613 if (!PyInt_Check(v)) {
8614 PyErr_SetString(PyExc_TypeError,
8615 "* wants int");
8616 goto onError;
8617 }
8618 prec = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008619 if (prec == -1 && PyErr_Occurred())
8620 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008621 if (prec < 0)
8622 prec = 0;
8623 if (--fmtcnt >= 0)
8624 c = *fmt++;
8625 }
8626 else if (c >= '0' && c <= '9') {
8627 prec = c - '0';
8628 while (--fmtcnt >= 0) {
8629 c = Py_CHARMASK(*fmt++);
8630 if (c < '0' || c > '9')
8631 break;
8632 if ((prec*10) / 10 != prec) {
8633 PyErr_SetString(PyExc_ValueError,
8634 "prec too big");
8635 goto onError;
8636 }
8637 prec = prec*10 + (c - '0');
8638 }
8639 }
8640 } /* prec */
8641 if (fmtcnt >= 0) {
8642 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008643 if (--fmtcnt >= 0)
8644 c = *fmt++;
8645 }
8646 }
8647 if (fmtcnt < 0) {
8648 PyErr_SetString(PyExc_ValueError,
8649 "incomplete format");
8650 goto onError;
8651 }
8652 if (c != '%') {
8653 v = getnextarg(args, arglen, &argidx);
8654 if (v == NULL)
8655 goto onError;
8656 }
8657 sign = 0;
8658 fill = ' ';
8659 switch (c) {
8660
8661 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008662 pbuf = formatbuf;
8663 /* presume that buffer length is at least 1 */
8664 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008665 len = 1;
8666 break;
8667
8668 case 's':
8669 case 'r':
8670 if (PyUnicode_Check(v) && c == 's') {
8671 temp = v;
8672 Py_INCREF(temp);
8673 }
8674 else {
8675 PyObject *unicode;
8676 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008677 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008678 else
8679 temp = PyObject_Repr(v);
8680 if (temp == NULL)
8681 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008682 if (PyUnicode_Check(temp))
8683 /* nothing to do */;
8684 else if (PyString_Check(temp)) {
8685 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00008686 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00008687 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00008688 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008689 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008690 Py_DECREF(temp);
8691 temp = unicode;
8692 if (temp == NULL)
8693 goto onError;
8694 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008695 else {
8696 Py_DECREF(temp);
8697 PyErr_SetString(PyExc_TypeError,
8698 "%s argument has non-string str()");
8699 goto onError;
8700 }
8701 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008702 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008703 len = PyUnicode_GET_SIZE(temp);
8704 if (prec >= 0 && len > prec)
8705 len = prec;
8706 break;
8707
8708 case 'i':
8709 case 'd':
8710 case 'u':
8711 case 'o':
8712 case 'x':
8713 case 'X':
8714 if (c == 'i')
8715 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00008716 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008717 temp = formatlong(v, flags, prec, c);
8718 if (!temp)
8719 goto onError;
8720 pbuf = PyUnicode_AS_UNICODE(temp);
8721 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00008722 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008723 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008724 else {
8725 pbuf = formatbuf;
8726 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8727 flags, prec, c, v);
8728 if (len < 0)
8729 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008730 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008731 }
8732 if (flags & F_ZERO)
8733 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008734 break;
8735
8736 case 'e':
8737 case 'E':
8738 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008739 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008740 case 'g':
8741 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008742 if (c == 'F')
8743 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008744 pbuf = formatbuf;
8745 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8746 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008747 if (len < 0)
8748 goto onError;
8749 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008750 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008751 fill = '0';
8752 break;
8753
8754 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008755 pbuf = formatbuf;
8756 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008757 if (len < 0)
8758 goto onError;
8759 break;
8760
8761 default:
8762 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008763 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00008764 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008765 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008766 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00008767 (Py_ssize_t)(fmt - 1 -
8768 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008769 goto onError;
8770 }
8771 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008772 if (*pbuf == '-' || *pbuf == '+') {
8773 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008774 len--;
8775 }
8776 else if (flags & F_SIGN)
8777 sign = '+';
8778 else if (flags & F_BLANK)
8779 sign = ' ';
8780 else
8781 sign = 0;
8782 }
8783 if (width < len)
8784 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008785 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008786 reslen -= rescnt;
8787 rescnt = width + fmtcnt + 100;
8788 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008789 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008790 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008791 PyErr_NoMemory();
8792 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008793 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008794 if (_PyUnicode_Resize(&result, reslen) < 0) {
8795 Py_XDECREF(temp);
8796 goto onError;
8797 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008798 res = PyUnicode_AS_UNICODE(result)
8799 + reslen - rescnt;
8800 }
8801 if (sign) {
8802 if (fill != ' ')
8803 *res++ = sign;
8804 rescnt--;
8805 if (width > len)
8806 width--;
8807 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008808 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008809 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008810 assert(pbuf[1] == c);
8811 if (fill != ' ') {
8812 *res++ = *pbuf++;
8813 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008814 }
Tim Petersfff53252001-04-12 18:38:48 +00008815 rescnt -= 2;
8816 width -= 2;
8817 if (width < 0)
8818 width = 0;
8819 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008820 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008821 if (width > len && !(flags & F_LJUST)) {
8822 do {
8823 --rescnt;
8824 *res++ = fill;
8825 } while (--width > len);
8826 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008827 if (fill == ' ') {
8828 if (sign)
8829 *res++ = sign;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008830 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008831 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008832 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008833 *res++ = *pbuf++;
8834 *res++ = *pbuf++;
8835 }
8836 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008837 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008838 res += len;
8839 rescnt -= len;
8840 while (--width >= len) {
8841 --rescnt;
8842 *res++ = ' ';
8843 }
8844 if (dict && (argidx < arglen) && c != '%') {
8845 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008846 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008847 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008848 goto onError;
8849 }
8850 Py_XDECREF(temp);
8851 } /* '%' */
8852 } /* until end */
8853 if (argidx < arglen && !dict) {
8854 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008855 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008856 goto onError;
8857 }
8858
Thomas Woutersa96affe2006-03-12 00:29:36 +00008859 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8860 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008861 if (args_owned) {
8862 Py_DECREF(args);
8863 }
8864 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008865 return (PyObject *)result;
8866
8867 onError:
8868 Py_XDECREF(result);
8869 Py_DECREF(uformat);
8870 if (args_owned) {
8871 Py_DECREF(args);
8872 }
8873 return NULL;
8874}
8875
Jeremy Hylton938ace62002-07-17 16:30:39 +00008876static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008877unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8878
Tim Peters6d6c1a32001-08-02 04:15:00 +00008879static PyObject *
8880unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8881{
8882 PyObject *x = NULL;
Guido van Rossum55b4a7b2007-07-11 09:28:11 +00008883 static char *kwlist[] = {"object", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008884 char *encoding = NULL;
8885 char *errors = NULL;
8886
Guido van Rossume023fe02001-08-30 03:12:59 +00008887 if (type != &PyUnicode_Type)
8888 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008889 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8890 kwlist, &x, &encoding, &errors))
8891 return NULL;
8892 if (x == NULL)
8893 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008894 if (encoding == NULL && errors == NULL)
8895 return PyObject_Unicode(x);
8896 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008897 return PyUnicode_FromEncodedObject(x, encoding, errors);
8898}
8899
Guido van Rossume023fe02001-08-30 03:12:59 +00008900static PyObject *
8901unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8902{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008903 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008904 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008905
8906 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8907 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8908 if (tmp == NULL)
8909 return NULL;
8910 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008911 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008912 if (pnew == NULL) {
8913 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008914 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008915 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008916 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
8917 if (pnew->str == NULL) {
8918 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008919 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008920 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008921 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008922 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008923 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8924 pnew->length = n;
8925 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008926 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008927 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008928}
8929
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008930PyDoc_STRVAR(unicode_doc,
Collin Winterd474ce82007-08-07 19:42:11 +00008931"str(string [, encoding[, errors]]) -> object\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00008932\n\
Collin Winterd474ce82007-08-07 19:42:11 +00008933Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008934encoding defaults to the current default string encoding.\n\
8935errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008936
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008937static PyObject *unicode_iter(PyObject *seq);
8938
Guido van Rossumd57fd912000-03-10 22:53:23 +00008939PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00008940 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossum84fc66d2007-05-03 17:18:26 +00008941 "str", /* tp_name */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008942 sizeof(PyUnicodeObject), /* tp_size */
8943 0, /* tp_itemsize */
8944 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00008945 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008946 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008947 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008948 0, /* tp_setattr */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008949 0, /* tp_compare */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008950 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008951 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008952 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008953 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008954 (hashfunc) unicode_hash, /* tp_hash*/
8955 0, /* tp_call*/
8956 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008957 PyObject_GenericGetAttr, /* tp_getattro */
8958 0, /* tp_setattro */
Alexandre Vassalotti70a23712007-10-14 02:05:51 +00008959 0, /* tp_as_buffer */
Thomas Wouters27d517b2007-02-25 20:39:11 +00008960 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
8961 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008962 unicode_doc, /* tp_doc */
8963 0, /* tp_traverse */
8964 0, /* tp_clear */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008965 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008966 0, /* tp_weaklistoffset */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008967 unicode_iter, /* tp_iter */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008968 0, /* tp_iternext */
8969 unicode_methods, /* tp_methods */
8970 0, /* tp_members */
8971 0, /* tp_getset */
Guido van Rossum3172c5d2007-10-16 18:12:55 +00008972 &PyBaseObject_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008973 0, /* tp_dict */
8974 0, /* tp_descr_get */
8975 0, /* tp_descr_set */
8976 0, /* tp_dictoffset */
8977 0, /* tp_init */
8978 0, /* tp_alloc */
8979 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008980 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008981};
8982
8983/* Initialize the Unicode implementation */
8984
Thomas Wouters78890102000-07-22 19:25:51 +00008985void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008986{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008987 int i;
8988
Thomas Wouters477c8d52006-05-27 19:21:47 +00008989 /* XXX - move this array to unicodectype.c ? */
8990 Py_UNICODE linebreak[] = {
8991 0x000A, /* LINE FEED */
8992 0x000D, /* CARRIAGE RETURN */
8993 0x001C, /* FILE SEPARATOR */
8994 0x001D, /* GROUP SEPARATOR */
8995 0x001E, /* RECORD SEPARATOR */
8996 0x0085, /* NEXT LINE */
8997 0x2028, /* LINE SEPARATOR */
8998 0x2029, /* PARAGRAPH SEPARATOR */
8999 };
9000
Fred Drakee4315f52000-05-09 19:53:39 +00009001 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009002 unicode_freelist = NULL;
9003 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009004 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009005 if (!unicode_empty)
9006 return;
9007
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009008 for (i = 0; i < 256; i++)
9009 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009010 if (PyType_Ready(&PyUnicode_Type) < 0)
9011 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009012
9013 /* initialize the linebreak bloom filter */
9014 bloom_linebreak = make_bloom_mask(
9015 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9016 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009017
9018 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009019}
9020
9021/* Finalize the Unicode implementation */
9022
9023void
Thomas Wouters78890102000-07-22 19:25:51 +00009024_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009025{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009026 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009027 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009028
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009029 Py_XDECREF(unicode_empty);
9030 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009031
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009032 for (i = 0; i < 256; i++) {
9033 if (unicode_latin1[i]) {
9034 Py_DECREF(unicode_latin1[i]);
9035 unicode_latin1[i] = NULL;
9036 }
9037 }
9038
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009039 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009040 PyUnicodeObject *v = u;
9041 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00009042 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00009043 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00009044 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00009045 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009046 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009047 unicode_freelist = NULL;
9048 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009049}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009050
Walter Dörwald16807132007-05-25 13:52:07 +00009051void
9052PyUnicode_InternInPlace(PyObject **p)
9053{
9054 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9055 PyObject *t;
9056 if (s == NULL || !PyUnicode_Check(s))
9057 Py_FatalError(
9058 "PyUnicode_InternInPlace: unicode strings only please!");
9059 /* If it's a subclass, we don't really know what putting
9060 it in the interned dict might do. */
9061 if (!PyUnicode_CheckExact(s))
9062 return;
9063 if (PyUnicode_CHECK_INTERNED(s))
9064 return;
9065 if (interned == NULL) {
9066 interned = PyDict_New();
9067 if (interned == NULL) {
9068 PyErr_Clear(); /* Don't leave an exception */
9069 return;
9070 }
9071 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009072 /* It might be that the GetItem call fails even
9073 though the key is present in the dictionary,
9074 namely when this happens during a stack overflow. */
9075 Py_ALLOW_RECURSION
Walter Dörwald16807132007-05-25 13:52:07 +00009076 t = PyDict_GetItem(interned, (PyObject *)s);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009077 Py_END_ALLOW_RECURSION
9078
Walter Dörwald16807132007-05-25 13:52:07 +00009079 if (t) {
9080 Py_INCREF(t);
9081 Py_DECREF(*p);
9082 *p = t;
9083 return;
9084 }
9085
Martin v. Löwis5b222132007-06-10 09:51:05 +00009086 PyThreadState_GET()->recursion_critical = 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009087 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9088 PyErr_Clear();
Martin v. Löwis5b222132007-06-10 09:51:05 +00009089 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009090 return;
9091 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009092 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009093 /* The two references in interned are not counted by refcnt.
9094 The deallocator will take care of this */
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00009095 Py_Refcnt(s) -= 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009096 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
9097}
9098
9099void
9100PyUnicode_InternImmortal(PyObject **p)
9101{
9102 PyUnicode_InternInPlace(p);
9103 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9104 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9105 Py_INCREF(*p);
9106 }
9107}
9108
9109PyObject *
9110PyUnicode_InternFromString(const char *cp)
9111{
9112 PyObject *s = PyUnicode_FromString(cp);
9113 if (s == NULL)
9114 return NULL;
9115 PyUnicode_InternInPlace(&s);
9116 return s;
9117}
9118
9119void _Py_ReleaseInternedUnicodeStrings(void)
9120{
9121 PyObject *keys;
9122 PyUnicodeObject *s;
9123 Py_ssize_t i, n;
9124 Py_ssize_t immortal_size = 0, mortal_size = 0;
9125
9126 if (interned == NULL || !PyDict_Check(interned))
9127 return;
9128 keys = PyDict_Keys(interned);
9129 if (keys == NULL || !PyList_Check(keys)) {
9130 PyErr_Clear();
9131 return;
9132 }
9133
9134 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9135 detector, interned unicode strings are not forcibly deallocated;
9136 rather, we give them their stolen references back, and then clear
9137 and DECREF the interned dict. */
9138
9139 n = PyList_GET_SIZE(keys);
9140 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
9141 n);
9142 for (i = 0; i < n; i++) {
9143 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9144 switch (s->state) {
9145 case SSTATE_NOT_INTERNED:
9146 /* XXX Shouldn't happen */
9147 break;
9148 case SSTATE_INTERNED_IMMORTAL:
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00009149 Py_Refcnt(s) += 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009150 immortal_size += s->length;
9151 break;
9152 case SSTATE_INTERNED_MORTAL:
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00009153 Py_Refcnt(s) += 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009154 mortal_size += s->length;
9155 break;
9156 default:
9157 Py_FatalError("Inconsistent interned string state.");
9158 }
9159 s->state = SSTATE_NOT_INTERNED;
9160 }
9161 fprintf(stderr, "total size of all interned strings: "
9162 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9163 "mortal/immortal\n", mortal_size, immortal_size);
9164 Py_DECREF(keys);
9165 PyDict_Clear(interned);
9166 Py_DECREF(interned);
9167 interned = NULL;
9168}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009169
9170
9171/********************* Unicode Iterator **************************/
9172
9173typedef struct {
9174 PyObject_HEAD
Guido van Rossum49d6b072006-08-17 21:11:47 +00009175 Py_ssize_t it_index;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009176 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
9177} unicodeiterobject;
9178
9179static void
9180unicodeiter_dealloc(unicodeiterobject *it)
9181{
9182 _PyObject_GC_UNTRACK(it);
9183 Py_XDECREF(it->it_seq);
9184 PyObject_GC_Del(it);
9185}
9186
9187static int
9188unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9189{
9190 Py_VISIT(it->it_seq);
9191 return 0;
9192}
9193
9194static PyObject *
9195unicodeiter_next(unicodeiterobject *it)
9196{
9197 PyUnicodeObject *seq;
9198 PyObject *item;
9199
9200 assert(it != NULL);
9201 seq = it->it_seq;
9202 if (seq == NULL)
9203 return NULL;
9204 assert(PyUnicode_Check(seq));
9205
9206 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009207 item = PyUnicode_FromUnicode(
9208 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009209 if (item != NULL)
9210 ++it->it_index;
9211 return item;
9212 }
9213
9214 Py_DECREF(seq);
9215 it->it_seq = NULL;
9216 return NULL;
9217}
9218
9219static PyObject *
9220unicodeiter_len(unicodeiterobject *it)
9221{
9222 Py_ssize_t len = 0;
9223 if (it->it_seq)
9224 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9225 return PyInt_FromSsize_t(len);
9226}
9227
9228PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9229
9230static PyMethodDef unicodeiter_methods[] = {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009231 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
9232 length_hint_doc},
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009233 {NULL, NULL} /* sentinel */
9234};
9235
9236PyTypeObject PyUnicodeIter_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009237 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009238 "unicodeiterator", /* tp_name */
9239 sizeof(unicodeiterobject), /* tp_basicsize */
9240 0, /* tp_itemsize */
9241 /* methods */
Guido van Rossum49d6b072006-08-17 21:11:47 +00009242 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009243 0, /* tp_print */
9244 0, /* tp_getattr */
9245 0, /* tp_setattr */
9246 0, /* tp_compare */
9247 0, /* tp_repr */
9248 0, /* tp_as_number */
9249 0, /* tp_as_sequence */
9250 0, /* tp_as_mapping */
9251 0, /* tp_hash */
9252 0, /* tp_call */
9253 0, /* tp_str */
9254 PyObject_GenericGetAttr, /* tp_getattro */
9255 0, /* tp_setattro */
9256 0, /* tp_as_buffer */
9257 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9258 0, /* tp_doc */
9259 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9260 0, /* tp_clear */
9261 0, /* tp_richcompare */
9262 0, /* tp_weaklistoffset */
9263 PyObject_SelfIter, /* tp_iter */
9264 (iternextfunc)unicodeiter_next, /* tp_iternext */
9265 unicodeiter_methods, /* tp_methods */
9266 0,
9267};
9268
9269static PyObject *
9270unicode_iter(PyObject *seq)
9271{
9272 unicodeiterobject *it;
9273
9274 if (!PyUnicode_Check(seq)) {
9275 PyErr_BadInternalCall();
9276 return NULL;
9277 }
9278 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9279 if (it == NULL)
9280 return NULL;
9281 it->it_index = 0;
9282 Py_INCREF(seq);
9283 it->it_seq = (PyUnicodeObject *)seq;
9284 _PyObject_GC_TRACK(it);
9285 return (PyObject *)it;
9286}
9287
Martin v. Löwis5b222132007-06-10 09:51:05 +00009288size_t
9289Py_UNICODE_strlen(const Py_UNICODE *u)
9290{
9291 int res = 0;
9292 while(*u++)
9293 res++;
9294 return res;
9295}
9296
9297Py_UNICODE*
9298Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9299{
9300 Py_UNICODE *u = s1;
9301 while ((*u++ = *s2++));
9302 return s1;
9303}
9304
9305Py_UNICODE*
9306Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9307{
9308 Py_UNICODE *u = s1;
9309 while ((*u++ = *s2++))
9310 if (n-- == 0)
9311 break;
9312 return s1;
9313}
9314
9315int
9316Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9317{
9318 while (*s1 && *s2 && *s1 == *s2)
9319 s1++, s2++;
9320 if (*s1 && *s2)
9321 return (*s1 < *s2) ? -1 : +1;
9322 if (*s1)
9323 return 1;
9324 if (*s2)
9325 return -1;
9326 return 0;
9327}
9328
9329Py_UNICODE*
9330Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9331{
9332 const Py_UNICODE *p;
9333 for (p = s; *p; p++)
9334 if (*p == c)
9335 return (Py_UNICODE*)p;
9336 return NULL;
9337}
9338
9339
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009340#ifdef __cplusplus
9341}
9342#endif
9343
9344
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009345/*
9346Local variables:
9347c-basic-offset: 4
9348indent-tabs-mode: nil
9349End:
9350*/