blob: 1e8f63fcb4d9c555b48981d81f3c7841c1a698e5 [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Unicode Integration Proposal (see file Misc/unicode.txt).

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Eric Smith8c663262007-08-25 02:26:07 +000048#include "formatter_unicode.h"
49
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000050#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000051#include <windows.h>
52#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000053
/* Upper bound on the number of objects kept on the Unicode free list. */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Threshold for the free list "stay alive" optimization.

   Objects on the free list whose size is below this limit keep their
   allocated Unicode memory intact, which reduces malloc() overhead for
   small Unicode objects.

   At worst this results in MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Endianness switches; little endian unless WORDS_BIGENDIAN is defined. */

#ifndef WORDS_BIGENDIAN
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
84
/* --- Globals ------------------------------------------------------------

   The globals are initialized by the _PyUnicode_Init() API and should
   not be used before calling that API.

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000091
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000092
93#ifdef __cplusplus
94extern "C" {
95#endif
96
Walter Dörwald16807132007-05-25 13:52:07 +000097/* This dictionary holds all interned unicode strings. Note that references
98 to strings in this dictionary are *not* counted in the string's ob_refcnt.
99 When the interned string reaches a refcnt of 0 the string deallocation
100 function will delete the reference from this dictionary.
101
102 Another way to look at this is that to say that the actual reference
103 count of a string is: s->ob_refcnt + (s->ob_sstate?2:0)
104*/
105static PyObject *interned;
106
Guido van Rossumd57fd912000-03-10 22:53:23 +0000107/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000108static PyUnicodeObject *unicode_freelist;
109static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000110
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000111/* The empty Unicode object is shared to improve performance. */
112static PyUnicodeObject *unicode_empty;
113
114/* Single character Unicode strings in the Latin-1 range are being
115 shared as well. */
116static PyUnicodeObject *unicode_latin1[256];
117
Fred Drakee4315f52000-05-09 19:53:39 +0000118/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000119 parameter; it is fixed to "utf-8". Always use the
120 PyUnicode_GetDefaultEncoding() API to access this global. */
121static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000122
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000123Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000124PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000125{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000126#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000127 return 0x10FFFF;
128#else
129 /* This is actually an illegal character, so it should
130 not be passed to unichr. */
131 return 0xFFFF;
132#endif
133}
134
/* --- Bloom Filters ----------------------------------------------------- */

/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the least 5
   bits of each Unicode character as the bit index. */

/* The linebreak mask is set up by Unicode_Init below. */
142
/* Integer type that holds a bloom bitmask. */
#define BLOOM_MASK unsigned long

/* Bloom mask consulted by BLOOM_LINEBREAK(); it is filled in elsewhere. */
static BLOOM_MASK bloom_linebreak;

/* Nonzero when the bit selected by the low 5 bits of ch is set in mask.

   The shifted constant is 1UL rather than 1: with a plain int, bit
   index 31 would shift into the sign bit, which is undefined behaviour.
   mask is parenthesized so that compound expressions can be passed. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Nonzero when ch is a line break.  The cheap bloom test runs first and
   Py_UNICODE_ISLINEBREAK() is only evaluated when it passes. */
#define BLOOM_LINEBREAK(ch) \
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
151
152Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
153{
154 /* calculate simple bloom-style bitmask for a given unicode string */
155
156 long mask;
157 Py_ssize_t i;
158
159 mask = 0;
160 for (i = 0; i < len; i++)
161 mask |= (1 << (ptr[i] & 0x1F));
162
163 return mask;
164}
165
166Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
167{
168 Py_ssize_t i;
169
170 for (i = 0; i < setlen; i++)
171 if (set[i] == chr)
172 return 1;
173
174 return 0;
175}
176
/* Nonzero when chr occurs in the setlen-element array set.  The BLOOM()
   test on mask acts as a cheap pre-filter; unicode_member() is only
   called when it passes.  The whole expansion is parenthesized so the
   macro can be negated or combined with other operators safely. */
#define BLOOM_MEMBER(mask, chr, set, setlen) \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
179
Guido van Rossumd57fd912000-03-10 22:53:23 +0000180/* --- Unicode Object ----------------------------------------------------- */
181
182static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000183int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000184 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000185{
186 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000187
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000188 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000190 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000191
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000192 /* Resizing shared object (unicode_empty or single character
193 objects) in-place is not allowed. Use PyUnicode_Resize()
194 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000195
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000196 if (unicode == unicode_empty ||
197 (unicode->length == 1 &&
198 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000200 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000201 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 return -1;
203 }
204
Thomas Wouters477c8d52006-05-27 19:21:47 +0000205 /* We allocate one more byte to make sure the string is Ux0000 terminated.
206 The overallocation is also used by fastsearch, which assumes that it's
207 safe to look at str[length] (without making any assumptions about what
208 it contains). */
209
Guido van Rossumd57fd912000-03-10 22:53:23 +0000210 oldstr = unicode->str;
211 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
212 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000213 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000214 PyErr_NoMemory();
215 return -1;
216 }
217 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000218 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000219
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000220 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000221 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000222 if (unicode->defenc) {
223 Py_DECREF(unicode->defenc);
224 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000225 }
226 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000227
Guido van Rossumd57fd912000-03-10 22:53:23 +0000228 return 0;
229}
230
/* We allocate one extra Py_UNICODE element to make sure the string is
   Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
239
240static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000241PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242{
243 register PyUnicodeObject *unicode;
244
Thomas Wouters477c8d52006-05-27 19:21:47 +0000245 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000246 if (length == 0 && unicode_empty != NULL) {
247 Py_INCREF(unicode_empty);
248 return unicode_empty;
249 }
250
251 /* Unicode freelist & memory allocation */
252 if (unicode_freelist) {
253 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000254 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000255 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000256 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000257 /* Keep-Alive optimization: we only upsize the buffer,
258 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000259 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000260 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000261 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000262 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 }
264 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000265 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000267 }
268 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000269 }
270 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000271 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000272 if (unicode == NULL)
273 return NULL;
274 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
275 }
276
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000277 if (!unicode->str) {
278 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000279 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000280 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000281 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000282 * the caller fails before initializing str -- unicode_resize()
283 * reads str[0], and the Keep-Alive optimization can keep memory
284 * allocated for str alive across a call to unicode_dealloc(unicode).
285 * We don't want unicode_resize to read uninitialized memory in
286 * that case.
287 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000288 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000289 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000290 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000292 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000293 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000294 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000295
296 onError:
297 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000298 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000299 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000300}
301
302static
Guido van Rossum9475a232001-10-05 20:51:39 +0000303void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000304{
Walter Dörwald16807132007-05-25 13:52:07 +0000305 switch (PyUnicode_CHECK_INTERNED(unicode)) {
306 case SSTATE_NOT_INTERNED:
307 break;
308
309 case SSTATE_INTERNED_MORTAL:
310 /* revive dead object temporarily for DelItem */
Martin v. Löwis5d7428b2007-07-21 18:47:48 +0000311 Py_Refcnt(unicode) = 3;
Walter Dörwald16807132007-05-25 13:52:07 +0000312 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
313 Py_FatalError(
314 "deletion of interned unicode string failed");
315 break;
316
317 case SSTATE_INTERNED_IMMORTAL:
318 Py_FatalError("Immortal interned unicode string died.");
319
320 default:
321 Py_FatalError("Inconsistent interned unicode string state.");
322 }
323
Guido van Rossum604ddf82001-12-06 20:03:56 +0000324 if (PyUnicode_CheckExact(unicode) &&
325 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000326 /* Keep-Alive optimization */
327 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000328 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000329 unicode->str = NULL;
330 unicode->length = 0;
331 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000332 if (unicode->defenc) {
333 Py_DECREF(unicode->defenc);
334 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000335 }
336 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000337 *(PyUnicodeObject **)unicode = unicode_freelist;
338 unicode_freelist = unicode;
339 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000340 }
341 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000342 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000343 Py_XDECREF(unicode->defenc);
Martin v. Löwis9f2e3462007-07-21 17:22:18 +0000344 Py_Type(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000345 }
346}
347
Martin v. Löwis18e16552006-02-15 17:27:45 +0000348int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000349{
350 register PyUnicodeObject *v;
351
352 /* Argument checks */
353 if (unicode == NULL) {
354 PyErr_BadInternalCall();
355 return -1;
356 }
357 v = (PyUnicodeObject *)*unicode;
Martin v. Löwis9f2e3462007-07-21 17:22:18 +0000358 if (v == NULL || !PyUnicode_Check(v) || Py_Refcnt(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000359 PyErr_BadInternalCall();
360 return -1;
361 }
362
363 /* Resizing unicode_empty and single character objects is not
364 possible since these are being shared. We simply return a fresh
365 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000366 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000367 (v == unicode_empty || v->length == 1)) {
368 PyUnicodeObject *w = _PyUnicode_New(length);
369 if (w == NULL)
370 return -1;
371 Py_UNICODE_COPY(w->str, v->str,
372 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000373 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000374 *unicode = (PyObject *)w;
375 return 0;
376 }
377
378 /* Note that we don't have to modify *unicode for unshared Unicode
379 objects, since we can modify them in-place. */
380 return unicode_resize(v, length);
381}
382
383/* Internal API for use in unicodeobject.c only ! */
384#define _PyUnicode_Resize(unicodevar, length) \
385 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
386
Guido van Rossumd57fd912000-03-10 22:53:23 +0000387PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000388 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389{
390 PyUnicodeObject *unicode;
391
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000392 /* If the Unicode data is known at construction time, we can apply
393 some optimizations which share commonly used objects. */
394 if (u != NULL) {
395
396 /* Optimization for empty strings */
397 if (size == 0 && unicode_empty != NULL) {
398 Py_INCREF(unicode_empty);
399 return (PyObject *)unicode_empty;
400 }
401
402 /* Single character Unicode objects in the Latin-1 range are
403 shared when using this constructor */
404 if (size == 1 && *u < 256) {
405 unicode = unicode_latin1[*u];
406 if (!unicode) {
407 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000408 if (!unicode)
409 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000410 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000411 unicode_latin1[*u] = unicode;
412 }
413 Py_INCREF(unicode);
414 return (PyObject *)unicode;
415 }
416 }
Tim Petersced69f82003-09-16 20:30:58 +0000417
Guido van Rossumd57fd912000-03-10 22:53:23 +0000418 unicode = _PyUnicode_New(size);
419 if (!unicode)
420 return NULL;
421
422 /* Copy the Unicode data into the new object */
423 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000424 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000425
426 return (PyObject *)unicode;
427}
428
Walter Dörwaldd2034312007-05-18 16:29:38 +0000429PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000430{
431 PyUnicodeObject *unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000432 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000433 some optimizations which share commonly used objects.
434 Also, this means the input must be UTF-8, so fall back to the
435 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000436 if (u != NULL) {
437
438 /* Optimization for empty strings */
439 if (size == 0 && unicode_empty != NULL) {
440 Py_INCREF(unicode_empty);
441 return (PyObject *)unicode_empty;
442 }
443
Martin v. Löwis9c121062007-08-05 20:26:11 +0000444 /* Single characters are shared when using this constructor.
445 Restrict to ASCII, since the input must be UTF-8. */
446 if (size == 1 && Py_CHARMASK(*u) < 128) {
Guido van Rossum00058aa2007-07-19 18:21:28 +0000447 unicode = unicode_latin1[Py_CHARMASK(*u)];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000448 if (!unicode) {
449 unicode = _PyUnicode_New(1);
450 if (!unicode)
451 return NULL;
Guido van Rossum00058aa2007-07-19 18:21:28 +0000452 unicode->str[0] = Py_CHARMASK(*u);
453 unicode_latin1[Py_CHARMASK(*u)] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000454 }
455 Py_INCREF(unicode);
456 return (PyObject *)unicode;
457 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000458
459 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000460 }
461
Walter Dörwald55507312007-05-18 13:12:10 +0000462 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000463 if (!unicode)
464 return NULL;
465
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000466 return (PyObject *)unicode;
467}
468
Walter Dörwaldd2034312007-05-18 16:29:38 +0000469PyObject *PyUnicode_FromString(const char *u)
470{
471 size_t size = strlen(u);
472 if (size > PY_SSIZE_T_MAX) {
473 PyErr_SetString(PyExc_OverflowError, "input too long");
474 return NULL;
475 }
476
477 return PyUnicode_FromStringAndSize(u, size);
478}
479
Guido van Rossumd57fd912000-03-10 22:53:23 +0000480#ifdef HAVE_WCHAR_H
481
482PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000483 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000484{
485 PyUnicodeObject *unicode;
486
487 if (w == NULL) {
488 PyErr_BadInternalCall();
489 return NULL;
490 }
491
492 unicode = _PyUnicode_New(size);
493 if (!unicode)
494 return NULL;
495
496 /* Copy the wchar_t data into the new object */
497#ifdef HAVE_USABLE_WCHAR_T
498 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000499#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000500 {
501 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000502 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000503 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000504 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000505 *u++ = *w++;
506 }
507#endif
508
509 return (PyObject *)unicode;
510}
511
Walter Dörwald346737f2007-05-31 10:44:43 +0000512static void
513makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
514{
515 *fmt++ = '%';
516 if (width) {
517 if (zeropad)
518 *fmt++ = '0';
519 fmt += sprintf(fmt, "%d", width);
520 }
521 if (precision)
522 fmt += sprintf(fmt, ".%d", precision);
523 if (longflag)
524 *fmt++ = 'l';
525 else if (size_tflag) {
526 char *f = PY_FORMAT_SIZE_T;
527 while (*f)
528 *fmt++ = *f++;
529 }
530 *fmt++ = c;
531 *fmt = '\0';
532}
533
/* Append the NUL-terminated char string `string` to the output, one
   output element per input byte.  Uses two variables from the calling
   scope: `copy` (a const char * scratch cursor) and `s` (the output
   pointer, left pointing just past the copied characters).

   Each byte is converted through unsigned char so that bytes >= 0x80
   are not sign-extended on platforms where plain char is signed.  The
   brace form (rather than do/while) is kept so existing call sites
   continue to compile unchanged. */
#define appendstring(string) \
    {for (copy = (string); *copy;) *s++ = (unsigned char)*copy++;}
535
536PyObject *
537PyUnicode_FromFormatV(const char *format, va_list vargs)
538{
539 va_list count;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000540 Py_ssize_t callcount = 0;
541 PyObject **callresults = NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +0000542 PyObject **callresult = NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000543 Py_ssize_t n = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000544 int width = 0;
545 int precision = 0;
546 int zeropad;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000547 const char* f;
548 Py_UNICODE *s;
549 PyObject *string;
550 /* used by sprintf */
551 char buffer[21];
Walter Dörwald346737f2007-05-31 10:44:43 +0000552 /* use abuffer instead of buffer, if we need more space
553 * (which can happen if there's a format specifier with width). */
554 char *abuffer = NULL;
555 char *realbuffer;
556 Py_ssize_t abuffersize = 0;
557 char fmt[60]; /* should be enough for %0width.precisionld */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000558 const char *copy;
559
560#ifdef VA_LIST_IS_ARRAY
561 Py_MEMCPY(count, vargs, sizeof(va_list));
562#else
563#ifdef __va_copy
564 __va_copy(count, vargs);
565#else
566 count = vargs;
567#endif
568#endif
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000569 /* step 1: count the number of %S/%R format specifications
570 * (we call PyObject_Unicode()/PyObject_Repr() for these objects
571 * once during step 3 and put the result in an array) */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000572 for (f = format; *f; f++) {
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000573 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000574 ++callcount;
575 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000576 /* step 2: allocate memory for the results of
577 * PyObject_Unicode()/PyObject_Repr() calls */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000578 if (callcount) {
579 callresults = PyMem_Malloc(sizeof(PyObject *)*callcount);
580 if (!callresults) {
581 PyErr_NoMemory();
582 return NULL;
583 }
584 callresult = callresults;
585 }
586 /* step 3: figure out how large a buffer we need */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000587 for (f = format; *f; f++) {
588 if (*f == '%') {
589 const char* p = f;
Walter Dörwald346737f2007-05-31 10:44:43 +0000590 width = 0;
591 while (isdigit(Py_CHARMASK(*f)))
592 width = (width*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000593 while (*++f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
594 ;
595
596 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
597 * they don't affect the amount of space we reserve.
598 */
599 if ((*f == 'l' || *f == 'z') &&
600 (f[1] == 'd' || f[1] == 'u'))
Eric Smithddd25822007-08-27 11:33:42 +0000601 ++f;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000602
603 switch (*f) {
604 case 'c':
605 (void)va_arg(count, int);
606 /* fall through... */
607 case '%':
608 n++;
609 break;
610 case 'd': case 'u': case 'i': case 'x':
611 (void) va_arg(count, int);
612 /* 20 bytes is enough to hold a 64-bit
613 integer. Decimal takes the most space.
Walter Dörwald346737f2007-05-31 10:44:43 +0000614 This isn't enough for octal.
615 If a width is specified we need more
616 (which we allocate later). */
617 if (width < 20)
618 width = 20;
619 n += width;
620 if (abuffersize < width)
621 abuffersize = width;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000622 break;
623 case 's':
624 n += strlen(va_arg(count, char*));
625 break;
626 case 'U':
627 {
628 PyObject *obj = va_arg(count, PyObject *);
629 assert(obj && PyUnicode_Check(obj));
630 n += PyUnicode_GET_SIZE(obj);
631 break;
632 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000633 case 'V':
634 {
635 PyObject *obj = va_arg(count, PyObject *);
636 const char *str = va_arg(count, const char *);
637 assert(obj || str);
638 assert(!obj || PyUnicode_Check(obj));
639 if (obj)
640 n += PyUnicode_GET_SIZE(obj);
641 else
642 n += strlen(str);
643 break;
644 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000645 case 'S':
646 {
647 PyObject *obj = va_arg(count, PyObject *);
648 PyObject *str;
649 assert(obj);
650 str = PyObject_Unicode(obj);
651 if (!str)
652 goto fail;
653 n += PyUnicode_GET_SIZE(str);
654 /* Remember the str and switch to the next slot */
655 *callresult++ = str;
656 break;
657 }
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000658 case 'R':
659 {
660 PyObject *obj = va_arg(count, PyObject *);
661 PyObject *repr;
662 assert(obj);
663 repr = PyObject_Repr(obj);
664 if (!repr)
665 goto fail;
666 n += PyUnicode_GET_SIZE(repr);
667 /* Remember the repr and switch to the next slot */
668 *callresult++ = repr;
669 break;
670 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000671 case 'p':
672 (void) va_arg(count, int);
673 /* maximum 64-bit pointer representation:
674 * 0xffffffffffffffff
675 * so 19 characters is enough.
676 * XXX I count 18 -- what's the extra for?
677 */
678 n += 19;
679 break;
680 default:
681 /* if we stumble upon an unknown
682 formatting code, copy the rest of
683 the format string to the output
684 string. (we cannot just skip the
685 code, since there's no way to know
686 what's in the argument list) */
687 n += strlen(p);
688 goto expand;
689 }
690 } else
691 n++;
692 }
693 expand:
Walter Dörwald346737f2007-05-31 10:44:43 +0000694 if (abuffersize > 20) {
695 abuffer = PyMem_Malloc(abuffersize);
696 if (!abuffer) {
697 PyErr_NoMemory();
698 goto fail;
699 }
700 realbuffer = abuffer;
701 }
702 else
703 realbuffer = buffer;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000704 /* step 4: fill the buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000705 /* Since we've analyzed how much space we need for the worst case,
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000706 we don't have to resize the string.
707 There can be no errors beyond this point. */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000708 string = PyUnicode_FromUnicode(NULL, n);
709 if (!string)
Walter Dörwald346737f2007-05-31 10:44:43 +0000710 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000711
712 s = PyUnicode_AS_UNICODE(string);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000713 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000714
715 for (f = format; *f; f++) {
716 if (*f == '%') {
717 const char* p = f++;
718 int longflag = 0;
719 int size_tflag = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000720 zeropad = (*f == '0');
721 /* parse the width.precision part */
722 width = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000723 while (isdigit(Py_CHARMASK(*f)))
Walter Dörwald346737f2007-05-31 10:44:43 +0000724 width = (width*10) + *f++ - '0';
725 precision = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000726 if (*f == '.') {
727 f++;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000728 while (isdigit(Py_CHARMASK(*f)))
Walter Dörwald346737f2007-05-31 10:44:43 +0000729 precision = (precision*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000730 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000731 /* handle the long flag, but only for %ld and %lu.
732 others can be added when necessary. */
733 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
734 longflag = 1;
735 ++f;
736 }
737 /* handle the size_t flag. */
738 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
739 size_tflag = 1;
740 ++f;
741 }
742
743 switch (*f) {
744 case 'c':
745 *s++ = va_arg(vargs, int);
746 break;
747 case 'd':
Walter Dörwald346737f2007-05-31 10:44:43 +0000748 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000749 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000750 sprintf(realbuffer, fmt, va_arg(vargs, long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000751 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000752 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000753 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000754 sprintf(realbuffer, fmt, va_arg(vargs, int));
755 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000756 break;
757 case 'u':
Walter Dörwald346737f2007-05-31 10:44:43 +0000758 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000759 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000760 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000761 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000762 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000763 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000764 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
765 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000766 break;
767 case 'i':
Walter Dörwald346737f2007-05-31 10:44:43 +0000768 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
769 sprintf(realbuffer, fmt, va_arg(vargs, int));
770 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000771 break;
772 case 'x':
Walter Dörwald346737f2007-05-31 10:44:43 +0000773 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
774 sprintf(realbuffer, fmt, va_arg(vargs, int));
775 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000776 break;
777 case 's':
778 p = va_arg(vargs, char*);
779 appendstring(p);
780 break;
781 case 'U':
782 {
783 PyObject *obj = va_arg(vargs, PyObject *);
Walter Dörwald5c2fab62007-05-24 19:51:02 +0000784 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
785 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
786 s += size;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000787 break;
788 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000789 case 'V':
790 {
791 PyObject *obj = va_arg(vargs, PyObject *);
792 const char *str = va_arg(vargs, const char *);
793 if (obj) {
794 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
795 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
796 s += size;
797 } else {
798 appendstring(str);
799 }
800 break;
801 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000802 case 'S':
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000803 case 'R':
804 {
Guido van Rossum755114a2007-06-13 01:04:27 +0000805 Py_UNICODE *ucopy;
806 Py_ssize_t usize;
807 Py_ssize_t upos;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000808 /* unused, since we already have the result */
809 (void) va_arg(vargs, PyObject *);
Guido van Rossum755114a2007-06-13 01:04:27 +0000810 ucopy = PyUnicode_AS_UNICODE(*callresult);
811 usize = PyUnicode_GET_SIZE(*callresult);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000812 for (upos = 0; upos<usize;)
813 *s++ = ucopy[upos++];
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000814 /* We're done with the unicode()/repr() => forget it */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000815 Py_DECREF(*callresult);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000816 /* switch to next unicode()/repr() result */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000817 ++callresult;
818 break;
819 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000820 case 'p':
821 sprintf(buffer, "%p", va_arg(vargs, void*));
822 /* %p is ill-defined: ensure leading 0x. */
823 if (buffer[1] == 'X')
824 buffer[1] = 'x';
825 else if (buffer[1] != 'x') {
826 memmove(buffer+2, buffer, strlen(buffer)+1);
827 buffer[0] = '0';
828 buffer[1] = 'x';
829 }
830 appendstring(buffer);
831 break;
832 case '%':
833 *s++ = '%';
834 break;
835 default:
836 appendstring(p);
837 goto end;
838 }
839 } else
840 *s++ = *f;
841 }
842
843 end:
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000844 if (callresults)
845 PyMem_Free(callresults);
Walter Dörwald346737f2007-05-31 10:44:43 +0000846 if (abuffer)
847 PyMem_Free(abuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000848 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
849 return string;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000850 fail:
851 if (callresults) {
852 PyObject **callresult2 = callresults;
Guido van Rossum307fa8c2007-07-16 20:46:27 +0000853 while (callresult2 < callresult) {
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000854 Py_DECREF(*callresult2);
855 ++callresult2;
856 }
857 PyMem_Free(callresults);
858 }
Walter Dörwald346737f2007-05-31 10:44:43 +0000859 if (abuffer)
860 PyMem_Free(abuffer);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000861 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000862}
863
864#undef appendstring
865
866PyObject *
867PyUnicode_FromFormat(const char *format, ...)
868{
869 PyObject* ret;
870 va_list vargs;
871
872#ifdef HAVE_STDARG_PROTOTYPES
873 va_start(vargs, format);
874#else
875 va_start(vargs);
876#endif
877 ret = PyUnicode_FromFormatV(format, vargs);
878 va_end(vargs);
879 return ret;
880}
881
Martin v. Löwis18e16552006-02-15 17:27:45 +0000882Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
883 wchar_t *w,
884 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000885{
886 if (unicode == NULL) {
887 PyErr_BadInternalCall();
888 return -1;
889 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000890
891 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000892 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000893 size = PyUnicode_GET_SIZE(unicode) + 1;
894
Guido van Rossumd57fd912000-03-10 22:53:23 +0000895#ifdef HAVE_USABLE_WCHAR_T
896 memcpy(w, unicode->str, size * sizeof(wchar_t));
897#else
898 {
899 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000900 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000901 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000902 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000903 *w++ = *u++;
904 }
905#endif
906
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000907 if (size > PyUnicode_GET_SIZE(unicode))
908 return PyUnicode_GET_SIZE(unicode);
909 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000910 return size;
911}
912
913#endif
914
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000915PyObject *PyUnicode_FromOrdinal(int ordinal)
916{
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000917 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000918
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000919 if (ordinal < 0 || ordinal > 0x10ffff) {
920 PyErr_SetString(PyExc_ValueError,
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000921 "chr() arg not in range(0x110000)");
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000922 return NULL;
923 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000924
925#ifndef Py_UNICODE_WIDE
926 if (ordinal > 0xffff) {
927 ordinal -= 0x10000;
928 s[0] = 0xD800 | (ordinal >> 10);
929 s[1] = 0xDC00 | (ordinal & 0x3FF);
930 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000931 }
932#endif
933
Hye-Shik Chang40574832004-04-06 07:24:51 +0000934 s[0] = (Py_UNICODE)ordinal;
935 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000936}
937
Guido van Rossumd57fd912000-03-10 22:53:23 +0000938PyObject *PyUnicode_FromObject(register PyObject *obj)
939{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000940 /* XXX Perhaps we should make this API an alias of
941 PyObject_Unicode() instead ?! */
942 if (PyUnicode_CheckExact(obj)) {
943 Py_INCREF(obj);
944 return obj;
945 }
946 if (PyUnicode_Check(obj)) {
947 /* For a Unicode subtype that's not a Unicode object,
948 return a true Unicode object with the same data. */
949 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
950 PyUnicode_GET_SIZE(obj));
951 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000952 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
953}
954
955PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
956 const char *encoding,
957 const char *errors)
958{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000959 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000960 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000961 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000962
Guido van Rossumd57fd912000-03-10 22:53:23 +0000963 if (obj == NULL) {
964 PyErr_BadInternalCall();
965 return NULL;
966 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000967
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000968#if 0
969 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000970 that no encodings is given and then redirect to
971 PyObject_Unicode() which then applies the additional logic for
972 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000973
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000974 NOTE: This API should really only be used for object which
975 represent *encoded* Unicode !
976
977 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000978 if (PyUnicode_Check(obj)) {
979 if (encoding) {
980 PyErr_SetString(PyExc_TypeError,
981 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000982 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000983 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000984 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000985 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000986#else
987 if (PyUnicode_Check(obj)) {
988 PyErr_SetString(PyExc_TypeError,
989 "decoding Unicode is not supported");
990 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000991 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000992#endif
993
994 /* Coerce object */
995 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000996 s = PyString_AS_STRING(obj);
997 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000998 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000999 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1000 /* Overwrite the error message with something more useful in
1001 case of a TypeError. */
1002 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001003 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001004 "coercing to Unicode: need string or buffer, "
1005 "%.80s found",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00001006 Py_Type(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001007 goto onError;
1008 }
Tim Petersced69f82003-09-16 20:30:58 +00001009
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001010 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001011 if (len == 0) {
1012 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001013 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001014 }
Tim Petersced69f82003-09-16 20:30:58 +00001015 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001016 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001017
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001018 return v;
1019
1020 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001021 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001022}
1023
1024PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001025 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001026 const char *encoding,
1027 const char *errors)
1028{
1029 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001030
1031 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001032 encoding = PyUnicode_GetDefaultEncoding();
1033
1034 /* Shortcuts for common default encodings */
1035 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001036 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001037 else if (strcmp(encoding, "latin-1") == 0)
1038 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001039#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1040 else if (strcmp(encoding, "mbcs") == 0)
1041 return PyUnicode_DecodeMBCS(s, size, errors);
1042#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001043 else if (strcmp(encoding, "ascii") == 0)
1044 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001045
1046 /* Decode via the codec registry */
1047 buffer = PyBuffer_FromMemory((void *)s, size);
1048 if (buffer == NULL)
1049 goto onError;
1050 unicode = PyCodec_Decode(buffer, encoding, errors);
1051 if (unicode == NULL)
1052 goto onError;
1053 if (!PyUnicode_Check(unicode)) {
1054 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001055 "decoder did not return an unicode object (type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00001056 Py_Type(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001057 Py_DECREF(unicode);
1058 goto onError;
1059 }
1060 Py_DECREF(buffer);
1061 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001062
Guido van Rossumd57fd912000-03-10 22:53:23 +00001063 onError:
1064 Py_XDECREF(buffer);
1065 return NULL;
1066}
1067
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001068PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1069 const char *encoding,
1070 const char *errors)
1071{
1072 PyObject *v;
1073
1074 if (!PyUnicode_Check(unicode)) {
1075 PyErr_BadArgument();
1076 goto onError;
1077 }
1078
1079 if (encoding == NULL)
1080 encoding = PyUnicode_GetDefaultEncoding();
1081
1082 /* Decode via the codec registry */
1083 v = PyCodec_Decode(unicode, encoding, errors);
1084 if (v == NULL)
1085 goto onError;
1086 return v;
1087
1088 onError:
1089 return NULL;
1090}
1091
Guido van Rossumd57fd912000-03-10 22:53:23 +00001092PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001093 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001094 const char *encoding,
1095 const char *errors)
1096{
1097 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001098
Guido van Rossumd57fd912000-03-10 22:53:23 +00001099 unicode = PyUnicode_FromUnicode(s, size);
1100 if (unicode == NULL)
1101 return NULL;
1102 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1103 Py_DECREF(unicode);
1104 return v;
1105}
1106
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001107PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1108 const char *encoding,
1109 const char *errors)
1110{
1111 PyObject *v;
1112
1113 if (!PyUnicode_Check(unicode)) {
1114 PyErr_BadArgument();
1115 goto onError;
1116 }
1117
1118 if (encoding == NULL)
1119 encoding = PyUnicode_GetDefaultEncoding();
1120
1121 /* Encode via the codec registry */
1122 v = PyCodec_Encode(unicode, encoding, errors);
1123 if (v == NULL)
1124 goto onError;
1125 return v;
1126
1127 onError:
1128 return NULL;
1129}
1130
Guido van Rossumd57fd912000-03-10 22:53:23 +00001131PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1132 const char *encoding,
1133 const char *errors)
1134{
1135 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001136
Guido van Rossumd57fd912000-03-10 22:53:23 +00001137 if (!PyUnicode_Check(unicode)) {
1138 PyErr_BadArgument();
1139 goto onError;
1140 }
Fred Drakee4315f52000-05-09 19:53:39 +00001141
Tim Petersced69f82003-09-16 20:30:58 +00001142 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001143 encoding = PyUnicode_GetDefaultEncoding();
1144
1145 /* Shortcuts for common default encodings */
1146 if (errors == NULL) {
1147 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001148 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001149 else if (strcmp(encoding, "latin-1") == 0)
1150 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001151#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1152 else if (strcmp(encoding, "mbcs") == 0)
1153 return PyUnicode_AsMBCSString(unicode);
1154#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001155 else if (strcmp(encoding, "ascii") == 0)
1156 return PyUnicode_AsASCIIString(unicode);
1157 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001158
1159 /* Encode via the codec registry */
1160 v = PyCodec_Encode(unicode, encoding, errors);
1161 if (v == NULL)
1162 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001163 if (!PyBytes_Check(v)) {
1164 if (PyString_Check(v)) {
1165 /* Old codec, turn it into bytes */
1166 PyObject *b = PyBytes_FromObject(v);
1167 Py_DECREF(v);
1168 return b;
1169 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001170 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001171 "encoder did not return a bytes object "
1172 "(type=%.400s, encoding=%.20s, errors=%.20s)",
1173 v->ob_type->tp_name,
1174 encoding ? encoding : "NULL",
1175 errors ? errors : "NULL");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001176 Py_DECREF(v);
1177 goto onError;
1178 }
1179 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001180
Guido van Rossumd57fd912000-03-10 22:53:23 +00001181 onError:
1182 return NULL;
1183}
1184
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001185PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1186 const char *errors)
1187{
1188 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001189 PyObject *b;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001190 if (v)
1191 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001192 if (errors != NULL)
1193 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum06610092007-08-16 21:02:22 +00001194 b = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1195 PyUnicode_GET_SIZE(unicode),
1196 NULL);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001197 if (!b)
1198 return NULL;
1199 v = PyString_FromStringAndSize(PyBytes_AsString(b),
1200 PyBytes_Size(b));
1201 Py_DECREF(b);
Guido van Rossume7a0d392007-07-12 07:53:00 +00001202 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001203 return v;
1204}
1205
Martin v. Löwis5b222132007-06-10 09:51:05 +00001206char*
1207PyUnicode_AsString(PyObject *unicode)
1208{
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001209 if (!PyUnicode_Check(unicode)) {
1210 PyErr_BadArgument();
1211 return NULL;
1212 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00001213 unicode = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1214 if (!unicode)
1215 return NULL;
1216 return PyString_AsString(unicode);
1217}
1218
Guido van Rossumd57fd912000-03-10 22:53:23 +00001219Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1220{
1221 if (!PyUnicode_Check(unicode)) {
1222 PyErr_BadArgument();
1223 goto onError;
1224 }
1225 return PyUnicode_AS_UNICODE(unicode);
1226
1227 onError:
1228 return NULL;
1229}
1230
Martin v. Löwis18e16552006-02-15 17:27:45 +00001231Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001232{
1233 if (!PyUnicode_Check(unicode)) {
1234 PyErr_BadArgument();
1235 goto onError;
1236 }
1237 return PyUnicode_GET_SIZE(unicode);
1238
1239 onError:
1240 return -1;
1241}
1242
Thomas Wouters78890102000-07-22 19:25:51 +00001243const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001244{
1245 return unicode_default_encoding;
1246}
1247
1248int PyUnicode_SetDefaultEncoding(const char *encoding)
1249{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001250 if (strcmp(encoding, unicode_default_encoding) != 0) {
1251 PyErr_Format(PyExc_ValueError,
1252 "Can only set default encoding to %s",
1253 unicode_default_encoding);
1254 return -1;
1255 }
Fred Drakee4315f52000-05-09 19:53:39 +00001256 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001257}
1258
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001259/* error handling callback helper:
1260 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001261 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001262 and adjust various state variables.
1263 return 0 on success, -1 on error
1264*/
1265
1266static
1267int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1268 const char *encoding, const char *reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001269 const char **input, const char **inend, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001270 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001271{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001272 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001273
1274 PyObject *restuple = NULL;
1275 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001276 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001277 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001278 Py_ssize_t requiredsize;
1279 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001280 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001281 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001282 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001283 int res = -1;
1284
1285 if (*errorHandler == NULL) {
1286 *errorHandler = PyCodec_LookupError(errors);
1287 if (*errorHandler == NULL)
1288 goto onError;
1289 }
1290
1291 if (*exceptionObject == NULL) {
1292 *exceptionObject = PyUnicodeDecodeError_Create(
Walter Dörwalde78178e2007-07-30 13:31:40 +00001293 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001294 if (*exceptionObject == NULL)
1295 goto onError;
1296 }
1297 else {
1298 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1299 goto onError;
1300 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1301 goto onError;
1302 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1303 goto onError;
1304 }
1305
1306 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1307 if (restuple == NULL)
1308 goto onError;
1309 if (!PyTuple_Check(restuple)) {
1310 PyErr_Format(PyExc_TypeError, &argparse[4]);
1311 goto onError;
1312 }
1313 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1314 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001315
1316 /* Copy back the bytes variables, which might have been modified by the
1317 callback */
1318 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1319 if (!inputobj)
1320 goto onError;
1321 if (!PyBytes_Check(inputobj)) {
1322 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
1323 }
1324 *input = PyBytes_AS_STRING(inputobj);
1325 insize = PyBytes_GET_SIZE(inputobj);
1326 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001327 /* we can DECREF safely, as the exception has another reference,
1328 so the object won't go away. */
1329 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001330
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001331 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001332 newpos = insize+newpos;
1333 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001334 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001335 goto onError;
1336 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001337
1338 /* need more space? (at least enough for what we
1339 have+the replacement+the rest of the string (starting
1340 at the new input position), so we won't have to check space
1341 when there are no errors in the rest of the string) */
1342 repptr = PyUnicode_AS_UNICODE(repunicode);
1343 repsize = PyUnicode_GET_SIZE(repunicode);
1344 requiredsize = *outpos + repsize + insize-newpos;
1345 if (requiredsize > outsize) {
1346 if (requiredsize<2*outsize)
1347 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001348 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001349 goto onError;
1350 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1351 }
1352 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001353 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001354 Py_UNICODE_COPY(*outptr, repptr, repsize);
1355 *outptr += repsize;
1356 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001357
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001358 /* we made it! */
1359 res = 0;
1360
1361 onError:
1362 Py_XDECREF(restuple);
1363 return res;
1364}
1365
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001366/* --- UTF-7 Codec -------------------------------------------------------- */
1367
1368/* see RFC2152 for details */
1369
/* Per-character classification table for the UTF-7 codec, indexed by the
   ASCII code 0..127.  Each entry says whether the character can be
   encoded directly:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
static
char utf7_special[128] = {
    /* 0x00 - 0x0f  (TAB, LF and CR are whitespace) */
    1, 1, 1, 1, 1, 1, 1, 1,
    1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */
    1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f  (space ... '/') */
    2, 3, 3, 3, 3, 3, 3, 0,
    0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f  ('0' ... '?') */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f  ('@' ... 'O') */
    3, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f  ('P' ... '_') */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f  ('`' ... 'o') */
    3, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f  ('p' ... DEL) */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 3, 3, 3, 1, 1,
};
1388
/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true  */

/* True if character c cannot be written directly in UTF-7.  encodeO and
   encodeWS select whether Set O characters and whitespace count as
   special as well. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* Map the low 6 bits of n to its base64 digit. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c is a base64 digit.  Spelled out as explicit ASCII ranges
   instead of isalnum(): isalnum() depends on the current locale and is
   undefined for arguments outside the unsigned char range, while this
   macro is applied to Py_UNICODE values. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* Map a base64 digit back to its 6-bit value (c must satisfy B64CHAR). */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* Emit base64 digits to out for as long as at least 6 bits are pending in
   ch; bits is the number of pending bits and is updated in place. */
#define ENCODE(out, ch, bits)                   \
    while (bits >= 6) {                         \
        *out++ = B64((ch) >> ((bits) - 6));     \
        bits -= 6;                              \
    }

/* Emit one Py_UNICODE to out for every 16 pending bits in ch.  Uses the
   enclosing function's `errmsg` variable and `utf7Error` label. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits) - 16)) & 0xffff); \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0;                                              \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */                              \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *out++ = outCh;                                             \
        }                                                               \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001431
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001432PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001433 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001434 const char *errors)
1435{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001436 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001437 Py_ssize_t startinpos;
1438 Py_ssize_t endinpos;
1439 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001440 const char *e;
1441 PyUnicodeObject *unicode;
1442 Py_UNICODE *p;
1443 const char *errmsg = "";
1444 int inShift = 0;
1445 unsigned int bitsleft = 0;
1446 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001447 int surrogate = 0;
1448 PyObject *errorHandler = NULL;
1449 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001450
1451 unicode = _PyUnicode_New(size);
1452 if (!unicode)
1453 return NULL;
1454 if (size == 0)
1455 return (PyObject *)unicode;
1456
1457 p = unicode->str;
1458 e = s + size;
1459
1460 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001461 Py_UNICODE ch;
1462 restart:
1463 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001464
1465 if (inShift) {
1466 if ((ch == '-') || !B64CHAR(ch)) {
1467 inShift = 0;
1468 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001469
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001470 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1471 if (bitsleft >= 6) {
1472 /* The shift sequence has a partial character in it. If
1473 bitsleft < 6 then we could just classify it as padding
1474 but that is not the case here */
1475
1476 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001477 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001478 }
1479 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001480 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001481 here so indicate the potential of a misencoded character. */
1482
1483 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1484 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1485 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001486 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001487 }
1488
1489 if (ch == '-') {
1490 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001491 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001492 inShift = 1;
1493 }
1494 } else if (SPECIAL(ch,0,0)) {
1495 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001496 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001497 } else {
1498 *p++ = ch;
1499 }
1500 } else {
1501 charsleft = (charsleft << 6) | UB64(ch);
1502 bitsleft += 6;
1503 s++;
1504 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1505 }
1506 }
1507 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001508 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001509 s++;
1510 if (s < e && *s == '-') {
1511 s++;
1512 *p++ = '+';
1513 } else
1514 {
1515 inShift = 1;
1516 bitsleft = 0;
1517 }
1518 }
1519 else if (SPECIAL(ch,0,0)) {
1520 errmsg = "unexpected special character";
1521 s++;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001522 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001523 }
1524 else {
1525 *p++ = ch;
1526 s++;
1527 }
1528 continue;
1529 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001530 outpos = p-PyUnicode_AS_UNICODE(unicode);
1531 endinpos = s-starts;
1532 if (unicode_decode_call_errorhandler(
1533 errors, &errorHandler,
1534 "utf7", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001535 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001536 (PyObject **)&unicode, &outpos, &p))
1537 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001538 }
1539
1540 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001541 outpos = p-PyUnicode_AS_UNICODE(unicode);
1542 endinpos = size;
1543 if (unicode_decode_call_errorhandler(
1544 errors, &errorHandler,
1545 "utf7", "unterminated shift sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00001546 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001547 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001548 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001549 if (s < e)
1550 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001551 }
1552
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001553 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001554 goto onError;
1555
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001556 Py_XDECREF(errorHandler);
1557 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001558 return (PyObject *)unicode;
1559
1560onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001561 Py_XDECREF(errorHandler);
1562 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001563 Py_DECREF(unicode);
1564 return NULL;
1565}
1566
1567
1568PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001569 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001570 int encodeSetO,
1571 int encodeWhiteSpace,
1572 const char *errors)
1573{
1574 PyObject *v;
1575 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001576 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001577 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001578 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001579 unsigned int bitsleft = 0;
1580 unsigned long charsleft = 0;
1581 char * out;
1582 char * start;
1583
1584 if (size == 0)
Walter Dörwald51ab4142007-05-05 14:43:36 +00001585 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001586
Walter Dörwald51ab4142007-05-05 14:43:36 +00001587 v = PyBytes_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001588 if (v == NULL)
1589 return NULL;
1590
Walter Dörwald51ab4142007-05-05 14:43:36 +00001591 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001592 for (;i < size; ++i) {
1593 Py_UNICODE ch = s[i];
1594
1595 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001596 if (ch == '+') {
1597 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001598 *out++ = '-';
1599 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1600 charsleft = ch;
1601 bitsleft = 16;
1602 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001603 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001604 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001605 } else {
1606 *out++ = (char) ch;
1607 }
1608 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001609 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1610 *out++ = B64(charsleft << (6-bitsleft));
1611 charsleft = 0;
1612 bitsleft = 0;
1613 /* Characters not in the BASE64 set implicitly unshift the sequence
1614 so no '-' is required, except if the character is itself a '-' */
1615 if (B64CHAR(ch) || ch == '-') {
1616 *out++ = '-';
1617 }
1618 inShift = 0;
1619 *out++ = (char) ch;
1620 } else {
1621 bitsleft += 16;
1622 charsleft = (charsleft << 16) | ch;
1623 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1624
1625 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001626 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001627 or '-' then the shift sequence will be terminated implicitly and we
1628 don't have to insert a '-'. */
1629
1630 if (bitsleft == 0) {
1631 if (i + 1 < size) {
1632 Py_UNICODE ch2 = s[i+1];
1633
1634 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001635
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001636 } else if (B64CHAR(ch2) || ch2 == '-') {
1637 *out++ = '-';
1638 inShift = 0;
1639 } else {
1640 inShift = 0;
1641 }
1642
1643 }
1644 else {
1645 *out++ = '-';
1646 inShift = 0;
1647 }
1648 }
Tim Petersced69f82003-09-16 20:30:58 +00001649 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001650 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001651 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001652 if (bitsleft) {
1653 *out++= B64(charsleft << (6-bitsleft) );
1654 *out++ = '-';
1655 }
1656
Walter Dörwald51ab4142007-05-05 14:43:36 +00001657 if (PyBytes_Resize(v, out - start)) {
1658 Py_DECREF(v);
1659 return NULL;
1660 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001661 return v;
1662}
1663
1664#undef SPECIAL
1665#undef B64
1666#undef B64CHAR
1667#undef UB64
1668#undef ENCODE
1669#undef DECODE
1670
Guido van Rossumd57fd912000-03-10 22:53:23 +00001671/* --- UTF-8 Codec -------------------------------------------------------- */
1672
/* Total length, in bytes, of the UTF-8 sequence introduced by each possible
   first byte (see RFC 2279).  A zero entry marks a byte that is not a legal
   prefix byte. */
static
char utf8_code_length[256] = {
    /* 0x00-0x7F: single byte (ASCII) */
    /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xBF: illegal as a prefix */
    /* 0x80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xA0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xB0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xDF: two-byte sequences */
    /* 0xC0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xD0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF: three-byte sequences */
    /* 0xE0 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xFD: four-, five- and six-byte sequences; 0xFE/0xFF illegal */
    /* 0xF0 */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1694
Guido van Rossumd57fd912000-03-10 22:53:23 +00001695PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001696 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001697 const char *errors)
1698{
Walter Dörwald69652032004-09-07 20:24:22 +00001699 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1700}
1701
1702PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001703 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001704 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001705 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001706{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001707 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001708 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001709 Py_ssize_t startinpos;
1710 Py_ssize_t endinpos;
1711 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001712 const char *e;
1713 PyUnicodeObject *unicode;
1714 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001715 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001716 PyObject *errorHandler = NULL;
1717 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001718
1719 /* Note: size will always be longer than the resulting Unicode
1720 character count */
1721 unicode = _PyUnicode_New(size);
1722 if (!unicode)
1723 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001724 if (size == 0) {
1725 if (consumed)
1726 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001727 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001728 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001729
1730 /* Unpack UTF-8 encoded data */
1731 p = unicode->str;
1732 e = s + size;
1733
1734 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001735 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001736
1737 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001738 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001739 s++;
1740 continue;
1741 }
1742
1743 n = utf8_code_length[ch];
1744
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001745 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001746 if (consumed)
1747 break;
1748 else {
1749 errmsg = "unexpected end of data";
1750 startinpos = s-starts;
1751 endinpos = size;
1752 goto utf8Error;
1753 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001754 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001755
1756 switch (n) {
1757
1758 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001759 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001760 startinpos = s-starts;
1761 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001762 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001763
1764 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001765 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001766 startinpos = s-starts;
1767 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001768 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001769
1770 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001771 if ((s[1] & 0xc0) != 0x80) {
1772 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001773 startinpos = s-starts;
1774 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001775 goto utf8Error;
1776 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001777 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001778 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001779 startinpos = s-starts;
1780 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001781 errmsg = "illegal encoding";
1782 goto utf8Error;
1783 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001784 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001785 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001786 break;
1787
1788 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001789 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001790 (s[2] & 0xc0) != 0x80) {
1791 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001792 startinpos = s-starts;
1793 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001794 goto utf8Error;
1795 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001796 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001797 if (ch < 0x0800) {
1798 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001799 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001800
1801 XXX For wide builds (UCS-4) we should probably try
1802 to recombine the surrogates into a single code
1803 unit.
1804 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001805 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001806 startinpos = s-starts;
1807 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001808 goto utf8Error;
1809 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001810 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001811 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001812 break;
1813
1814 case 4:
1815 if ((s[1] & 0xc0) != 0x80 ||
1816 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001817 (s[3] & 0xc0) != 0x80) {
1818 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001819 startinpos = s-starts;
1820 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001821 goto utf8Error;
1822 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001823 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1824 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1825 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001826 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001827 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001828 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001829 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001830 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001831 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001832 startinpos = s-starts;
1833 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001834 goto utf8Error;
1835 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001836#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001837 *p++ = (Py_UNICODE)ch;
1838#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001839 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001840
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001841 /* translate from 10000..10FFFF to 0..FFFF */
1842 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001843
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001844 /* high surrogate = top 10 bits added to D800 */
1845 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001846
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001847 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001848 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001849#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001850 break;
1851
1852 default:
1853 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001854 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001855 startinpos = s-starts;
1856 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001857 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001858 }
1859 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001860 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001861
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001862 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001863 outpos = p-PyUnicode_AS_UNICODE(unicode);
1864 if (unicode_decode_call_errorhandler(
1865 errors, &errorHandler,
1866 "utf8", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001867 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001868 (PyObject **)&unicode, &outpos, &p))
1869 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001870 }
Walter Dörwald69652032004-09-07 20:24:22 +00001871 if (consumed)
1872 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001873
1874 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001875 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001876 goto onError;
1877
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001878 Py_XDECREF(errorHandler);
1879 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001880 return (PyObject *)unicode;
1881
1882onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001883 Py_XDECREF(errorHandler);
1884 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001885 Py_DECREF(unicode);
1886 return NULL;
1887}
1888
Tim Peters602f7402002-04-27 18:03:26 +00001889/* Allocation strategy: if the string is short, convert into a stack buffer
1890 and allocate exactly as much space needed at the end. Else allocate the
1891 maximum possible needed (4 result bytes per Unicode character), and return
1892 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001893*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001894PyObject *
1895PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001896 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001897 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001898{
Tim Peters602f7402002-04-27 18:03:26 +00001899#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001900
Martin v. Löwis18e16552006-02-15 17:27:45 +00001901 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001902 PyObject *v; /* result string object */
1903 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001904 Py_ssize_t nallocated; /* number of result bytes allocated */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001905 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001906 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001907
Tim Peters602f7402002-04-27 18:03:26 +00001908 assert(s != NULL);
1909 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001910
Tim Peters602f7402002-04-27 18:03:26 +00001911 if (size <= MAX_SHORT_UNICHARS) {
1912 /* Write into the stack buffer; nallocated can't overflow.
1913 * At the end, we'll allocate exactly as much heap space as it
1914 * turns out we need.
1915 */
1916 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1917 v = NULL; /* will allocate after we're done */
1918 p = stackbuf;
1919 }
1920 else {
1921 /* Overallocate on the heap, and give the excess back at the end. */
1922 nallocated = size * 4;
1923 if (nallocated / 4 != size) /* overflow! */
1924 return PyErr_NoMemory();
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001925 v = PyBytes_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00001926 if (v == NULL)
1927 return NULL;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001928 p = PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001929 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001930
Tim Peters602f7402002-04-27 18:03:26 +00001931 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001932 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001933
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001934 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001935 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001936 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001937
Guido van Rossumd57fd912000-03-10 22:53:23 +00001938 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001939 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001940 *p++ = (char)(0xc0 | (ch >> 6));
1941 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001942 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001943 else {
Tim Peters602f7402002-04-27 18:03:26 +00001944 /* Encode UCS2 Unicode ordinals */
1945 if (ch < 0x10000) {
1946 /* Special case: check for high surrogate */
1947 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1948 Py_UCS4 ch2 = s[i];
1949 /* Check for low surrogate and combine the two to
1950 form a UCS4 value */
1951 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001952 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001953 i++;
1954 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001955 }
Tim Peters602f7402002-04-27 18:03:26 +00001956 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001957 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001958 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001959 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1960 *p++ = (char)(0x80 | (ch & 0x3f));
1961 continue;
1962 }
1963encodeUCS4:
1964 /* Encode UCS4 Unicode ordinals */
1965 *p++ = (char)(0xf0 | (ch >> 18));
1966 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1967 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1968 *p++ = (char)(0x80 | (ch & 0x3f));
1969 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001970 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001971
Tim Peters602f7402002-04-27 18:03:26 +00001972 if (v == NULL) {
1973 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001974 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001975 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001976 v = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001977 }
1978 else {
1979 /* Cut back to size actually needed. */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001980 nneeded = p - PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001981 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001982 PyBytes_Resize(v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001983 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001984 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001985
Tim Peters602f7402002-04-27 18:03:26 +00001986#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001987}
1988
Guido van Rossumd57fd912000-03-10 22:53:23 +00001989PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1990{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001991 if (!PyUnicode_Check(unicode)) {
1992 PyErr_BadArgument();
1993 return NULL;
1994 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001995 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1996 PyUnicode_GET_SIZE(unicode),
1997 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001998}
1999
Walter Dörwald41980ca2007-08-16 21:55:45 +00002000/* --- UTF-32 Codec ------------------------------------------------------- */
2001
2002PyObject *
2003PyUnicode_DecodeUTF32(const char *s,
2004 Py_ssize_t size,
2005 const char *errors,
2006 int *byteorder)
2007{
2008 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2009}
2010
2011PyObject *
2012PyUnicode_DecodeUTF32Stateful(const char *s,
2013 Py_ssize_t size,
2014 const char *errors,
2015 int *byteorder,
2016 Py_ssize_t *consumed)
2017{
2018 const char *starts = s;
2019 Py_ssize_t startinpos;
2020 Py_ssize_t endinpos;
2021 Py_ssize_t outpos;
2022 PyUnicodeObject *unicode;
2023 Py_UNICODE *p;
2024#ifndef Py_UNICODE_WIDE
2025 int i, pairs;
2026#else
2027 const int pairs = 0;
2028#endif
2029 const unsigned char *q, *e;
2030 int bo = 0; /* assume native ordering by default */
2031 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002032 /* Offsets from q for retrieving bytes in the right order. */
2033#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2034 int iorder[] = {0, 1, 2, 3};
2035#else
2036 int iorder[] = {3, 2, 1, 0};
2037#endif
2038 PyObject *errorHandler = NULL;
2039 PyObject *exc = NULL;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002040 /* On narrow builds we split characters outside the BMP into two
2041 codepoints => count how much extra space we need. */
2042#ifndef Py_UNICODE_WIDE
2043 for (i = pairs = 0; i < size/4; i++)
2044 if (((Py_UCS4 *)s)[i] >= 0x10000)
2045 pairs++;
2046#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00002047
2048 /* This might be one to much, because of a BOM */
2049 unicode = _PyUnicode_New((size+3)/4+pairs);
2050 if (!unicode)
2051 return NULL;
2052 if (size == 0)
2053 return (PyObject *)unicode;
2054
2055 /* Unpack UTF-32 encoded data */
2056 p = unicode->str;
2057 q = (unsigned char *)s;
2058 e = q + size;
2059
2060 if (byteorder)
2061 bo = *byteorder;
2062
2063 /* Check for BOM marks (U+FEFF) in the input and adjust current
2064 byte order setting accordingly. In native mode, the leading BOM
2065 mark is skipped, in all other modes, it is copied to the output
2066 stream as-is (giving a ZWNBSP character). */
2067 if (bo == 0) {
2068 if (size >= 4) {
2069 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2070 (q[iorder[1]] << 8) | q[iorder[0]];
2071#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2072 if (bom == 0x0000FEFF) {
2073 q += 4;
2074 bo = -1;
2075 }
2076 else if (bom == 0xFFFE0000) {
2077 q += 4;
2078 bo = 1;
2079 }
2080#else
2081 if (bom == 0x0000FEFF) {
2082 q += 4;
2083 bo = 1;
2084 }
2085 else if (bom == 0xFFFE0000) {
2086 q += 4;
2087 bo = -1;
2088 }
2089#endif
2090 }
2091 }
2092
2093 if (bo == -1) {
2094 /* force LE */
2095 iorder[0] = 0;
2096 iorder[1] = 1;
2097 iorder[2] = 2;
2098 iorder[3] = 3;
2099 }
2100 else if (bo == 1) {
2101 /* force BE */
2102 iorder[0] = 3;
2103 iorder[1] = 2;
2104 iorder[2] = 1;
2105 iorder[3] = 0;
2106 }
2107
2108 while (q < e) {
2109 Py_UCS4 ch;
2110 /* remaining bytes at the end? (size should be divisible by 4) */
2111 if (e-q<4) {
2112 if (consumed)
2113 break;
2114 errmsg = "truncated data";
2115 startinpos = ((const char *)q)-starts;
2116 endinpos = ((const char *)e)-starts;
2117 goto utf32Error;
2118 /* The remaining input chars are ignored if the callback
2119 chooses to skip the input */
2120 }
2121 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2122 (q[iorder[1]] << 8) | q[iorder[0]];
2123
2124 if (ch >= 0x110000)
2125 {
2126 errmsg = "codepoint not in range(0x110000)";
2127 startinpos = ((const char *)q)-starts;
2128 endinpos = startinpos+4;
2129 goto utf32Error;
2130 }
2131#ifndef Py_UNICODE_WIDE
2132 if (ch >= 0x10000)
2133 {
2134 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2135 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2136 }
2137 else
2138#endif
2139 *p++ = ch;
2140 q += 4;
2141 continue;
2142 utf32Error:
2143 outpos = p-PyUnicode_AS_UNICODE(unicode);
2144 if (unicode_decode_call_errorhandler(
2145 errors, &errorHandler,
2146 "utf32", errmsg,
2147 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2148 (PyObject **)&unicode, &outpos, &p))
2149 goto onError;
2150 }
2151
2152 if (byteorder)
2153 *byteorder = bo;
2154
2155 if (consumed)
2156 *consumed = (const char *)q-starts;
2157
2158 /* Adjust length */
2159 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2160 goto onError;
2161
2162 Py_XDECREF(errorHandler);
2163 Py_XDECREF(exc);
2164 return (PyObject *)unicode;
2165
2166onError:
2167 Py_DECREF(unicode);
2168 Py_XDECREF(errorHandler);
2169 Py_XDECREF(exc);
2170 return NULL;
2171}
2172
2173PyObject *
2174PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2175 Py_ssize_t size,
2176 const char *errors,
2177 int byteorder)
2178{
2179 PyObject *v;
2180 unsigned char *p;
2181#ifndef Py_UNICODE_WIDE
2182 int i, pairs;
2183#else
2184 const int pairs = 0;
2185#endif
2186 /* Offsets from p for storing byte pairs in the right order. */
2187#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2188 int iorder[] = {0, 1, 2, 3};
2189#else
2190 int iorder[] = {3, 2, 1, 0};
2191#endif
2192
2193#define STORECHAR(CH) \
2194 do { \
2195 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2196 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2197 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2198 p[iorder[0]] = (CH) & 0xff; \
2199 p += 4; \
2200 } while(0)
2201
2202 /* In narrow builds we can output surrogate pairs as one codepoint,
2203 so we need less space. */
2204#ifndef Py_UNICODE_WIDE
2205 for (i = pairs = 0; i < size-1; i++)
2206 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2207 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2208 pairs++;
2209#endif
2210 v = PyBytes_FromStringAndSize(NULL,
2211 4 * (size - pairs + (byteorder == 0)));
2212 if (v == NULL)
2213 return NULL;
2214
2215 p = (unsigned char *)PyBytes_AS_STRING(v);
2216 if (byteorder == 0)
2217 STORECHAR(0xFEFF);
2218 if (size == 0)
2219 return v;
2220
2221 if (byteorder == -1) {
2222 /* force LE */
2223 iorder[0] = 0;
2224 iorder[1] = 1;
2225 iorder[2] = 2;
2226 iorder[3] = 3;
2227 }
2228 else if (byteorder == 1) {
2229 /* force BE */
2230 iorder[0] = 3;
2231 iorder[1] = 2;
2232 iorder[2] = 1;
2233 iorder[3] = 0;
2234 }
2235
2236 while (size-- > 0) {
2237 Py_UCS4 ch = *s++;
2238#ifndef Py_UNICODE_WIDE
2239 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2240 Py_UCS4 ch2 = *s;
2241 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2242 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2243 s++;
2244 size--;
2245 }
2246 }
2247#endif
2248 STORECHAR(ch);
2249 }
2250 return v;
2251#undef STORECHAR
2252}
2253
2254PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2255{
2256 if (!PyUnicode_Check(unicode)) {
2257 PyErr_BadArgument();
2258 return NULL;
2259 }
2260 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2261 PyUnicode_GET_SIZE(unicode),
2262 NULL,
2263 0);
2264}
2265
Guido van Rossumd57fd912000-03-10 22:53:23 +00002266/* --- UTF-16 Codec ------------------------------------------------------- */
2267
Tim Peters772747b2001-08-09 22:21:55 +00002268PyObject *
2269PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002270 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002271 const char *errors,
2272 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002273{
Walter Dörwald69652032004-09-07 20:24:22 +00002274 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2275}
2276
2277PyObject *
2278PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002279 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002280 const char *errors,
2281 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002282 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002283{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002284 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002285 Py_ssize_t startinpos;
2286 Py_ssize_t endinpos;
2287 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002288 PyUnicodeObject *unicode;
2289 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002290 const unsigned char *q, *e;
2291 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002292 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002293 /* Offsets from q for retrieving byte pairs in the right order. */
2294#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2295 int ihi = 1, ilo = 0;
2296#else
2297 int ihi = 0, ilo = 1;
2298#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002299 PyObject *errorHandler = NULL;
2300 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002301
2302 /* Note: size will always be longer than the resulting Unicode
2303 character count */
2304 unicode = _PyUnicode_New(size);
2305 if (!unicode)
2306 return NULL;
2307 if (size == 0)
2308 return (PyObject *)unicode;
2309
2310 /* Unpack UTF-16 encoded data */
2311 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002312 q = (unsigned char *)s;
2313 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002314
2315 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002316 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002317
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002318 /* Check for BOM marks (U+FEFF) in the input and adjust current
2319 byte order setting accordingly. In native mode, the leading BOM
2320 mark is skipped, in all other modes, it is copied to the output
2321 stream as-is (giving a ZWNBSP character). */
2322 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002323 if (size >= 2) {
2324 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002325#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002326 if (bom == 0xFEFF) {
2327 q += 2;
2328 bo = -1;
2329 }
2330 else if (bom == 0xFFFE) {
2331 q += 2;
2332 bo = 1;
2333 }
Tim Petersced69f82003-09-16 20:30:58 +00002334#else
Walter Dörwald69652032004-09-07 20:24:22 +00002335 if (bom == 0xFEFF) {
2336 q += 2;
2337 bo = 1;
2338 }
2339 else if (bom == 0xFFFE) {
2340 q += 2;
2341 bo = -1;
2342 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002343#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002344 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002345 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002346
Tim Peters772747b2001-08-09 22:21:55 +00002347 if (bo == -1) {
2348 /* force LE */
2349 ihi = 1;
2350 ilo = 0;
2351 }
2352 else if (bo == 1) {
2353 /* force BE */
2354 ihi = 0;
2355 ilo = 1;
2356 }
2357
2358 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002359 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002360 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002361 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002362 if (consumed)
2363 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002364 errmsg = "truncated data";
2365 startinpos = ((const char *)q)-starts;
2366 endinpos = ((const char *)e)-starts;
2367 goto utf16Error;
2368 /* The remaining input chars are ignored if the callback
2369 chooses to skip the input */
2370 }
2371 ch = (q[ihi] << 8) | q[ilo];
2372
Tim Peters772747b2001-08-09 22:21:55 +00002373 q += 2;
2374
Guido van Rossumd57fd912000-03-10 22:53:23 +00002375 if (ch < 0xD800 || ch > 0xDFFF) {
2376 *p++ = ch;
2377 continue;
2378 }
2379
2380 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002381 if (q >= e) {
2382 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002383 startinpos = (((const char *)q)-2)-starts;
2384 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002385 goto utf16Error;
2386 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002387 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002388 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2389 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002390 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002391#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002392 *p++ = ch;
2393 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002394#else
2395 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002396#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002397 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002398 }
2399 else {
2400 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002401 startinpos = (((const char *)q)-4)-starts;
2402 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002403 goto utf16Error;
2404 }
2405
Guido van Rossumd57fd912000-03-10 22:53:23 +00002406 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002407 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002408 startinpos = (((const char *)q)-2)-starts;
2409 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002410 /* Fall through to report the error */
2411
2412 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002413 outpos = p-PyUnicode_AS_UNICODE(unicode);
2414 if (unicode_decode_call_errorhandler(
2415 errors, &errorHandler,
2416 "utf16", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002417 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002418 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002419 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002420 }
2421
2422 if (byteorder)
2423 *byteorder = bo;
2424
Walter Dörwald69652032004-09-07 20:24:22 +00002425 if (consumed)
2426 *consumed = (const char *)q-starts;
2427
Guido van Rossumd57fd912000-03-10 22:53:23 +00002428 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002429 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002430 goto onError;
2431
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002432 Py_XDECREF(errorHandler);
2433 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002434 return (PyObject *)unicode;
2435
2436onError:
2437 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002438 Py_XDECREF(errorHandler);
2439 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002440 return NULL;
2441}
2442
Tim Peters772747b2001-08-09 22:21:55 +00002443PyObject *
2444PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002445 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002446 const char *errors,
2447 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002448{
2449 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002450 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002451#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00002452 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002453#else
2454 const int pairs = 0;
2455#endif
Tim Peters772747b2001-08-09 22:21:55 +00002456 /* Offsets from p for storing byte pairs in the right order. */
2457#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2458 int ihi = 1, ilo = 0;
2459#else
2460 int ihi = 0, ilo = 1;
2461#endif
2462
2463#define STORECHAR(CH) \
2464 do { \
2465 p[ihi] = ((CH) >> 8) & 0xff; \
2466 p[ilo] = (CH) & 0xff; \
2467 p += 2; \
2468 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002469
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002470#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002471 for (i = pairs = 0; i < size; i++)
2472 if (s[i] >= 0x10000)
2473 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002474#endif
Walter Dörwald3cc34522007-05-04 10:48:27 +00002475 v = PyBytes_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002476 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002477 if (v == NULL)
2478 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002479
Walter Dörwald3cc34522007-05-04 10:48:27 +00002480 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002481 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002482 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002483 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002484 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002485
2486 if (byteorder == -1) {
2487 /* force LE */
2488 ihi = 1;
2489 ilo = 0;
2490 }
2491 else if (byteorder == 1) {
2492 /* force BE */
2493 ihi = 0;
2494 ilo = 1;
2495 }
2496
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002497 while (size-- > 0) {
2498 Py_UNICODE ch = *s++;
2499 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002500#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002501 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002502 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2503 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002504 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002505#endif
Tim Peters772747b2001-08-09 22:21:55 +00002506 STORECHAR(ch);
2507 if (ch2)
2508 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002509 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002510 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002511#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002512}
2513
2514PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2515{
2516 if (!PyUnicode_Check(unicode)) {
2517 PyErr_BadArgument();
2518 return NULL;
2519 }
2520 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2521 PyUnicode_GET_SIZE(unicode),
2522 NULL,
2523 0);
2524}
2525
2526/* --- Unicode Escape Codec ----------------------------------------------- */
2527
Fredrik Lundh06d12682001-01-24 07:59:11 +00002528static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002529
Guido van Rossumd57fd912000-03-10 22:53:23 +00002530PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002531 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002532 const char *errors)
2533{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002534 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002535 Py_ssize_t startinpos;
2536 Py_ssize_t endinpos;
2537 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002538 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002539 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002540 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002541 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002542 char* message;
2543 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002544 PyObject *errorHandler = NULL;
2545 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002546
Guido van Rossumd57fd912000-03-10 22:53:23 +00002547 /* Escaped strings will always be longer than the resulting
2548 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002549 length after conversion to the true value.
2550 (but if the error callback returns a long replacement string
2551 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002552 v = _PyUnicode_New(size);
2553 if (v == NULL)
2554 goto onError;
2555 if (size == 0)
2556 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002557
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002558 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002559 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002560
Guido van Rossumd57fd912000-03-10 22:53:23 +00002561 while (s < end) {
2562 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002563 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002564 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002565
2566 /* Non-escape characters are interpreted as Unicode ordinals */
2567 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002568 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002569 continue;
2570 }
2571
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002572 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002573 /* \ - Escapes */
2574 s++;
2575 switch (*s++) {
2576
2577 /* \x escapes */
2578 case '\n': break;
2579 case '\\': *p++ = '\\'; break;
2580 case '\'': *p++ = '\''; break;
2581 case '\"': *p++ = '\"'; break;
2582 case 'b': *p++ = '\b'; break;
2583 case 'f': *p++ = '\014'; break; /* FF */
2584 case 't': *p++ = '\t'; break;
2585 case 'n': *p++ = '\n'; break;
2586 case 'r': *p++ = '\r'; break;
2587 case 'v': *p++ = '\013'; break; /* VT */
2588 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2589
2590 /* \OOO (octal) escapes */
2591 case '0': case '1': case '2': case '3':
2592 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002593 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002594 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002595 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002596 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002597 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002598 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002599 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002600 break;
2601
Fredrik Lundhccc74732001-02-18 22:13:49 +00002602 /* hex escapes */
2603 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002604 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002605 digits = 2;
2606 message = "truncated \\xXX escape";
2607 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002608
Fredrik Lundhccc74732001-02-18 22:13:49 +00002609 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002610 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002611 digits = 4;
2612 message = "truncated \\uXXXX escape";
2613 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002614
Fredrik Lundhccc74732001-02-18 22:13:49 +00002615 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002616 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002617 digits = 8;
2618 message = "truncated \\UXXXXXXXX escape";
2619 hexescape:
2620 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002621 outpos = p-PyUnicode_AS_UNICODE(v);
2622 if (s+digits>end) {
2623 endinpos = size;
2624 if (unicode_decode_call_errorhandler(
2625 errors, &errorHandler,
2626 "unicodeescape", "end of string in escape sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002627 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002628 (PyObject **)&v, &outpos, &p))
2629 goto onError;
2630 goto nextByte;
2631 }
2632 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002633 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002634 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002635 endinpos = (s+i+1)-starts;
2636 if (unicode_decode_call_errorhandler(
2637 errors, &errorHandler,
2638 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002639 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002640 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002641 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002642 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002643 }
2644 chr = (chr<<4) & ~0xF;
2645 if (c >= '0' && c <= '9')
2646 chr += c - '0';
2647 else if (c >= 'a' && c <= 'f')
2648 chr += 10 + c - 'a';
2649 else
2650 chr += 10 + c - 'A';
2651 }
2652 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002653 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002654 /* _decoding_error will have already written into the
2655 target buffer. */
2656 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002657 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002658 /* when we get here, chr is a 32-bit unicode character */
2659 if (chr <= 0xffff)
2660 /* UCS-2 character */
2661 *p++ = (Py_UNICODE) chr;
2662 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002663 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002664 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002665#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002666 *p++ = chr;
2667#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002668 chr -= 0x10000L;
2669 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002670 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002671#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002672 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002673 endinpos = s-starts;
2674 outpos = p-PyUnicode_AS_UNICODE(v);
2675 if (unicode_decode_call_errorhandler(
2676 errors, &errorHandler,
2677 "unicodeescape", "illegal Unicode character",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002678 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002679 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002680 goto onError;
2681 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002682 break;
2683
2684 /* \N{name} */
2685 case 'N':
2686 message = "malformed \\N character escape";
2687 if (ucnhash_CAPI == NULL) {
2688 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002689 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002690 m = PyImport_ImportModule("unicodedata");
2691 if (m == NULL)
2692 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002693 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002694 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002695 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002696 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002697 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002698 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002699 if (ucnhash_CAPI == NULL)
2700 goto ucnhashError;
2701 }
2702 if (*s == '{') {
2703 const char *start = s+1;
2704 /* look for the closing brace */
2705 while (*s != '}' && s < end)
2706 s++;
2707 if (s > start && s < end && *s == '}') {
2708 /* found a name. look it up in the unicode database */
2709 message = "unknown Unicode character name";
2710 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002711 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002712 goto store;
2713 }
2714 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002715 endinpos = s-starts;
2716 outpos = p-PyUnicode_AS_UNICODE(v);
2717 if (unicode_decode_call_errorhandler(
2718 errors, &errorHandler,
2719 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002720 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002721 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002722 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002723 break;
2724
2725 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002726 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002727 message = "\\ at end of string";
2728 s--;
2729 endinpos = s-starts;
2730 outpos = p-PyUnicode_AS_UNICODE(v);
2731 if (unicode_decode_call_errorhandler(
2732 errors, &errorHandler,
2733 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002734 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002735 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002736 goto onError;
2737 }
2738 else {
2739 *p++ = '\\';
2740 *p++ = (unsigned char)s[-1];
2741 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002742 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002743 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002744 nextByte:
2745 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002746 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002747 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002748 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002749 Py_XDECREF(errorHandler);
2750 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002751 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002752
Fredrik Lundhccc74732001-02-18 22:13:49 +00002753ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002754 PyErr_SetString(
2755 PyExc_UnicodeError,
2756 "\\N escapes not supported (can't load unicodedata module)"
2757 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002758 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002759 Py_XDECREF(errorHandler);
2760 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002761 return NULL;
2762
Fredrik Lundhccc74732001-02-18 22:13:49 +00002763onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002764 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002765 Py_XDECREF(errorHandler);
2766 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002767 return NULL;
2768}
2769
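/* Return a Unicode-Escape string version of the Unicode object.

   This comment originally said: "If quotes is true, the string is
   enclosed in u"" or u'' quotes as appropriate."  That sentence is stale:
   PyUnicode_EncodeUnicodeEscape() below takes no `quotes` argument and
   never emits surrounding quotes.

*/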
2776
Thomas Wouters477c8d52006-05-27 19:21:47 +00002777Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2778 Py_ssize_t size,
2779 Py_UNICODE ch)
2780{
2781 /* like wcschr, but doesn't stop at NULL characters */
2782
2783 while (size-- > 0) {
2784 if (*s == ch)
2785 return s;
2786 s++;
2787 }
2788
2789 return NULL;
2790}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002791
Walter Dörwald79e913e2007-05-12 11:08:06 +00002792static const char *hexdigits = "0123456789abcdef";
2793
2794PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2795 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002796{
2797 PyObject *repr;
2798 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002799
Thomas Wouters89f507f2006-12-13 04:49:30 +00002800 /* XXX(nnorwitz): rather than over-allocating, it would be
2801 better to choose a different scheme. Perhaps scan the
2802 first N-chars of the string and allocate based on that size.
2803 */
2804 /* Initial allocation is based on the longest-possible unichr
2805 escape.
2806
2807 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2808 unichr, so in this case it's the longest unichr escape. In
2809 narrow (UTF-16) builds this is five chars per source unichr
2810 since there are two unichrs in the surrogate pair, so in narrow
2811 (UTF-16) builds it's not the longest unichr escape.
2812
2813 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2814 so in the narrow (UTF-16) build case it's the longest unichr
2815 escape.
2816 */
2817
Walter Dörwald79e913e2007-05-12 11:08:06 +00002818 repr = PyBytes_FromStringAndSize(NULL,
Thomas Wouters89f507f2006-12-13 04:49:30 +00002819#ifdef Py_UNICODE_WIDE
2820 + 10*size
2821#else
2822 + 6*size
2823#endif
2824 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002825 if (repr == NULL)
2826 return NULL;
2827
Walter Dörwald79e913e2007-05-12 11:08:06 +00002828 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002829
Guido van Rossumd57fd912000-03-10 22:53:23 +00002830 while (size-- > 0) {
2831 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002832
Walter Dörwald79e913e2007-05-12 11:08:06 +00002833 /* Escape backslashes */
2834 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002835 *p++ = '\\';
2836 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00002837 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002838 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002839
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002840#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002841 /* Map 21-bit characters to '\U00xxxxxx' */
2842 else if (ch >= 0x10000) {
2843 *p++ = '\\';
2844 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002845 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
2846 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
2847 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
2848 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
2849 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
2850 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
2851 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
2852 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002853 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002854 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002855#else
2856 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002857 else if (ch >= 0xD800 && ch < 0xDC00) {
2858 Py_UNICODE ch2;
2859 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002860
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002861 ch2 = *s++;
2862 size--;
2863 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2864 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2865 *p++ = '\\';
2866 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002867 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
2868 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
2869 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
2870 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
2871 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
2872 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
2873 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
2874 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002875 continue;
2876 }
2877 /* Fall through: isolated surrogates are copied as-is */
2878 s--;
2879 size++;
2880 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002881#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002882
Guido van Rossumd57fd912000-03-10 22:53:23 +00002883 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002884 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002885 *p++ = '\\';
2886 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002887 *p++ = hexdigits[(ch >> 12) & 0x000F];
2888 *p++ = hexdigits[(ch >> 8) & 0x000F];
2889 *p++ = hexdigits[(ch >> 4) & 0x000F];
2890 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002891 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002892
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002893 /* Map special whitespace to '\t', \n', '\r' */
2894 else if (ch == '\t') {
2895 *p++ = '\\';
2896 *p++ = 't';
2897 }
2898 else if (ch == '\n') {
2899 *p++ = '\\';
2900 *p++ = 'n';
2901 }
2902 else if (ch == '\r') {
2903 *p++ = '\\';
2904 *p++ = 'r';
2905 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002906
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002907 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002908 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002909 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002910 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002911 *p++ = hexdigits[(ch >> 4) & 0x000F];
2912 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002913 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002914
Guido van Rossumd57fd912000-03-10 22:53:23 +00002915 /* Copy everything else as-is */
2916 else
2917 *p++ = (char) ch;
2918 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002919
2920 *p = '\0';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002921 if (PyBytes_Resize(repr, p - PyBytes_AS_STRING(repr))) {
2922 Py_DECREF(repr);
2923 return NULL;
2924 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002925 return repr;
2926}
2927
Guido van Rossumd57fd912000-03-10 22:53:23 +00002928PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2929{
Walter Dörwald79e913e2007-05-12 11:08:06 +00002930 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002931 if (!PyUnicode_Check(unicode)) {
2932 PyErr_BadArgument();
2933 return NULL;
2934 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00002935 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2936 PyUnicode_GET_SIZE(unicode));
2937
2938 if (!s)
2939 return NULL;
2940 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2941 PyBytes_GET_SIZE(s));
2942 Py_DECREF(s);
2943 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002944}
2945
2946/* --- Raw Unicode Escape Codec ------------------------------------------- */
2947
2948PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002949 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002950 const char *errors)
2951{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002952 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002953 Py_ssize_t startinpos;
2954 Py_ssize_t endinpos;
2955 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002956 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002957 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002958 const char *end;
2959 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002960 PyObject *errorHandler = NULL;
2961 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002962
Guido van Rossumd57fd912000-03-10 22:53:23 +00002963 /* Escaped strings will always be longer than the resulting
2964 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002965 length after conversion to the true value. (But decoding error
2966 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002967 v = _PyUnicode_New(size);
2968 if (v == NULL)
2969 goto onError;
2970 if (size == 0)
2971 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002972 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002973 end = s + size;
2974 while (s < end) {
2975 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002976 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002977 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002978 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002979
2980 /* Non-escape characters are interpreted as Unicode ordinals */
2981 if (*s != '\\') {
2982 *p++ = (unsigned char)*s++;
2983 continue;
2984 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002985 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002986
2987 /* \u-escapes are only interpreted iff the number of leading
2988 backslashes if odd */
2989 bs = s;
2990 for (;s < end;) {
2991 if (*s != '\\')
2992 break;
2993 *p++ = (unsigned char)*s++;
2994 }
2995 if (((s - bs) & 1) == 0 ||
2996 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002997 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002998 continue;
2999 }
3000 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003001 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003002 s++;
3003
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003004 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003005 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003006 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003007 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003008 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003009 endinpos = s-starts;
3010 if (unicode_decode_call_errorhandler(
3011 errors, &errorHandler,
3012 "rawunicodeescape", "truncated \\uXXXX",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003013 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003014 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003015 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003016 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003017 }
3018 x = (x<<4) & ~0xF;
3019 if (c >= '0' && c <= '9')
3020 x += c - '0';
3021 else if (c >= 'a' && c <= 'f')
3022 x += 10 + c - 'a';
3023 else
3024 x += 10 + c - 'A';
3025 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003026#ifndef Py_UNICODE_WIDE
3027 if (x > 0x10000) {
3028 if (unicode_decode_call_errorhandler(
3029 errors, &errorHandler,
3030 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003031 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003032 (PyObject **)&v, &outpos, &p))
3033 goto onError;
3034 }
3035#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003036 *p++ = x;
3037 nextByte:
3038 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003039 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003040 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003041 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003042 Py_XDECREF(errorHandler);
3043 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003044 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003045
Guido van Rossumd57fd912000-03-10 22:53:23 +00003046 onError:
3047 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003048 Py_XDECREF(errorHandler);
3049 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003050 return NULL;
3051}
3052
3053PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003054 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003055{
3056 PyObject *repr;
3057 char *p;
3058 char *q;
3059
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003060#ifdef Py_UNICODE_WIDE
Walter Dörwald711005d2007-05-12 12:03:26 +00003061 repr = PyBytes_FromStringAndSize(NULL, 10 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003062#else
Walter Dörwald711005d2007-05-12 12:03:26 +00003063 repr = PyBytes_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003064#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003065 if (repr == NULL)
3066 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003067 if (size == 0)
3068 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003069
Walter Dörwald711005d2007-05-12 12:03:26 +00003070 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003071 while (size-- > 0) {
3072 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003073#ifdef Py_UNICODE_WIDE
3074 /* Map 32-bit characters to '\Uxxxxxxxx' */
3075 if (ch >= 0x10000) {
3076 *p++ = '\\';
3077 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003078 *p++ = hexdigits[(ch >> 28) & 0xf];
3079 *p++ = hexdigits[(ch >> 24) & 0xf];
3080 *p++ = hexdigits[(ch >> 20) & 0xf];
3081 *p++ = hexdigits[(ch >> 16) & 0xf];
3082 *p++ = hexdigits[(ch >> 12) & 0xf];
3083 *p++ = hexdigits[(ch >> 8) & 0xf];
3084 *p++ = hexdigits[(ch >> 4) & 0xf];
3085 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003086 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003087 else
3088#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003089 /* Map 16-bit characters to '\uxxxx' */
3090 if (ch >= 256) {
3091 *p++ = '\\';
3092 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003093 *p++ = hexdigits[(ch >> 12) & 0xf];
3094 *p++ = hexdigits[(ch >> 8) & 0xf];
3095 *p++ = hexdigits[(ch >> 4) & 0xf];
3096 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003097 }
3098 /* Copy everything else as-is */
3099 else
3100 *p++ = (char) ch;
3101 }
3102 *p = '\0';
Walter Dörwald711005d2007-05-12 12:03:26 +00003103 if (PyBytes_Resize(repr, p - q)) {
3104 Py_DECREF(repr);
3105 return NULL;
3106 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003107 return repr;
3108}
3109
3110PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3111{
Walter Dörwald711005d2007-05-12 12:03:26 +00003112 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003113 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003114 PyErr_BadArgument();
3115 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003116 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003117 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3118 PyUnicode_GET_SIZE(unicode));
3119
3120 if (!s)
3121 return NULL;
3122 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
3123 PyBytes_GET_SIZE(s));
3124 Py_DECREF(s);
3125 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003126}
3127
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003128/* --- Unicode Internal Codec ------------------------------------------- */
3129
3130PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003131 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003132 const char *errors)
3133{
3134 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003135 Py_ssize_t startinpos;
3136 Py_ssize_t endinpos;
3137 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003138 PyUnicodeObject *v;
3139 Py_UNICODE *p;
3140 const char *end;
3141 const char *reason;
3142 PyObject *errorHandler = NULL;
3143 PyObject *exc = NULL;
3144
Neal Norwitzd43069c2006-01-08 01:12:10 +00003145#ifdef Py_UNICODE_WIDE
3146 Py_UNICODE unimax = PyUnicode_GetMax();
3147#endif
3148
Thomas Wouters89f507f2006-12-13 04:49:30 +00003149 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003150 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3151 if (v == NULL)
3152 goto onError;
3153 if (PyUnicode_GetSize((PyObject *)v) == 0)
3154 return (PyObject *)v;
3155 p = PyUnicode_AS_UNICODE(v);
3156 end = s + size;
3157
3158 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00003159 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003160 /* We have to sanity check the raw data, otherwise doom looms for
3161 some malformed UCS-4 data. */
3162 if (
3163 #ifdef Py_UNICODE_WIDE
3164 *p > unimax || *p < 0 ||
3165 #endif
3166 end-s < Py_UNICODE_SIZE
3167 )
3168 {
3169 startinpos = s - starts;
3170 if (end-s < Py_UNICODE_SIZE) {
3171 endinpos = end-starts;
3172 reason = "truncated input";
3173 }
3174 else {
3175 endinpos = s - starts + Py_UNICODE_SIZE;
3176 reason = "illegal code point (> 0x10FFFF)";
3177 }
3178 outpos = p - PyUnicode_AS_UNICODE(v);
3179 if (unicode_decode_call_errorhandler(
3180 errors, &errorHandler,
3181 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003182 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003183 (PyObject **)&v, &outpos, &p)) {
3184 goto onError;
3185 }
3186 }
3187 else {
3188 p++;
3189 s += Py_UNICODE_SIZE;
3190 }
3191 }
3192
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003193 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003194 goto onError;
3195 Py_XDECREF(errorHandler);
3196 Py_XDECREF(exc);
3197 return (PyObject *)v;
3198
3199 onError:
3200 Py_XDECREF(v);
3201 Py_XDECREF(errorHandler);
3202 Py_XDECREF(exc);
3203 return NULL;
3204}
3205
Guido van Rossumd57fd912000-03-10 22:53:23 +00003206/* --- Latin-1 Codec ------------------------------------------------------ */
3207
3208PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003209 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003210 const char *errors)
3211{
3212 PyUnicodeObject *v;
3213 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003214
Guido van Rossumd57fd912000-03-10 22:53:23 +00003215 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003216 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003217 Py_UNICODE r = *(unsigned char*)s;
3218 return PyUnicode_FromUnicode(&r, 1);
3219 }
3220
Guido van Rossumd57fd912000-03-10 22:53:23 +00003221 v = _PyUnicode_New(size);
3222 if (v == NULL)
3223 goto onError;
3224 if (size == 0)
3225 return (PyObject *)v;
3226 p = PyUnicode_AS_UNICODE(v);
3227 while (size-- > 0)
3228 *p++ = (unsigned char)*s++;
3229 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003230
Guido van Rossumd57fd912000-03-10 22:53:23 +00003231 onError:
3232 Py_XDECREF(v);
3233 return NULL;
3234}
3235
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003236/* create or adjust a UnicodeEncodeError */
3237static void make_encode_exception(PyObject **exceptionObject,
3238 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003239 const Py_UNICODE *unicode, Py_ssize_t size,
3240 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003241 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003242{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003243 if (*exceptionObject == NULL) {
3244 *exceptionObject = PyUnicodeEncodeError_Create(
3245 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003246 }
3247 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003248 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3249 goto onError;
3250 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3251 goto onError;
3252 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3253 goto onError;
3254 return;
3255 onError:
3256 Py_DECREF(*exceptionObject);
3257 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003258 }
3259}
3260
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003261/* raises a UnicodeEncodeError */
3262static void raise_encode_exception(PyObject **exceptionObject,
3263 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003264 const Py_UNICODE *unicode, Py_ssize_t size,
3265 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003266 const char *reason)
3267{
3268 make_encode_exception(exceptionObject,
3269 encoding, unicode, size, startpos, endpos, reason);
3270 if (*exceptionObject != NULL)
3271 PyCodec_StrictErrors(*exceptionObject);
3272}
3273
3274/* error handling callback helper:
3275 build arguments, call the callback and check the arguments,
3276 put the result into newpos and return the replacement string, which
3277 has to be freed by the caller */
3278static PyObject *unicode_encode_call_errorhandler(const char *errors,
3279 PyObject **errorHandler,
3280 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003281 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3282 Py_ssize_t startpos, Py_ssize_t endpos,
3283 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003284{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003285 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003286
3287 PyObject *restuple;
3288 PyObject *resunicode;
3289
3290 if (*errorHandler == NULL) {
3291 *errorHandler = PyCodec_LookupError(errors);
3292 if (*errorHandler == NULL)
3293 return NULL;
3294 }
3295
3296 make_encode_exception(exceptionObject,
3297 encoding, unicode, size, startpos, endpos, reason);
3298 if (*exceptionObject == NULL)
3299 return NULL;
3300
3301 restuple = PyObject_CallFunctionObjArgs(
3302 *errorHandler, *exceptionObject, NULL);
3303 if (restuple == NULL)
3304 return NULL;
3305 if (!PyTuple_Check(restuple)) {
3306 PyErr_Format(PyExc_TypeError, &argparse[4]);
3307 Py_DECREF(restuple);
3308 return NULL;
3309 }
3310 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3311 &resunicode, newpos)) {
3312 Py_DECREF(restuple);
3313 return NULL;
3314 }
3315 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003316 *newpos = size+*newpos;
3317 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003318 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003319 Py_DECREF(restuple);
3320 return NULL;
3321 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003322 Py_INCREF(resunicode);
3323 Py_DECREF(restuple);
3324 return resunicode;
3325}
3326
3327static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003328 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003329 const char *errors,
3330 int limit)
3331{
3332 /* output object */
3333 PyObject *res;
3334 /* pointers to the beginning and end+1 of input */
3335 const Py_UNICODE *startp = p;
3336 const Py_UNICODE *endp = p + size;
3337 /* pointer to the beginning of the unencodable characters */
3338 /* const Py_UNICODE *badp = NULL; */
3339 /* pointer into the output */
3340 char *str;
3341 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003342 Py_ssize_t respos = 0;
3343 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003344 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3345 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003346 PyObject *errorHandler = NULL;
3347 PyObject *exc = NULL;
3348 /* the following variable is used for caching string comparisons
3349 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3350 int known_errorHandler = -1;
3351
3352 /* allocate enough for a simple encoding without
3353 replacements, if we need more, we'll resize */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003354 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003355 if (res == NULL)
3356 goto onError;
3357 if (size == 0)
3358 return res;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003359 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003360 ressize = size;
3361
3362 while (p<endp) {
3363 Py_UNICODE c = *p;
3364
3365 /* can we encode this? */
3366 if (c<limit) {
3367 /* no overflow check, because we know that the space is enough */
3368 *str++ = (char)c;
3369 ++p;
3370 }
3371 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003372 Py_ssize_t unicodepos = p-startp;
3373 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003374 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003375 Py_ssize_t repsize;
3376 Py_ssize_t newpos;
3377 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003378 Py_UNICODE *uni2;
3379 /* startpos for collecting unencodable chars */
3380 const Py_UNICODE *collstart = p;
3381 const Py_UNICODE *collend = p;
3382 /* find all unecodable characters */
3383 while ((collend < endp) && ((*collend)>=limit))
3384 ++collend;
3385 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3386 if (known_errorHandler==-1) {
3387 if ((errors==NULL) || (!strcmp(errors, "strict")))
3388 known_errorHandler = 1;
3389 else if (!strcmp(errors, "replace"))
3390 known_errorHandler = 2;
3391 else if (!strcmp(errors, "ignore"))
3392 known_errorHandler = 3;
3393 else if (!strcmp(errors, "xmlcharrefreplace"))
3394 known_errorHandler = 4;
3395 else
3396 known_errorHandler = 0;
3397 }
3398 switch (known_errorHandler) {
3399 case 1: /* strict */
3400 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3401 goto onError;
3402 case 2: /* replace */
3403 while (collstart++<collend)
3404 *str++ = '?'; /* fall through */
3405 case 3: /* ignore */
3406 p = collend;
3407 break;
3408 case 4: /* xmlcharrefreplace */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003409 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003410 /* determine replacement size (temporarily (mis)uses p) */
3411 for (p = collstart, repsize = 0; p < collend; ++p) {
3412 if (*p<10)
3413 repsize += 2+1+1;
3414 else if (*p<100)
3415 repsize += 2+2+1;
3416 else if (*p<1000)
3417 repsize += 2+3+1;
3418 else if (*p<10000)
3419 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003420#ifndef Py_UNICODE_WIDE
3421 else
3422 repsize += 2+5+1;
3423#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003424 else if (*p<100000)
3425 repsize += 2+5+1;
3426 else if (*p<1000000)
3427 repsize += 2+6+1;
3428 else
3429 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003430#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003431 }
3432 requiredsize = respos+repsize+(endp-collend);
3433 if (requiredsize > ressize) {
3434 if (requiredsize<2*ressize)
3435 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003436 if (PyBytes_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003437 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003438 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003439 ressize = requiredsize;
3440 }
3441 /* generate replacement (temporarily (mis)uses p) */
3442 for (p = collstart; p < collend; ++p) {
3443 str += sprintf(str, "&#%d;", (int)*p);
3444 }
3445 p = collend;
3446 break;
3447 default:
3448 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3449 encoding, reason, startp, size, &exc,
3450 collstart-startp, collend-startp, &newpos);
3451 if (repunicode == NULL)
3452 goto onError;
3453 /* need more space? (at least enough for what we
3454 have+the replacement+the rest of the string, so
3455 we won't have to check space for encodable characters) */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003456 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003457 repsize = PyUnicode_GET_SIZE(repunicode);
3458 requiredsize = respos+repsize+(endp-collend);
3459 if (requiredsize > ressize) {
3460 if (requiredsize<2*ressize)
3461 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003462 if (PyBytes_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003463 Py_DECREF(repunicode);
3464 goto onError;
3465 }
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003466 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003467 ressize = requiredsize;
3468 }
3469 /* check if there is anything unencodable in the replacement
3470 and copy it to the output */
3471 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3472 c = *uni2;
3473 if (c >= limit) {
3474 raise_encode_exception(&exc, encoding, startp, size,
3475 unicodepos, unicodepos+1, reason);
3476 Py_DECREF(repunicode);
3477 goto onError;
3478 }
3479 *str = (char)c;
3480 }
3481 p = startp + newpos;
3482 Py_DECREF(repunicode);
3483 }
3484 }
3485 }
3486 /* Resize if we allocated to much */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003487 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003488 if (respos<ressize)
3489 /* If this falls res will be NULL */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003490 PyBytes_Resize(res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003491 Py_XDECREF(errorHandler);
3492 Py_XDECREF(exc);
3493 return res;
3494
3495 onError:
3496 Py_XDECREF(res);
3497 Py_XDECREF(errorHandler);
3498 Py_XDECREF(exc);
3499 return NULL;
3500}
3501
Guido van Rossumd57fd912000-03-10 22:53:23 +00003502PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003503 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003504 const char *errors)
3505{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003506 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003507}
3508
3509PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3510{
3511 if (!PyUnicode_Check(unicode)) {
3512 PyErr_BadArgument();
3513 return NULL;
3514 }
3515 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3516 PyUnicode_GET_SIZE(unicode),
3517 NULL);
3518}
3519
3520/* --- 7-bit ASCII Codec -------------------------------------------------- */
3521
Guido van Rossumd57fd912000-03-10 22:53:23 +00003522PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003523 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003524 const char *errors)
3525{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003526 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003527 PyUnicodeObject *v;
3528 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003529 Py_ssize_t startinpos;
3530 Py_ssize_t endinpos;
3531 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003532 const char *e;
3533 PyObject *errorHandler = NULL;
3534 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003535
Guido van Rossumd57fd912000-03-10 22:53:23 +00003536 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003537 if (size == 1 && *(unsigned char*)s < 128) {
3538 Py_UNICODE r = *(unsigned char*)s;
3539 return PyUnicode_FromUnicode(&r, 1);
3540 }
Tim Petersced69f82003-09-16 20:30:58 +00003541
Guido van Rossumd57fd912000-03-10 22:53:23 +00003542 v = _PyUnicode_New(size);
3543 if (v == NULL)
3544 goto onError;
3545 if (size == 0)
3546 return (PyObject *)v;
3547 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003548 e = s + size;
3549 while (s < e) {
3550 register unsigned char c = (unsigned char)*s;
3551 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003552 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003553 ++s;
3554 }
3555 else {
3556 startinpos = s-starts;
3557 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003558 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003559 if (unicode_decode_call_errorhandler(
3560 errors, &errorHandler,
3561 "ascii", "ordinal not in range(128)",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003562 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003563 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003564 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003565 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003566 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00003567 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003568 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003569 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003570 Py_XDECREF(errorHandler);
3571 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003572 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003573
Guido van Rossumd57fd912000-03-10 22:53:23 +00003574 onError:
3575 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003576 Py_XDECREF(errorHandler);
3577 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003578 return NULL;
3579}
3580
Guido van Rossumd57fd912000-03-10 22:53:23 +00003581PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003582 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003583 const char *errors)
3584{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003585 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003586}
3587
3588PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3589{
3590 if (!PyUnicode_Check(unicode)) {
3591 PyErr_BadArgument();
3592 return NULL;
3593 }
3594 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3595 PyUnicode_GET_SIZE(unicode),
3596 NULL);
3597}
3598
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003599#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003600
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003601/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003602
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003603#if SIZEOF_INT < SIZEOF_SSIZE_T
3604#define NEED_RETRY
3605#endif
3606
3607/* XXX This code is limited to "true" double-byte encodings, as
3608 a) it assumes an incomplete character consists of a single byte, and
3609 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3610 encodings, see IsDBCSLeadByteEx documentation. */
3611
/* Return 1 if the byte at S[OFFSET] should be treated as a DBCS lead
   byte, 0 otherwise.

   The byte must satisfy IsDBCSLeadByte().  It is then accepted when
   CharPrev() cannot step back from it, when the character CharPrev()
   finds does not start with a lead byte, or when that previous character
   is two bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
3622
3623/*
3624 * Decode MBCS string into unicode object. If 'final' is set, converts
3625 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3626 */
3627static int decode_mbcs(PyUnicodeObject **v,
3628 const char *s, /* MBCS string */
3629 int size, /* sizeof MBCS string */
3630 int final)
3631{
3632 Py_UNICODE *p;
3633 Py_ssize_t n = 0;
3634 int usize = 0;
3635
3636 assert(size >= 0);
3637
3638 /* Skip trailing lead-byte unless 'final' is set */
3639 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3640 --size;
3641
3642 /* First get the size of the result */
3643 if (size > 0) {
3644 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3645 if (usize == 0) {
3646 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3647 return -1;
3648 }
3649 }
3650
3651 if (*v == NULL) {
3652 /* Create unicode object */
3653 *v = _PyUnicode_New(usize);
3654 if (*v == NULL)
3655 return -1;
3656 }
3657 else {
3658 /* Extend unicode object */
3659 n = PyUnicode_GET_SIZE(*v);
3660 if (_PyUnicode_Resize(v, n + usize) < 0)
3661 return -1;
3662 }
3663
3664 /* Do the conversion */
3665 if (size > 0) {
3666 p = PyUnicode_AS_UNICODE(*v) + n;
3667 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3668 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3669 return -1;
3670 }
3671 }
3672
3673 return size;
3674}
3675
3676PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3677 Py_ssize_t size,
3678 const char *errors,
3679 Py_ssize_t *consumed)
3680{
3681 PyUnicodeObject *v = NULL;
3682 int done;
3683
3684 if (consumed)
3685 *consumed = 0;
3686
3687#ifdef NEED_RETRY
3688 retry:
3689 if (size > INT_MAX)
3690 done = decode_mbcs(&v, s, INT_MAX, 0);
3691 else
3692#endif
3693 done = decode_mbcs(&v, s, (int)size, !consumed);
3694
3695 if (done < 0) {
3696 Py_XDECREF(v);
3697 return NULL;
3698 }
3699
3700 if (consumed)
3701 *consumed += done;
3702
3703#ifdef NEED_RETRY
3704 if (size > INT_MAX) {
3705 s += done;
3706 size -= done;
3707 goto retry;
3708 }
3709#endif
3710
3711 return (PyObject *)v;
3712}
3713
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003714PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003715 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003716 const char *errors)
3717{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003718 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3719}
3720
3721/*
3722 * Convert unicode into string object (MBCS).
3723 * Returns 0 if succeed, -1 otherwise.
3724 */
3725static int encode_mbcs(PyObject **repr,
3726 const Py_UNICODE *p, /* unicode */
3727 int size) /* size of unicode */
3728{
3729 int mbcssize = 0;
3730 Py_ssize_t n = 0;
3731
3732 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003733
3734 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003735 if (size > 0) {
3736 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3737 if (mbcssize == 0) {
3738 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3739 return -1;
3740 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003741 }
3742
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003743 if (*repr == NULL) {
3744 /* Create string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003745 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003746 if (*repr == NULL)
3747 return -1;
3748 }
3749 else {
3750 /* Extend string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003751 n = PyBytes_Size(*repr);
3752 if (PyBytes_Resize(*repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003753 return -1;
3754 }
3755
3756 /* Do the conversion */
3757 if (size > 0) {
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003758 char *s = PyBytes_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003759 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3760 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3761 return -1;
3762 }
3763 }
3764
3765 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003766}
3767
3768PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003769 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003770 const char *errors)
3771{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003772 PyObject *repr = NULL;
3773 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003774
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003775#ifdef NEED_RETRY
3776 retry:
3777 if (size > INT_MAX)
3778 ret = encode_mbcs(&repr, p, INT_MAX);
3779 else
3780#endif
3781 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003782
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003783 if (ret < 0) {
3784 Py_XDECREF(repr);
3785 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003786 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003787
3788#ifdef NEED_RETRY
3789 if (size > INT_MAX) {
3790 p += INT_MAX;
3791 size -= INT_MAX;
3792 goto retry;
3793 }
3794#endif
3795
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003796 return repr;
3797}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003798
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003799PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3800{
3801 if (!PyUnicode_Check(unicode)) {
3802 PyErr_BadArgument();
3803 return NULL;
3804 }
3805 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3806 PyUnicode_GET_SIZE(unicode),
3807 NULL);
3808}
3809
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003810#undef NEED_RETRY
3811
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003812#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003813
Guido van Rossumd57fd912000-03-10 22:53:23 +00003814/* --- Character Mapping Codec -------------------------------------------- */
3815
Guido van Rossumd57fd912000-03-10 22:53:23 +00003816PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003817 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003818 PyObject *mapping,
3819 const char *errors)
3820{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003821 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003822 Py_ssize_t startinpos;
3823 Py_ssize_t endinpos;
3824 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003825 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003826 PyUnicodeObject *v;
3827 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003828 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003829 PyObject *errorHandler = NULL;
3830 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003831 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003832 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003833
Guido van Rossumd57fd912000-03-10 22:53:23 +00003834 /* Default to Latin-1 */
3835 if (mapping == NULL)
3836 return PyUnicode_DecodeLatin1(s, size, errors);
3837
3838 v = _PyUnicode_New(size);
3839 if (v == NULL)
3840 goto onError;
3841 if (size == 0)
3842 return (PyObject *)v;
3843 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003844 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003845 if (PyUnicode_CheckExact(mapping)) {
3846 mapstring = PyUnicode_AS_UNICODE(mapping);
3847 maplen = PyUnicode_GET_SIZE(mapping);
3848 while (s < e) {
3849 unsigned char ch = *s;
3850 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003851
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003852 if (ch < maplen)
3853 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003854
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003855 if (x == 0xfffe) {
3856 /* undefined mapping */
3857 outpos = p-PyUnicode_AS_UNICODE(v);
3858 startinpos = s-starts;
3859 endinpos = startinpos+1;
3860 if (unicode_decode_call_errorhandler(
3861 errors, &errorHandler,
3862 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003863 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003864 (PyObject **)&v, &outpos, &p)) {
3865 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003866 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003867 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003868 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003869 *p++ = x;
3870 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003871 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003872 }
3873 else {
3874 while (s < e) {
3875 unsigned char ch = *s;
3876 PyObject *w, *x;
3877
3878 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3879 w = PyInt_FromLong((long)ch);
3880 if (w == NULL)
3881 goto onError;
3882 x = PyObject_GetItem(mapping, w);
3883 Py_DECREF(w);
3884 if (x == NULL) {
3885 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3886 /* No mapping found means: mapping is undefined. */
3887 PyErr_Clear();
3888 x = Py_None;
3889 Py_INCREF(x);
3890 } else
3891 goto onError;
3892 }
3893
3894 /* Apply mapping */
3895 if (PyInt_Check(x)) {
3896 long value = PyInt_AS_LONG(x);
3897 if (value < 0 || value > 65535) {
3898 PyErr_SetString(PyExc_TypeError,
3899 "character mapping must be in range(65536)");
3900 Py_DECREF(x);
3901 goto onError;
3902 }
3903 *p++ = (Py_UNICODE)value;
3904 }
3905 else if (x == Py_None) {
3906 /* undefined mapping */
3907 outpos = p-PyUnicode_AS_UNICODE(v);
3908 startinpos = s-starts;
3909 endinpos = startinpos+1;
3910 if (unicode_decode_call_errorhandler(
3911 errors, &errorHandler,
3912 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003913 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003914 (PyObject **)&v, &outpos, &p)) {
3915 Py_DECREF(x);
3916 goto onError;
3917 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003918 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003919 continue;
3920 }
3921 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003922 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003923
3924 if (targetsize == 1)
3925 /* 1-1 mapping */
3926 *p++ = *PyUnicode_AS_UNICODE(x);
3927
3928 else if (targetsize > 1) {
3929 /* 1-n mapping */
3930 if (targetsize > extrachars) {
3931 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003932 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3933 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003934 (targetsize << 2);
3935 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00003936 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003937 if (_PyUnicode_Resize(&v,
3938 PyUnicode_GET_SIZE(v) + needed) < 0) {
3939 Py_DECREF(x);
3940 goto onError;
3941 }
3942 p = PyUnicode_AS_UNICODE(v) + oldpos;
3943 }
3944 Py_UNICODE_COPY(p,
3945 PyUnicode_AS_UNICODE(x),
3946 targetsize);
3947 p += targetsize;
3948 extrachars -= targetsize;
3949 }
3950 /* 1-0 mapping: skip the character */
3951 }
3952 else {
3953 /* wrong return value */
3954 PyErr_SetString(PyExc_TypeError,
3955 "character mapping must return integer, None or unicode");
3956 Py_DECREF(x);
3957 goto onError;
3958 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003959 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003960 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003961 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003962 }
3963 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003964 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003965 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003966 Py_XDECREF(errorHandler);
3967 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003968 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003969
Guido van Rossumd57fd912000-03-10 22:53:23 +00003970 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003971 Py_XDECREF(errorHandler);
3972 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003973 Py_XDECREF(v);
3974 return NULL;
3975}
3976
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003977/* Charmap encoding: the lookup table */
3978
3979struct encoding_map{
3980 PyObject_HEAD
3981 unsigned char level1[32];
3982 int count2, count3;
3983 unsigned char level23[1];
3984};
3985
3986static PyObject*
3987encoding_map_size(PyObject *obj, PyObject* args)
3988{
3989 struct encoding_map *map = (struct encoding_map*)obj;
3990 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3991 128*map->count3);
3992}
3993
3994static PyMethodDef encoding_map_methods[] = {
3995 {"size", encoding_map_size, METH_NOARGS,
3996 PyDoc_STR("Return the size (in bytes) of this object") },
3997 { 0 }
3998};
3999
4000static void
4001encoding_map_dealloc(PyObject* o)
4002{
4003 PyObject_FREE(o);
4004}
4005
4006static PyTypeObject EncodingMapType = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004007 PyVarObject_HEAD_INIT(NULL, 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004008 "EncodingMap", /*tp_name*/
4009 sizeof(struct encoding_map), /*tp_basicsize*/
4010 0, /*tp_itemsize*/
4011 /* methods */
4012 encoding_map_dealloc, /*tp_dealloc*/
4013 0, /*tp_print*/
4014 0, /*tp_getattr*/
4015 0, /*tp_setattr*/
4016 0, /*tp_compare*/
4017 0, /*tp_repr*/
4018 0, /*tp_as_number*/
4019 0, /*tp_as_sequence*/
4020 0, /*tp_as_mapping*/
4021 0, /*tp_hash*/
4022 0, /*tp_call*/
4023 0, /*tp_str*/
4024 0, /*tp_getattro*/
4025 0, /*tp_setattro*/
4026 0, /*tp_as_buffer*/
4027 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4028 0, /*tp_doc*/
4029 0, /*tp_traverse*/
4030 0, /*tp_clear*/
4031 0, /*tp_richcompare*/
4032 0, /*tp_weaklistoffset*/
4033 0, /*tp_iter*/
4034 0, /*tp_iternext*/
4035 encoding_map_methods, /*tp_methods*/
4036 0, /*tp_members*/
4037 0, /*tp_getset*/
4038 0, /*tp_base*/
4039 0, /*tp_dict*/
4040 0, /*tp_descr_get*/
4041 0, /*tp_descr_set*/
4042 0, /*tp_dictoffset*/
4043 0, /*tp_init*/
4044 0, /*tp_alloc*/
4045 0, /*tp_new*/
4046 0, /*tp_free*/
4047 0, /*tp_is_gc*/
4048};
4049
4050PyObject*
4051PyUnicode_BuildEncodingMap(PyObject* string)
4052{
4053 Py_UNICODE *decode;
4054 PyObject *result;
4055 struct encoding_map *mresult;
4056 int i;
4057 int need_dict = 0;
4058 unsigned char level1[32];
4059 unsigned char level2[512];
4060 unsigned char *mlevel1, *mlevel2, *mlevel3;
4061 int count2 = 0, count3 = 0;
4062
4063 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4064 PyErr_BadArgument();
4065 return NULL;
4066 }
4067 decode = PyUnicode_AS_UNICODE(string);
4068 memset(level1, 0xFF, sizeof level1);
4069 memset(level2, 0xFF, sizeof level2);
4070
4071 /* If there isn't a one-to-one mapping of NULL to \0,
4072 or if there are non-BMP characters, we need to use
4073 a mapping dictionary. */
4074 if (decode[0] != 0)
4075 need_dict = 1;
4076 for (i = 1; i < 256; i++) {
4077 int l1, l2;
4078 if (decode[i] == 0
4079 #ifdef Py_UNICODE_WIDE
4080 || decode[i] > 0xFFFF
4081 #endif
4082 ) {
4083 need_dict = 1;
4084 break;
4085 }
4086 if (decode[i] == 0xFFFE)
4087 /* unmapped character */
4088 continue;
4089 l1 = decode[i] >> 11;
4090 l2 = decode[i] >> 7;
4091 if (level1[l1] == 0xFF)
4092 level1[l1] = count2++;
4093 if (level2[l2] == 0xFF)
4094 level2[l2] = count3++;
4095 }
4096
4097 if (count2 >= 0xFF || count3 >= 0xFF)
4098 need_dict = 1;
4099
4100 if (need_dict) {
4101 PyObject *result = PyDict_New();
4102 PyObject *key, *value;
4103 if (!result)
4104 return NULL;
4105 for (i = 0; i < 256; i++) {
4106 key = value = NULL;
4107 key = PyInt_FromLong(decode[i]);
4108 value = PyInt_FromLong(i);
4109 if (!key || !value)
4110 goto failed1;
4111 if (PyDict_SetItem(result, key, value) == -1)
4112 goto failed1;
4113 Py_DECREF(key);
4114 Py_DECREF(value);
4115 }
4116 return result;
4117 failed1:
4118 Py_XDECREF(key);
4119 Py_XDECREF(value);
4120 Py_DECREF(result);
4121 return NULL;
4122 }
4123
4124 /* Create a three-level trie */
4125 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4126 16*count2 + 128*count3 - 1);
4127 if (!result)
4128 return PyErr_NoMemory();
4129 PyObject_Init(result, &EncodingMapType);
4130 mresult = (struct encoding_map*)result;
4131 mresult->count2 = count2;
4132 mresult->count3 = count3;
4133 mlevel1 = mresult->level1;
4134 mlevel2 = mresult->level23;
4135 mlevel3 = mresult->level23 + 16*count2;
4136 memcpy(mlevel1, level1, 32);
4137 memset(mlevel2, 0xFF, 16*count2);
4138 memset(mlevel3, 0, 128*count3);
4139 count3 = 0;
4140 for (i = 1; i < 256; i++) {
4141 int o1, o2, o3, i2, i3;
4142 if (decode[i] == 0xFFFE)
4143 /* unmapped character */
4144 continue;
4145 o1 = decode[i]>>11;
4146 o2 = (decode[i]>>7) & 0xF;
4147 i2 = 16*mlevel1[o1] + o2;
4148 if (mlevel2[i2] == 0xFF)
4149 mlevel2[i2] = count3++;
4150 o3 = decode[i] & 0x7F;
4151 i3 = 128*mlevel2[i2] + o3;
4152 mlevel3[i3] = i;
4153 }
4154 return result;
4155}
4156
4157static int
4158encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4159{
4160 struct encoding_map *map = (struct encoding_map*)mapping;
4161 int l1 = c>>11;
4162 int l2 = (c>>7) & 0xF;
4163 int l3 = c & 0x7F;
4164 int i;
4165
4166#ifdef Py_UNICODE_WIDE
4167 if (c > 0xFFFF) {
4168 return -1;
4169 }
4170#endif
4171 if (c == 0)
4172 return 0;
4173 /* level 1*/
4174 i = map->level1[l1];
4175 if (i == 0xFF) {
4176 return -1;
4177 }
4178 /* level 2*/
4179 i = map->level23[16*i+l2];
4180 if (i == 0xFF) {
4181 return -1;
4182 }
4183 /* level 3 */
4184 i = map->level23[16*map->count2 + 128*i + l3];
4185 if (i == 0) {
4186 return -1;
4187 }
4188 return i;
4189}
4190
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004191/* Lookup the character ch in the mapping. If the character
4192 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004193 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004194static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004195{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004196 PyObject *w = PyInt_FromLong((long)c);
4197 PyObject *x;
4198
4199 if (w == NULL)
4200 return NULL;
4201 x = PyObject_GetItem(mapping, w);
4202 Py_DECREF(w);
4203 if (x == NULL) {
4204 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4205 /* No mapping found means: mapping is undefined. */
4206 PyErr_Clear();
4207 x = Py_None;
4208 Py_INCREF(x);
4209 return x;
4210 } else
4211 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004212 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004213 else if (x == Py_None)
4214 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004215 else if (PyInt_Check(x)) {
4216 long value = PyInt_AS_LONG(x);
4217 if (value < 0 || value > 255) {
4218 PyErr_SetString(PyExc_TypeError,
4219 "character mapping must be in range(256)");
4220 Py_DECREF(x);
4221 return NULL;
4222 }
4223 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004224 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004225 else if (PyString_Check(x))
4226 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004227 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004228 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00004229 PyErr_Format(PyExc_TypeError,
4230 "character mapping must return integer, None or str8, not %.400s",
4231 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004232 Py_DECREF(x);
4233 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004234 }
4235}
4236
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004237static int
Walter Dörwald827b0552007-05-12 13:23:53 +00004238charmapencode_resize(PyObject *outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004239{
Walter Dörwald827b0552007-05-12 13:23:53 +00004240 Py_ssize_t outsize = PyBytes_GET_SIZE( outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004241 /* exponentially overallocate to minimize reallocations */
4242 if (requiredsize < 2*outsize)
4243 requiredsize = 2*outsize;
Walter Dörwald827b0552007-05-12 13:23:53 +00004244 if (PyBytes_Resize(outobj, requiredsize)) {
4245 Py_DECREF(outobj);
4246 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004247 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004248 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004249}
4250
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was encoded and written */
    enc_FAILED,         /* the character has no mapping; nothing written */
    enc_EXCEPTION       /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004254/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00004255 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004256 space is available. Return a new reference to the object that
4257 was put in the output buffer, or Py_None, if the mapping was undefined
4258 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004259 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004260static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004261charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Walter Dörwald827b0552007-05-12 13:23:53 +00004262 PyObject *outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004263{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004264 PyObject *rep;
4265 char *outstart;
Walter Dörwald827b0552007-05-12 13:23:53 +00004266 Py_ssize_t outsize = PyBytes_GET_SIZE(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004267
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004268 if (Py_Type(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004269 int res = encoding_map_lookup(c, mapping);
4270 Py_ssize_t requiredsize = *outpos+1;
4271 if (res == -1)
4272 return enc_FAILED;
4273 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004274 if (charmapencode_resize(outobj, outpos, requiredsize))
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004275 return enc_EXCEPTION;
Walter Dörwald827b0552007-05-12 13:23:53 +00004276 outstart = PyBytes_AS_STRING(outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004277 outstart[(*outpos)++] = (char)res;
4278 return enc_SUCCESS;
4279 }
4280
4281 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004282 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004283 return enc_EXCEPTION;
4284 else if (rep==Py_None) {
4285 Py_DECREF(rep);
4286 return enc_FAILED;
4287 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004288 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004289 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004290 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004291 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004292 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004293 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004294 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004295 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004296 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4297 }
4298 else {
4299 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004300 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4301 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004302 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004303 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004304 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004305 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004306 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004307 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004308 memcpy(outstart + *outpos, repchars, repsize);
4309 *outpos += repsize;
4310 }
4311 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004312 Py_DECREF(rep);
4313 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004314}
4315
4316/* handle an error in PyUnicode_EncodeCharmap
4317 Return 0 on success, -1 on error */
4318static
4319int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004320 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004321 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004322 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004323 PyObject *res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004324{
4325 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004326 Py_ssize_t repsize;
4327 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004328 Py_UNICODE *uni2;
4329 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004330 Py_ssize_t collstartpos = *inpos;
4331 Py_ssize_t collendpos = *inpos+1;
4332 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004333 char *encoding = "charmap";
4334 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004335 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004336
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004337 /* find all unencodable characters */
4338 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004339 PyObject *rep;
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004340 if (Py_Type(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004341 int res = encoding_map_lookup(p[collendpos], mapping);
4342 if (res != -1)
4343 break;
4344 ++collendpos;
4345 continue;
4346 }
4347
4348 rep = charmapencode_lookup(p[collendpos], mapping);
4349 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004350 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004351 else if (rep!=Py_None) {
4352 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004353 break;
4354 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004355 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004356 ++collendpos;
4357 }
4358 /* cache callback name lookup
4359 * (if not done yet, i.e. it's the first error) */
4360 if (*known_errorHandler==-1) {
4361 if ((errors==NULL) || (!strcmp(errors, "strict")))
4362 *known_errorHandler = 1;
4363 else if (!strcmp(errors, "replace"))
4364 *known_errorHandler = 2;
4365 else if (!strcmp(errors, "ignore"))
4366 *known_errorHandler = 3;
4367 else if (!strcmp(errors, "xmlcharrefreplace"))
4368 *known_errorHandler = 4;
4369 else
4370 *known_errorHandler = 0;
4371 }
4372 switch (*known_errorHandler) {
4373 case 1: /* strict */
4374 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4375 return -1;
4376 case 2: /* replace */
4377 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4378 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004379 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004380 return -1;
4381 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004382 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004383 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4384 return -1;
4385 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004386 }
4387 /* fall through */
4388 case 3: /* ignore */
4389 *inpos = collendpos;
4390 break;
4391 case 4: /* xmlcharrefreplace */
4392 /* generate replacement (temporarily (mis)uses p) */
4393 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4394 char buffer[2+29+1+1];
4395 char *cp;
4396 sprintf(buffer, "&#%d;", (int)p[collpos]);
4397 for (cp = buffer; *cp; ++cp) {
4398 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004399 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004400 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004401 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004402 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4403 return -1;
4404 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004405 }
4406 }
4407 *inpos = collendpos;
4408 break;
4409 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004410 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004411 encoding, reason, p, size, exceptionObject,
4412 collstartpos, collendpos, &newpos);
4413 if (repunicode == NULL)
4414 return -1;
4415 /* generate replacement */
4416 repsize = PyUnicode_GET_SIZE(repunicode);
4417 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4418 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004419 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004420 return -1;
4421 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004422 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004423 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004424 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4425 return -1;
4426 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004427 }
4428 *inpos = newpos;
4429 Py_DECREF(repunicode);
4430 }
4431 return 0;
4432}
4433
Guido van Rossumd57fd912000-03-10 22:53:23 +00004434PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004435 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004436 PyObject *mapping,
4437 const char *errors)
4438{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004439 /* output object */
4440 PyObject *res = NULL;
4441 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004442 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004443 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004444 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004445 PyObject *errorHandler = NULL;
4446 PyObject *exc = NULL;
4447 /* the following variable is used for caching string comparisons
4448 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4449 * 3=ignore, 4=xmlcharrefreplace */
4450 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004451
4452 /* Default to Latin-1 */
4453 if (mapping == NULL)
4454 return PyUnicode_EncodeLatin1(p, size, errors);
4455
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004456 /* allocate enough for a simple encoding without
4457 replacements, if we need more, we'll resize */
Walter Dörwald827b0552007-05-12 13:23:53 +00004458 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004459 if (res == NULL)
4460 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004461 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004462 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004463
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004464 while (inpos<size) {
4465 /* try to encode it */
Walter Dörwald827b0552007-05-12 13:23:53 +00004466 charmapencode_result x = charmapencode_output(p[inpos], mapping, res, &respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004467 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004468 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004469 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004470 if (charmap_encoding_error(p, size, &inpos, mapping,
4471 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004472 &known_errorHandler, &errorHandler, errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004473 res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004474 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004475 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004476 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004477 else
4478 /* done with this character => adjust input position */
4479 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004480 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004481
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004482 /* Resize if we allocated to much */
Walter Dörwald827b0552007-05-12 13:23:53 +00004483 if (respos<PyBytes_GET_SIZE(res)) {
4484 if (PyBytes_Resize(res, respos))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004485 goto onError;
4486 }
4487 Py_XDECREF(exc);
4488 Py_XDECREF(errorHandler);
4489 return res;
4490
4491 onError:
4492 Py_XDECREF(res);
4493 Py_XDECREF(exc);
4494 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004495 return NULL;
4496}
4497
4498PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4499 PyObject *mapping)
4500{
4501 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4502 PyErr_BadArgument();
4503 return NULL;
4504 }
4505 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4506 PyUnicode_GET_SIZE(unicode),
4507 mapping,
4508 NULL);
4509}
4510
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004511/* create or adjust a UnicodeTranslateError */
4512static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004513 const Py_UNICODE *unicode, Py_ssize_t size,
4514 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004515 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004516{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004517 if (*exceptionObject == NULL) {
4518 *exceptionObject = PyUnicodeTranslateError_Create(
4519 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004520 }
4521 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004522 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4523 goto onError;
4524 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4525 goto onError;
4526 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4527 goto onError;
4528 return;
4529 onError:
4530 Py_DECREF(*exceptionObject);
4531 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004532 }
4533}
4534
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004535/* raises a UnicodeTranslateError */
4536static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004537 const Py_UNICODE *unicode, Py_ssize_t size,
4538 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004539 const char *reason)
4540{
4541 make_translate_exception(exceptionObject,
4542 unicode, size, startpos, endpos, reason);
4543 if (*exceptionObject != NULL)
4544 PyCodec_StrictErrors(*exceptionObject);
4545}
4546
4547/* error handling callback helper:
4548 build arguments, call the callback and check the arguments,
4549 put the result into newpos and return the replacement string, which
4550 has to be freed by the caller */
4551static PyObject *unicode_translate_call_errorhandler(const char *errors,
4552 PyObject **errorHandler,
4553 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004554 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4555 Py_ssize_t startpos, Py_ssize_t endpos,
4556 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004557{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004558 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004559
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004560 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004561 PyObject *restuple;
4562 PyObject *resunicode;
4563
4564 if (*errorHandler == NULL) {
4565 *errorHandler = PyCodec_LookupError(errors);
4566 if (*errorHandler == NULL)
4567 return NULL;
4568 }
4569
4570 make_translate_exception(exceptionObject,
4571 unicode, size, startpos, endpos, reason);
4572 if (*exceptionObject == NULL)
4573 return NULL;
4574
4575 restuple = PyObject_CallFunctionObjArgs(
4576 *errorHandler, *exceptionObject, NULL);
4577 if (restuple == NULL)
4578 return NULL;
4579 if (!PyTuple_Check(restuple)) {
4580 PyErr_Format(PyExc_TypeError, &argparse[4]);
4581 Py_DECREF(restuple);
4582 return NULL;
4583 }
4584 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004585 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004586 Py_DECREF(restuple);
4587 return NULL;
4588 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004589 if (i_newpos<0)
4590 *newpos = size+i_newpos;
4591 else
4592 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004593 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004594 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004595 Py_DECREF(restuple);
4596 return NULL;
4597 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004598 Py_INCREF(resunicode);
4599 Py_DECREF(restuple);
4600 return resunicode;
4601}
4602
4603/* Lookup the character ch in the mapping and put the result in result,
4604 which must be decrefed by the caller.
4605 Return 0 on success, -1 on error */
4606static
4607int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4608{
4609 PyObject *w = PyInt_FromLong((long)c);
4610 PyObject *x;
4611
4612 if (w == NULL)
4613 return -1;
4614 x = PyObject_GetItem(mapping, w);
4615 Py_DECREF(w);
4616 if (x == NULL) {
4617 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4618 /* No mapping found means: use 1:1 mapping. */
4619 PyErr_Clear();
4620 *result = NULL;
4621 return 0;
4622 } else
4623 return -1;
4624 }
4625 else if (x == Py_None) {
4626 *result = x;
4627 return 0;
4628 }
4629 else if (PyInt_Check(x)) {
4630 long value = PyInt_AS_LONG(x);
4631 long max = PyUnicode_GetMax();
4632 if (value < 0 || value > max) {
4633 PyErr_Format(PyExc_TypeError,
4634 "character mapping must be in range(0x%lx)", max+1);
4635 Py_DECREF(x);
4636 return -1;
4637 }
4638 *result = x;
4639 return 0;
4640 }
4641 else if (PyUnicode_Check(x)) {
4642 *result = x;
4643 return 0;
4644 }
4645 else {
4646 /* wrong return value */
4647 PyErr_SetString(PyExc_TypeError,
4648 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004649 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004650 return -1;
4651 }
4652}
4653/* ensure that *outobj is at least requiredsize characters long,
4654if not reallocate and adjust various state variables.
4655Return 0 on success, -1 on error */
4656static
Walter Dörwald4894c302003-10-24 14:25:28 +00004657int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004658 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004659{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004660 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004661 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004662 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004663 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004664 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004665 if (requiredsize < 2 * oldsize)
4666 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004667 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004668 return -1;
4669 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004670 }
4671 return 0;
4672}
4673/* lookup the character, put the result in the output string and adjust
4674 various state variables. Return a new reference to the object that
4675 was put in the output buffer in *result, or Py_None, if the mapping was
4676 undefined (in which case no character was written).
4677 The called must decref result.
4678 Return 0 on success, -1 on error. */
4679static
Walter Dörwald4894c302003-10-24 14:25:28 +00004680int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004681 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004682 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004683{
Walter Dörwald4894c302003-10-24 14:25:28 +00004684 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004685 return -1;
4686 if (*res==NULL) {
4687 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004688 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004689 }
4690 else if (*res==Py_None)
4691 ;
4692 else if (PyInt_Check(*res)) {
4693 /* no overflow check, because we know that the space is enough */
4694 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4695 }
4696 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004697 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004698 if (repsize==1) {
4699 /* no overflow check, because we know that the space is enough */
4700 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4701 }
4702 else if (repsize!=0) {
4703 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004704 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004705 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004706 repsize - 1;
4707 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004708 return -1;
4709 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4710 *outp += repsize;
4711 }
4712 }
4713 else
4714 return -1;
4715 return 0;
4716}
4717
4718PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004719 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004720 PyObject *mapping,
4721 const char *errors)
4722{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004723 /* output object */
4724 PyObject *res = NULL;
4725 /* pointers to the beginning and end+1 of input */
4726 const Py_UNICODE *startp = p;
4727 const Py_UNICODE *endp = p + size;
4728 /* pointer into the output */
4729 Py_UNICODE *str;
4730 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004731 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004732 char *reason = "character maps to <undefined>";
4733 PyObject *errorHandler = NULL;
4734 PyObject *exc = NULL;
4735 /* the following variable is used for caching string comparisons
4736 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4737 * 3=ignore, 4=xmlcharrefreplace */
4738 int known_errorHandler = -1;
4739
Guido van Rossumd57fd912000-03-10 22:53:23 +00004740 if (mapping == NULL) {
4741 PyErr_BadArgument();
4742 return NULL;
4743 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004744
4745 /* allocate enough for a simple 1:1 translation without
4746 replacements, if we need more, we'll resize */
4747 res = PyUnicode_FromUnicode(NULL, size);
4748 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004749 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004750 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004751 return res;
4752 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004753
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004754 while (p<endp) {
4755 /* try to encode it */
4756 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004757 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004758 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004759 goto onError;
4760 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004761 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004762 if (x!=Py_None) /* it worked => adjust input pointer */
4763 ++p;
4764 else { /* untranslatable character */
4765 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004766 Py_ssize_t repsize;
4767 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004768 Py_UNICODE *uni2;
4769 /* startpos for collecting untranslatable chars */
4770 const Py_UNICODE *collstart = p;
4771 const Py_UNICODE *collend = p+1;
4772 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004773
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004774 /* find all untranslatable characters */
4775 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004776 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004777 goto onError;
4778 Py_XDECREF(x);
4779 if (x!=Py_None)
4780 break;
4781 ++collend;
4782 }
4783 /* cache callback name lookup
4784 * (if not done yet, i.e. it's the first error) */
4785 if (known_errorHandler==-1) {
4786 if ((errors==NULL) || (!strcmp(errors, "strict")))
4787 known_errorHandler = 1;
4788 else if (!strcmp(errors, "replace"))
4789 known_errorHandler = 2;
4790 else if (!strcmp(errors, "ignore"))
4791 known_errorHandler = 3;
4792 else if (!strcmp(errors, "xmlcharrefreplace"))
4793 known_errorHandler = 4;
4794 else
4795 known_errorHandler = 0;
4796 }
4797 switch (known_errorHandler) {
4798 case 1: /* strict */
4799 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4800 goto onError;
4801 case 2: /* replace */
4802 /* No need to check for space, this is a 1:1 replacement */
4803 for (coll = collstart; coll<collend; ++coll)
4804 *str++ = '?';
4805 /* fall through */
4806 case 3: /* ignore */
4807 p = collend;
4808 break;
4809 case 4: /* xmlcharrefreplace */
4810 /* generate replacement (temporarily (mis)uses p) */
4811 for (p = collstart; p < collend; ++p) {
4812 char buffer[2+29+1+1];
4813 char *cp;
4814 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004815 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004816 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4817 goto onError;
4818 for (cp = buffer; *cp; ++cp)
4819 *str++ = *cp;
4820 }
4821 p = collend;
4822 break;
4823 default:
4824 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4825 reason, startp, size, &exc,
4826 collstart-startp, collend-startp, &newpos);
4827 if (repunicode == NULL)
4828 goto onError;
4829 /* generate replacement */
4830 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004831 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004832 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4833 Py_DECREF(repunicode);
4834 goto onError;
4835 }
4836 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4837 *str++ = *uni2;
4838 p = startp + newpos;
4839 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004840 }
4841 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004842 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004843 /* Resize if we allocated to much */
4844 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004845 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004846 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004847 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004848 }
4849 Py_XDECREF(exc);
4850 Py_XDECREF(errorHandler);
4851 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004852
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004853 onError:
4854 Py_XDECREF(res);
4855 Py_XDECREF(exc);
4856 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004857 return NULL;
4858}
4859
4860PyObject *PyUnicode_Translate(PyObject *str,
4861 PyObject *mapping,
4862 const char *errors)
4863{
4864 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004865
Guido van Rossumd57fd912000-03-10 22:53:23 +00004866 str = PyUnicode_FromObject(str);
4867 if (str == NULL)
4868 goto onError;
4869 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4870 PyUnicode_GET_SIZE(str),
4871 mapping,
4872 errors);
4873 Py_DECREF(str);
4874 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004875
Guido van Rossumd57fd912000-03-10 22:53:23 +00004876 onError:
4877 Py_XDECREF(str);
4878 return NULL;
4879}
Tim Petersced69f82003-09-16 20:30:58 +00004880
Guido van Rossum9e896b32000-04-05 20:11:21 +00004881/* --- Decimal Encoder ---------------------------------------------------- */
4882
4883int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004884 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004885 char *output,
4886 const char *errors)
4887{
4888 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004889 PyObject *errorHandler = NULL;
4890 PyObject *exc = NULL;
4891 const char *encoding = "decimal";
4892 const char *reason = "invalid decimal Unicode string";
4893 /* the following variable is used for caching string comparisons
4894 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4895 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004896
4897 if (output == NULL) {
4898 PyErr_BadArgument();
4899 return -1;
4900 }
4901
4902 p = s;
4903 end = s + length;
4904 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004905 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004906 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004907 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004908 Py_ssize_t repsize;
4909 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004910 Py_UNICODE *uni2;
4911 Py_UNICODE *collstart;
4912 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004913
Guido van Rossum9e896b32000-04-05 20:11:21 +00004914 if (Py_UNICODE_ISSPACE(ch)) {
4915 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004916 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004917 continue;
4918 }
4919 decimal = Py_UNICODE_TODECIMAL(ch);
4920 if (decimal >= 0) {
4921 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004922 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004923 continue;
4924 }
Guido van Rossumba477042000-04-06 18:18:10 +00004925 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004926 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004927 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004928 continue;
4929 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004930 /* All other characters are considered unencodable */
4931 collstart = p;
4932 collend = p+1;
4933 while (collend < end) {
4934 if ((0 < *collend && *collend < 256) ||
4935 !Py_UNICODE_ISSPACE(*collend) ||
4936 Py_UNICODE_TODECIMAL(*collend))
4937 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004938 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004939 /* cache callback name lookup
4940 * (if not done yet, i.e. it's the first error) */
4941 if (known_errorHandler==-1) {
4942 if ((errors==NULL) || (!strcmp(errors, "strict")))
4943 known_errorHandler = 1;
4944 else if (!strcmp(errors, "replace"))
4945 known_errorHandler = 2;
4946 else if (!strcmp(errors, "ignore"))
4947 known_errorHandler = 3;
4948 else if (!strcmp(errors, "xmlcharrefreplace"))
4949 known_errorHandler = 4;
4950 else
4951 known_errorHandler = 0;
4952 }
4953 switch (known_errorHandler) {
4954 case 1: /* strict */
4955 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4956 goto onError;
4957 case 2: /* replace */
4958 for (p = collstart; p < collend; ++p)
4959 *output++ = '?';
4960 /* fall through */
4961 case 3: /* ignore */
4962 p = collend;
4963 break;
4964 case 4: /* xmlcharrefreplace */
4965 /* generate replacement (temporarily (mis)uses p) */
4966 for (p = collstart; p < collend; ++p)
4967 output += sprintf(output, "&#%d;", (int)*p);
4968 p = collend;
4969 break;
4970 default:
4971 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4972 encoding, reason, s, length, &exc,
4973 collstart-s, collend-s, &newpos);
4974 if (repunicode == NULL)
4975 goto onError;
4976 /* generate replacement */
4977 repsize = PyUnicode_GET_SIZE(repunicode);
4978 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4979 Py_UNICODE ch = *uni2;
4980 if (Py_UNICODE_ISSPACE(ch))
4981 *output++ = ' ';
4982 else {
4983 decimal = Py_UNICODE_TODECIMAL(ch);
4984 if (decimal >= 0)
4985 *output++ = '0' + decimal;
4986 else if (0 < ch && ch < 256)
4987 *output++ = (char)ch;
4988 else {
4989 Py_DECREF(repunicode);
4990 raise_encode_exception(&exc, encoding,
4991 s, length, collstart-s, collend-s, reason);
4992 goto onError;
4993 }
4994 }
4995 }
4996 p = s + newpos;
4997 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004998 }
4999 }
5000 /* 0-terminate the output string */
5001 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005002 Py_XDECREF(exc);
5003 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005004 return 0;
5005
5006 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005007 Py_XDECREF(exc);
5008 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005009 return -1;
5010}
5011
Guido van Rossumd57fd912000-03-10 22:53:23 +00005012/* --- Helpers ------------------------------------------------------------ */
5013
Eric Smith8c663262007-08-25 02:26:07 +00005014#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005015
5016#include "stringlib/fastsearch.h"
5017
5018#include "stringlib/count.h"
5019#include "stringlib/find.h"
5020#include "stringlib/partition.h"
5021
/* Normalize the local slice bounds `start` and `end` in place against
   (obj)->length: a negative bound is taken relative to the end and then
   clamped to 0, and `end` is capped at the length.  `start` is not
   capped at the length.  Expands to plain statements that use the
   variables `start` and `end` of the enclosing scope. */
#define FIX_START_END(obj)                      \
    if (start < 0) {                            \
        start += (obj)->length;                 \
        if (start < 0)                          \
            start = 0;                          \
    }                                           \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    if (end < 0) {                              \
        end += (obj)->length;                   \
        if (end < 0)                            \
            end = 0;                            \
    }
5034
Martin v. Löwis18e16552006-02-15 17:27:45 +00005035Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005036 PyObject *substr,
5037 Py_ssize_t start,
5038 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005039{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005040 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005041 PyUnicodeObject* str_obj;
5042 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005043
Thomas Wouters477c8d52006-05-27 19:21:47 +00005044 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5045 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005046 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005047 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5048 if (!sub_obj) {
5049 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005050 return -1;
5051 }
Tim Petersced69f82003-09-16 20:30:58 +00005052
Thomas Wouters477c8d52006-05-27 19:21:47 +00005053 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005054
Thomas Wouters477c8d52006-05-27 19:21:47 +00005055 result = stringlib_count(
5056 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5057 );
5058
5059 Py_DECREF(sub_obj);
5060 Py_DECREF(str_obj);
5061
Guido van Rossumd57fd912000-03-10 22:53:23 +00005062 return result;
5063}
5064
Martin v. Löwis18e16552006-02-15 17:27:45 +00005065Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005066 PyObject *sub,
5067 Py_ssize_t start,
5068 Py_ssize_t end,
5069 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005070{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005071 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005072
Guido van Rossumd57fd912000-03-10 22:53:23 +00005073 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005074 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005075 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005076 sub = PyUnicode_FromObject(sub);
5077 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00005078 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005079 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005080 }
Tim Petersced69f82003-09-16 20:30:58 +00005081
Thomas Wouters477c8d52006-05-27 19:21:47 +00005082 if (direction > 0)
5083 result = stringlib_find_slice(
5084 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5085 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5086 start, end
5087 );
5088 else
5089 result = stringlib_rfind_slice(
5090 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5091 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5092 start, end
5093 );
5094
Guido van Rossumd57fd912000-03-10 22:53:23 +00005095 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005096 Py_DECREF(sub);
5097
Guido van Rossumd57fd912000-03-10 22:53:23 +00005098 return result;
5099}
5100
Tim Petersced69f82003-09-16 20:30:58 +00005101static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005102int tailmatch(PyUnicodeObject *self,
5103 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005104 Py_ssize_t start,
5105 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005106 int direction)
5107{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005108 if (substring->length == 0)
5109 return 1;
5110
Thomas Wouters477c8d52006-05-27 19:21:47 +00005111 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005112
5113 end -= substring->length;
5114 if (end < start)
5115 return 0;
5116
5117 if (direction > 0) {
5118 if (Py_UNICODE_MATCH(self, end, substring))
5119 return 1;
5120 } else {
5121 if (Py_UNICODE_MATCH(self, start, substring))
5122 return 1;
5123 }
5124
5125 return 0;
5126}
5127
Martin v. Löwis18e16552006-02-15 17:27:45 +00005128Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005129 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005130 Py_ssize_t start,
5131 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005132 int direction)
5133{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005134 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005135
Guido van Rossumd57fd912000-03-10 22:53:23 +00005136 str = PyUnicode_FromObject(str);
5137 if (str == NULL)
5138 return -1;
5139 substr = PyUnicode_FromObject(substr);
5140 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005141 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005142 return -1;
5143 }
Tim Petersced69f82003-09-16 20:30:58 +00005144
Guido van Rossumd57fd912000-03-10 22:53:23 +00005145 result = tailmatch((PyUnicodeObject *)str,
5146 (PyUnicodeObject *)substr,
5147 start, end, direction);
5148 Py_DECREF(str);
5149 Py_DECREF(substr);
5150 return result;
5151}
5152
Guido van Rossumd57fd912000-03-10 22:53:23 +00005153/* Apply fixfct filter to the Unicode object self and return a
5154 reference to the modified object */
5155
Tim Petersced69f82003-09-16 20:30:58 +00005156static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005157PyObject *fixup(PyUnicodeObject *self,
5158 int (*fixfct)(PyUnicodeObject *s))
5159{
5160
5161 PyUnicodeObject *u;
5162
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005163 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005164 if (u == NULL)
5165 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005166
5167 Py_UNICODE_COPY(u->str, self->str, self->length);
5168
Tim Peters7a29bd52001-09-12 03:03:31 +00005169 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005170 /* fixfct should return TRUE if it modified the buffer. If
5171 FALSE, return a reference to the original buffer instead
5172 (to save space, not time) */
5173 Py_INCREF(self);
5174 Py_DECREF(u);
5175 return (PyObject*) self;
5176 }
5177 return (PyObject*) u;
5178}
5179
Tim Petersced69f82003-09-16 20:30:58 +00005180static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005181int fixupper(PyUnicodeObject *self)
5182{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005183 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005184 Py_UNICODE *s = self->str;
5185 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005186
Guido van Rossumd57fd912000-03-10 22:53:23 +00005187 while (len-- > 0) {
5188 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005189
Guido van Rossumd57fd912000-03-10 22:53:23 +00005190 ch = Py_UNICODE_TOUPPER(*s);
5191 if (ch != *s) {
5192 status = 1;
5193 *s = ch;
5194 }
5195 s++;
5196 }
5197
5198 return status;
5199}
5200
Tim Petersced69f82003-09-16 20:30:58 +00005201static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005202int fixlower(PyUnicodeObject *self)
5203{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005204 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005205 Py_UNICODE *s = self->str;
5206 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005207
Guido van Rossumd57fd912000-03-10 22:53:23 +00005208 while (len-- > 0) {
5209 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005210
Guido van Rossumd57fd912000-03-10 22:53:23 +00005211 ch = Py_UNICODE_TOLOWER(*s);
5212 if (ch != *s) {
5213 status = 1;
5214 *s = ch;
5215 }
5216 s++;
5217 }
5218
5219 return status;
5220}
5221
Tim Petersced69f82003-09-16 20:30:58 +00005222static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005223int fixswapcase(PyUnicodeObject *self)
5224{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005225 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005226 Py_UNICODE *s = self->str;
5227 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005228
Guido van Rossumd57fd912000-03-10 22:53:23 +00005229 while (len-- > 0) {
5230 if (Py_UNICODE_ISUPPER(*s)) {
5231 *s = Py_UNICODE_TOLOWER(*s);
5232 status = 1;
5233 } else if (Py_UNICODE_ISLOWER(*s)) {
5234 *s = Py_UNICODE_TOUPPER(*s);
5235 status = 1;
5236 }
5237 s++;
5238 }
5239
5240 return status;
5241}
5242
Tim Petersced69f82003-09-16 20:30:58 +00005243static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005244int fixcapitalize(PyUnicodeObject *self)
5245{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005246 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005247 Py_UNICODE *s = self->str;
5248 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005249
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005250 if (len == 0)
5251 return 0;
5252 if (Py_UNICODE_ISLOWER(*s)) {
5253 *s = Py_UNICODE_TOUPPER(*s);
5254 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005255 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005256 s++;
5257 while (--len > 0) {
5258 if (Py_UNICODE_ISUPPER(*s)) {
5259 *s = Py_UNICODE_TOLOWER(*s);
5260 status = 1;
5261 }
5262 s++;
5263 }
5264 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005265}
5266
5267static
5268int fixtitle(PyUnicodeObject *self)
5269{
5270 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5271 register Py_UNICODE *e;
5272 int previous_is_cased;
5273
5274 /* Shortcut for single character strings */
5275 if (PyUnicode_GET_SIZE(self) == 1) {
5276 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5277 if (*p != ch) {
5278 *p = ch;
5279 return 1;
5280 }
5281 else
5282 return 0;
5283 }
Tim Petersced69f82003-09-16 20:30:58 +00005284
Guido van Rossumd57fd912000-03-10 22:53:23 +00005285 e = p + PyUnicode_GET_SIZE(self);
5286 previous_is_cased = 0;
5287 for (; p < e; p++) {
5288 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005289
Guido van Rossumd57fd912000-03-10 22:53:23 +00005290 if (previous_is_cased)
5291 *p = Py_UNICODE_TOLOWER(ch);
5292 else
5293 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005294
5295 if (Py_UNICODE_ISLOWER(ch) ||
5296 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005297 Py_UNICODE_ISTITLE(ch))
5298 previous_is_cased = 1;
5299 else
5300 previous_is_cased = 0;
5301 }
5302 return 1;
5303}
5304
Tim Peters8ce9f162004-08-27 01:49:32 +00005305PyObject *
5306PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005307{
Tim Peters8ce9f162004-08-27 01:49:32 +00005308 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005309 const Py_UNICODE blank = ' ';
5310 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005311 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005312 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005313 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5314 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005315 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5316 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005317 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005318 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005319 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005320
Tim Peters05eba1f2004-08-27 21:32:02 +00005321 fseq = PySequence_Fast(seq, "");
5322 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005323 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005324 }
5325
Tim Peters91879ab2004-08-27 22:35:44 +00005326 /* Grrrr. A codec may be invoked to convert str objects to
5327 * Unicode, and so it's possible to call back into Python code
5328 * during PyUnicode_FromObject(), and so it's possible for a sick
5329 * codec to change the size of fseq (if seq is a list). Therefore
5330 * we have to keep refetching the size -- can't assume seqlen
5331 * is invariant.
5332 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005333 seqlen = PySequence_Fast_GET_SIZE(fseq);
5334 /* If empty sequence, return u"". */
5335 if (seqlen == 0) {
5336 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5337 goto Done;
5338 }
5339 /* If singleton sequence with an exact Unicode, return that. */
5340 if (seqlen == 1) {
5341 item = PySequence_Fast_GET_ITEM(fseq, 0);
5342 if (PyUnicode_CheckExact(item)) {
5343 Py_INCREF(item);
5344 res = (PyUnicodeObject *)item;
5345 goto Done;
5346 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005347 }
5348
Tim Peters05eba1f2004-08-27 21:32:02 +00005349 /* At least two items to join, or one that isn't exact Unicode. */
5350 if (seqlen > 1) {
5351 /* Set up sep and seplen -- they're needed. */
5352 if (separator == NULL) {
5353 sep = &blank;
5354 seplen = 1;
5355 }
5356 else {
5357 internal_separator = PyUnicode_FromObject(separator);
5358 if (internal_separator == NULL)
5359 goto onError;
5360 sep = PyUnicode_AS_UNICODE(internal_separator);
5361 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00005362 /* In case PyUnicode_FromObject() mutated seq. */
5363 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005364 }
5365 }
5366
5367 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005368 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005369 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005370 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005371 res_p = PyUnicode_AS_UNICODE(res);
5372 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005373
Tim Peters05eba1f2004-08-27 21:32:02 +00005374 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005375 Py_ssize_t itemlen;
5376 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005377
5378 item = PySequence_Fast_GET_ITEM(fseq, i);
5379 /* Convert item to Unicode. */
5380 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5381 PyErr_Format(PyExc_TypeError,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005382 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00005383 " %.80s found",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00005384 i, Py_Type(item)->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00005385 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005386 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005387 item = PyUnicode_FromObject(item);
5388 if (item == NULL)
5389 goto onError;
5390 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005391
Tim Peters91879ab2004-08-27 22:35:44 +00005392 /* In case PyUnicode_FromObject() mutated seq. */
5393 seqlen = PySequence_Fast_GET_SIZE(fseq);
5394
Tim Peters8ce9f162004-08-27 01:49:32 +00005395 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005396 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005397 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005398 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005399 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005400 if (i < seqlen - 1) {
5401 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005402 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00005403 goto Overflow;
5404 }
5405 if (new_res_used > res_alloc) {
5406 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00005407 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00005408 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005409 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005410 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005411 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005412 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005413 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005414 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005415 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005416 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005417 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005418
5419 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005420 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005421 res_p += itemlen;
5422 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005423 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005424 res_p += seplen;
5425 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005426 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005427 res_used = new_res_used;
5428 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005429
Tim Peters05eba1f2004-08-27 21:32:02 +00005430 /* Shrink res to match the used area; this probably can't fail,
5431 * but it's cheap to check.
5432 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005433 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005434 goto onError;
5435
5436 Done:
5437 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005438 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005439 return (PyObject *)res;
5440
Tim Peters8ce9f162004-08-27 01:49:32 +00005441 Overflow:
5442 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005443 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005444 Py_DECREF(item);
5445 /* fall through */
5446
Guido van Rossumd57fd912000-03-10 22:53:23 +00005447 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005448 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005449 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005450 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005451 return NULL;
5452}
5453
Tim Petersced69f82003-09-16 20:30:58 +00005454static
5455PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005456 Py_ssize_t left,
5457 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005458 Py_UNICODE fill)
5459{
5460 PyUnicodeObject *u;
5461
5462 if (left < 0)
5463 left = 0;
5464 if (right < 0)
5465 right = 0;
5466
Tim Peters7a29bd52001-09-12 03:03:31 +00005467 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005468 Py_INCREF(self);
5469 return self;
5470 }
5471
5472 u = _PyUnicode_New(left + self->length + right);
5473 if (u) {
5474 if (left)
5475 Py_UNICODE_FILL(u->str, fill, left);
5476 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5477 if (right)
5478 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5479 }
5480
5481 return u;
5482}
5483
/* Append the slice data[left:right] to `list` as a new Unicode object.

   The expanding function must provide a `PyObject *str` scratch variable,
   a `list` object and an `onError` label; control jumps to `onError` when
   the slice cannot be created or appended.  The temporary reference held
   in `str` is released on both the success and the append-failure path.

   Wrapped in do { } while (0) so that the macro behaves as a single
   statement (safe inside an unbraced if/else); use it with a trailing
   semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
5494
5495static
5496PyObject *split_whitespace(PyUnicodeObject *self,
5497 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005498 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005499{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005500 register Py_ssize_t i;
5501 register Py_ssize_t j;
5502 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005503 PyObject *str;
5504
5505 for (i = j = 0; i < len; ) {
5506 /* find a token */
5507 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5508 i++;
5509 j = i;
5510 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
5511 i++;
5512 if (j < i) {
5513 if (maxcount-- <= 0)
5514 break;
5515 SPLIT_APPEND(self->str, j, i);
5516 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5517 i++;
5518 j = i;
5519 }
5520 }
5521 if (j < len) {
5522 SPLIT_APPEND(self->str, j, len);
5523 }
5524 return list;
5525
5526 onError:
5527 Py_DECREF(list);
5528 return NULL;
5529}
5530
5531PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005532 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005533{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005534 register Py_ssize_t i;
5535 register Py_ssize_t j;
5536 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005537 PyObject *list;
5538 PyObject *str;
5539 Py_UNICODE *data;
5540
5541 string = PyUnicode_FromObject(string);
5542 if (string == NULL)
5543 return NULL;
5544 data = PyUnicode_AS_UNICODE(string);
5545 len = PyUnicode_GET_SIZE(string);
5546
Guido van Rossumd57fd912000-03-10 22:53:23 +00005547 list = PyList_New(0);
5548 if (!list)
5549 goto onError;
5550
5551 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005552 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005553
Guido van Rossumd57fd912000-03-10 22:53:23 +00005554 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005555 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005556 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005557
5558 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005559 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005560 if (i < len) {
5561 if (data[i] == '\r' && i + 1 < len &&
5562 data[i+1] == '\n')
5563 i += 2;
5564 else
5565 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005566 if (keepends)
5567 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005568 }
Guido van Rossum86662912000-04-11 15:38:46 +00005569 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005570 j = i;
5571 }
5572 if (j < len) {
5573 SPLIT_APPEND(data, j, len);
5574 }
5575
5576 Py_DECREF(string);
5577 return list;
5578
5579 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005580 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005581 Py_DECREF(string);
5582 return NULL;
5583}
5584
Tim Petersced69f82003-09-16 20:30:58 +00005585static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005586PyObject *split_char(PyUnicodeObject *self,
5587 PyObject *list,
5588 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005589 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005590{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005591 register Py_ssize_t i;
5592 register Py_ssize_t j;
5593 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005594 PyObject *str;
5595
5596 for (i = j = 0; i < len; ) {
5597 if (self->str[i] == ch) {
5598 if (maxcount-- <= 0)
5599 break;
5600 SPLIT_APPEND(self->str, j, i);
5601 i = j = i + 1;
5602 } else
5603 i++;
5604 }
5605 if (j <= len) {
5606 SPLIT_APPEND(self->str, j, len);
5607 }
5608 return list;
5609
5610 onError:
5611 Py_DECREF(list);
5612 return NULL;
5613}
5614
Tim Petersced69f82003-09-16 20:30:58 +00005615static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005616PyObject *split_substring(PyUnicodeObject *self,
5617 PyObject *list,
5618 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005619 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005620{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005621 register Py_ssize_t i;
5622 register Py_ssize_t j;
5623 Py_ssize_t len = self->length;
5624 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005625 PyObject *str;
5626
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005627 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005628 if (Py_UNICODE_MATCH(self, i, substring)) {
5629 if (maxcount-- <= 0)
5630 break;
5631 SPLIT_APPEND(self->str, j, i);
5632 i = j = i + sublen;
5633 } else
5634 i++;
5635 }
5636 if (j <= len) {
5637 SPLIT_APPEND(self->str, j, len);
5638 }
5639 return list;
5640
5641 onError:
5642 Py_DECREF(list);
5643 return NULL;
5644}
5645
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005646static
5647PyObject *rsplit_whitespace(PyUnicodeObject *self,
5648 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005649 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005650{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005651 register Py_ssize_t i;
5652 register Py_ssize_t j;
5653 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005654 PyObject *str;
5655
5656 for (i = j = len - 1; i >= 0; ) {
5657 /* find a token */
5658 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5659 i--;
5660 j = i;
5661 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
5662 i--;
5663 if (j > i) {
5664 if (maxcount-- <= 0)
5665 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005666 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005667 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5668 i--;
5669 j = i;
5670 }
5671 }
5672 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005673 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005674 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005675 if (PyList_Reverse(list) < 0)
5676 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005677 return list;
5678
5679 onError:
5680 Py_DECREF(list);
5681 return NULL;
5682}
5683
5684static
5685PyObject *rsplit_char(PyUnicodeObject *self,
5686 PyObject *list,
5687 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005688 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005689{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005690 register Py_ssize_t i;
5691 register Py_ssize_t j;
5692 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005693 PyObject *str;
5694
5695 for (i = j = len - 1; i >= 0; ) {
5696 if (self->str[i] == ch) {
5697 if (maxcount-- <= 0)
5698 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005699 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005700 j = i = i - 1;
5701 } else
5702 i--;
5703 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005704 if (j >= -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005705 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005706 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005707 if (PyList_Reverse(list) < 0)
5708 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005709 return list;
5710
5711 onError:
5712 Py_DECREF(list);
5713 return NULL;
5714}
5715
5716static
5717PyObject *rsplit_substring(PyUnicodeObject *self,
5718 PyObject *list,
5719 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005720 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005721{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005722 register Py_ssize_t i;
5723 register Py_ssize_t j;
5724 Py_ssize_t len = self->length;
5725 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005726 PyObject *str;
5727
5728 for (i = len - sublen, j = len; i >= 0; ) {
5729 if (Py_UNICODE_MATCH(self, i, substring)) {
5730 if (maxcount-- <= 0)
5731 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005732 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005733 j = i;
5734 i -= sublen;
5735 } else
5736 i--;
5737 }
5738 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005739 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005740 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005741 if (PyList_Reverse(list) < 0)
5742 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005743 return list;
5744
5745 onError:
5746 Py_DECREF(list);
5747 return NULL;
5748}
5749
Guido van Rossumd57fd912000-03-10 22:53:23 +00005750#undef SPLIT_APPEND
5751
5752static
5753PyObject *split(PyUnicodeObject *self,
5754 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005755 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005756{
5757 PyObject *list;
5758
5759 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005760 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005761
5762 list = PyList_New(0);
5763 if (!list)
5764 return NULL;
5765
5766 if (substring == NULL)
5767 return split_whitespace(self,list,maxcount);
5768
5769 else if (substring->length == 1)
5770 return split_char(self,list,substring->str[0],maxcount);
5771
5772 else if (substring->length == 0) {
5773 Py_DECREF(list);
5774 PyErr_SetString(PyExc_ValueError, "empty separator");
5775 return NULL;
5776 }
5777 else
5778 return split_substring(self,list,substring,maxcount);
5779}
5780
Tim Petersced69f82003-09-16 20:30:58 +00005781static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005782PyObject *rsplit(PyUnicodeObject *self,
5783 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005784 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005785{
5786 PyObject *list;
5787
5788 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005789 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005790
5791 list = PyList_New(0);
5792 if (!list)
5793 return NULL;
5794
5795 if (substring == NULL)
5796 return rsplit_whitespace(self,list,maxcount);
5797
5798 else if (substring->length == 1)
5799 return rsplit_char(self,list,substring->str[0],maxcount);
5800
5801 else if (substring->length == 0) {
5802 Py_DECREF(list);
5803 PyErr_SetString(PyExc_ValueError, "empty separator");
5804 return NULL;
5805 }
5806 else
5807 return rsplit_substring(self,list,substring,maxcount);
5808}
5809
5810static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005811PyObject *replace(PyUnicodeObject *self,
5812 PyUnicodeObject *str1,
5813 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005814 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005815{
5816 PyUnicodeObject *u;
5817
5818 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005819 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005820
Thomas Wouters477c8d52006-05-27 19:21:47 +00005821 if (str1->length == str2->length) {
5822 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005823 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005824 if (str1->length == 1) {
5825 /* replace characters */
5826 Py_UNICODE u1, u2;
5827 if (!findchar(self->str, self->length, str1->str[0]))
5828 goto nothing;
5829 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5830 if (!u)
5831 return NULL;
5832 Py_UNICODE_COPY(u->str, self->str, self->length);
5833 u1 = str1->str[0];
5834 u2 = str2->str[0];
5835 for (i = 0; i < u->length; i++)
5836 if (u->str[i] == u1) {
5837 if (--maxcount < 0)
5838 break;
5839 u->str[i] = u2;
5840 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005841 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005842 i = fastsearch(
5843 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005844 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00005845 if (i < 0)
5846 goto nothing;
5847 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5848 if (!u)
5849 return NULL;
5850 Py_UNICODE_COPY(u->str, self->str, self->length);
5851 while (i <= self->length - str1->length)
5852 if (Py_UNICODE_MATCH(self, i, str1)) {
5853 if (--maxcount < 0)
5854 break;
5855 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5856 i += str1->length;
5857 } else
5858 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005859 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005860 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005861
5862 Py_ssize_t n, i, j, e;
5863 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005864 Py_UNICODE *p;
5865
5866 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005867 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005868 if (n > maxcount)
5869 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005870 if (n == 0)
5871 goto nothing;
5872 /* new_size = self->length + n * (str2->length - str1->length)); */
5873 delta = (str2->length - str1->length);
5874 if (delta == 0) {
5875 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005876 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005877 product = n * (str2->length - str1->length);
5878 if ((product / (str2->length - str1->length)) != n) {
5879 PyErr_SetString(PyExc_OverflowError,
5880 "replace string is too long");
5881 return NULL;
5882 }
5883 new_size = self->length + product;
5884 if (new_size < 0) {
5885 PyErr_SetString(PyExc_OverflowError,
5886 "replace string is too long");
5887 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005888 }
5889 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005890 u = _PyUnicode_New(new_size);
5891 if (!u)
5892 return NULL;
5893 i = 0;
5894 p = u->str;
5895 e = self->length - str1->length;
5896 if (str1->length > 0) {
5897 while (n-- > 0) {
5898 /* look for next match */
5899 j = i;
5900 while (j <= e) {
5901 if (Py_UNICODE_MATCH(self, j, str1))
5902 break;
5903 j++;
5904 }
5905 if (j > i) {
5906 if (j > e)
5907 break;
5908 /* copy unchanged part [i:j] */
5909 Py_UNICODE_COPY(p, self->str+i, j-i);
5910 p += j - i;
5911 }
5912 /* copy substitution string */
5913 if (str2->length > 0) {
5914 Py_UNICODE_COPY(p, str2->str, str2->length);
5915 p += str2->length;
5916 }
5917 i = j + str1->length;
5918 }
5919 if (i < self->length)
5920 /* copy tail [i:] */
5921 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5922 } else {
5923 /* interleave */
5924 while (n > 0) {
5925 Py_UNICODE_COPY(p, str2->str, str2->length);
5926 p += str2->length;
5927 if (--n <= 0)
5928 break;
5929 *p++ = self->str[i++];
5930 }
5931 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5932 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005933 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005934 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005935
5936nothing:
5937 /* nothing to replace; return original string (when possible) */
5938 if (PyUnicode_CheckExact(self)) {
5939 Py_INCREF(self);
5940 return (PyObject *) self;
5941 }
5942 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005943}
5944
5945/* --- Unicode Object Methods --------------------------------------------- */
5946
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005947PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005948"S.title() -> unicode\n\
5949\n\
5950Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005951characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005952
5953static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005954unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005955{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005956 return fixup(self, fixtitle);
5957}
5958
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005959PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005960"S.capitalize() -> unicode\n\
5961\n\
5962Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005963have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005964
5965static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005966unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005967{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005968 return fixup(self, fixcapitalize);
5969}
5970
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split on whitespace,
   capitalize every word in place in the list, then join the words with
   PyUnicode_Join(NULL, ...).  Returns NULL if splitting, capitalizing or
   joining fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capped = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                                 fixcapitalize);
        if (capped == NULL)
            goto done;
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capped);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
6008
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006009/* Argument converter. Coerces to a single unicode character */
6010
6011static int
6012convert_uc(PyObject *obj, void *addr)
6013{
6014 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6015 PyObject *uniobj;
6016 Py_UNICODE *unistr;
6017
6018 uniobj = PyUnicode_FromObject(obj);
6019 if (uniobj == NULL) {
6020 PyErr_SetString(PyExc_TypeError,
6021 "The fill character cannot be converted to Unicode");
6022 return 0;
6023 }
6024 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6025 PyErr_SetString(PyExc_TypeError,
6026 "The fill character must be exactly one character long");
6027 Py_DECREF(uniobj);
6028 return 0;
6029 }
6030 unistr = PyUnicode_AS_UNICODE(uniobj);
6031 *fillcharloc = unistr[0];
6032 Py_DECREF(uniobj);
6033 return 1;
6034}
6035
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006036PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006037"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006038\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006039Return S centered in a Unicode string of length width. Padding is\n\
6040done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006041
6042static PyObject *
6043unicode_center(PyUnicodeObject *self, PyObject *args)
6044{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006045 Py_ssize_t marg, left;
6046 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006047 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006048
Thomas Woutersde017742006-02-16 19:34:37 +00006049 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006050 return NULL;
6051
Tim Peters7a29bd52001-09-12 03:03:31 +00006052 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006053 Py_INCREF(self);
6054 return (PyObject*) self;
6055 }
6056
6057 marg = width - self->length;
6058 left = marg / 2 + (marg & width & 1);
6059
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006060 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006061}
6062
Marc-André Lemburge5034372000-08-08 08:04:29 +00006063#if 0
6064
6065/* This code should go into some future Unicode collation support
6066 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006067 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006068
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006069/* speedy UTF-16 code point order comparison */
6070/* gleaned from: */
6071/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6072
/* Per-block correction table for the disabled UTF-16 code point order
   comparison; it is indexed with (code unit >> 11), i.e. one entry per
   block of 0x800 code units.  Only the last five entries are non-zero. */
static short utf16Fixup[32] =
{
    /* blocks  0 ..  7 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* blocks  8 .. 15 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* blocks 16 .. 23 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* blocks 24 .. 31 */ 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
};
6080
Guido van Rossumd57fd912000-03-10 22:53:23 +00006081static int
6082unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6083{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006084 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006085
Guido van Rossumd57fd912000-03-10 22:53:23 +00006086 Py_UNICODE *s1 = str1->str;
6087 Py_UNICODE *s2 = str2->str;
6088
6089 len1 = str1->length;
6090 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006091
Guido van Rossumd57fd912000-03-10 22:53:23 +00006092 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006093 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006094
6095 c1 = *s1++;
6096 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006097
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006098 if (c1 > (1<<11) * 26)
6099 c1 += utf16Fixup[c1>>11];
6100 if (c2 > (1<<11) * 26)
6101 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006102 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006103
6104 if (c1 != c2)
6105 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006106
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006107 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006108 }
6109
6110 return (len1 < len2) ? -1 : (len1 != len2);
6111}
6112
Marc-André Lemburge5034372000-08-08 08:04:29 +00006113#else
6114
6115static int
6116unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6117{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006118 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006119
6120 Py_UNICODE *s1 = str1->str;
6121 Py_UNICODE *s2 = str2->str;
6122
6123 len1 = str1->length;
6124 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006125
Marc-André Lemburge5034372000-08-08 08:04:29 +00006126 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006127 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006128
Fredrik Lundh45714e92001-06-26 16:39:36 +00006129 c1 = *s1++;
6130 c2 = *s2++;
6131
6132 if (c1 != c2)
6133 return (c1 < c2) ? -1 : 1;
6134
Marc-André Lemburge5034372000-08-08 08:04:29 +00006135 len1--; len2--;
6136 }
6137
6138 return (len1 < len2) ? -1 : (len1 != len2);
6139}
6140
6141#endif
6142
Guido van Rossumd57fd912000-03-10 22:53:23 +00006143int PyUnicode_Compare(PyObject *left,
6144 PyObject *right)
6145{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006146 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6147 return unicode_compare((PyUnicodeObject *)left,
6148 (PyUnicodeObject *)right);
6149 if ((PyString_Check(left) && PyUnicode_Check(right)) ||
6150 (PyUnicode_Check(left) && PyString_Check(right))) {
6151 if (PyUnicode_Check(left))
6152 left = _PyUnicode_AsDefaultEncodedString(left, NULL);
6153 if (PyUnicode_Check(right))
6154 right = _PyUnicode_AsDefaultEncodedString(right, NULL);
6155 assert(PyString_Check(left));
6156 assert(PyString_Check(right));
6157 return PyObject_Compare(left, right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006158 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006159 PyErr_Format(PyExc_TypeError,
6160 "Can't compare %.100s and %.100s",
6161 left->ob_type->tp_name,
6162 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006163 return -1;
6164}
6165
Martin v. Löwis5b222132007-06-10 09:51:05 +00006166int
6167PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6168{
6169 int i;
6170 Py_UNICODE *id;
6171 assert(PyUnicode_Check(uni));
6172 id = PyUnicode_AS_UNICODE(uni);
6173 /* Compare Unicode string and source character set string */
6174 for (i = 0; id[i] && str[i]; i++)
6175 if (id[i] != str[i])
6176 return ((int)id[i] < (int)str[i]) ? -1 : 1;
6177 if (id[i])
6178 return 1; /* uni is longer */
6179 if (str[i])
6180 return -1; /* str is longer */
6181 return 0;
6182}
6183
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006184PyObject *PyUnicode_RichCompare(PyObject *left,
6185 PyObject *right,
6186 int op)
6187{
6188 int result;
6189
6190 result = PyUnicode_Compare(left, right);
6191 if (result == -1 && PyErr_Occurred())
6192 goto onError;
6193
6194 /* Convert the return value to a Boolean */
6195 switch (op) {
6196 case Py_EQ:
6197 result = (result == 0);
6198 break;
6199 case Py_NE:
6200 result = (result != 0);
6201 break;
6202 case Py_LE:
6203 result = (result <= 0);
6204 break;
6205 case Py_GE:
6206 result = (result >= 0);
6207 break;
6208 case Py_LT:
6209 result = (result == -1);
6210 break;
6211 case Py_GT:
6212 result = (result == 1);
6213 break;
6214 }
6215 return PyBool_FromLong(result);
6216
6217 onError:
6218
6219 /* Standard case
6220
6221 Type errors mean that PyUnicode_FromObject() could not convert
6222 one of the arguments (usually the right hand side) to Unicode,
6223 ie. we can't handle the comparison request. However, it is
6224 possible that the other object knows a comparison method, which
6225 is why we return Py_NotImplemented to give the other object a
6226 chance.
6227
6228 */
6229 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6230 PyErr_Clear();
6231 Py_INCREF(Py_NotImplemented);
6232 return Py_NotImplemented;
6233 }
6234 if (op != Py_EQ && op != Py_NE)
6235 return NULL;
6236
6237 /* Equality comparison.
6238
6239 This is a special case: we silence any PyExc_UnicodeDecodeError
6240 and instead turn it into a PyErr_UnicodeWarning.
6241
6242 */
6243 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6244 return NULL;
6245 PyErr_Clear();
Skip Montanaro46fc3372007-08-12 11:44:53 +00006246 if (PyErr_WarnEx(PyExc_UnicodeWarning,
6247 (op == Py_EQ) ?
6248 "Unicode equal comparison "
6249 "failed to convert both arguments to Unicode - "
6250 "interpreting them as being unequal"
6251 :
6252 "Unicode unequal comparison "
6253 "failed to convert both arguments to Unicode - "
6254 "interpreting them as being unequal",
6255 1) < 0)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006256 return NULL;
6257 result = (op == Py_NE);
6258 return PyBool_FromLong(result);
6259}
6260
Guido van Rossum403d68b2000-03-13 15:55:09 +00006261int PyUnicode_Contains(PyObject *container,
6262 PyObject *element)
6263{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006264 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006265 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006266
6267 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006268 sub = PyUnicode_FromObject(element);
6269 if (!sub) {
Walter Dörwald26e0f512007-06-12 16:51:31 +00006270 PyErr_Format(PyExc_TypeError,
6271 "'in <string>' requires string as left operand, not %s",
6272 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006273 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006274 }
6275
Thomas Wouters477c8d52006-05-27 19:21:47 +00006276 str = PyUnicode_FromObject(container);
6277 if (!str) {
6278 Py_DECREF(sub);
6279 return -1;
6280 }
6281
6282 result = stringlib_contains_obj(str, sub);
6283
6284 Py_DECREF(str);
6285 Py_DECREF(sub);
6286
Guido van Rossum403d68b2000-03-13 15:55:09 +00006287 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006288}
6289
Guido van Rossumd57fd912000-03-10 22:53:23 +00006290/* Concat to string or Unicode object giving a new Unicode object. */
6291
6292PyObject *PyUnicode_Concat(PyObject *left,
6293 PyObject *right)
6294{
6295 PyUnicodeObject *u = NULL, *v = NULL, *w;
6296
Guido van Rossum84d79dd2007-04-13 02:23:57 +00006297 if (PyBytes_Check(left) || PyBytes_Check(right))
6298 return PyBytes_Concat(left, right);
6299
Guido van Rossumd57fd912000-03-10 22:53:23 +00006300 /* Coerce the two arguments */
6301 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6302 if (u == NULL)
6303 goto onError;
6304 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6305 if (v == NULL)
6306 goto onError;
6307
6308 /* Shortcuts */
6309 if (v == unicode_empty) {
6310 Py_DECREF(v);
6311 return (PyObject *)u;
6312 }
6313 if (u == unicode_empty) {
6314 Py_DECREF(u);
6315 return (PyObject *)v;
6316 }
6317
6318 /* Concat the two Unicode strings */
6319 w = _PyUnicode_New(u->length + v->length);
6320 if (w == NULL)
6321 goto onError;
6322 Py_UNICODE_COPY(w->str, u->str, u->length);
6323 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6324
6325 Py_DECREF(u);
6326 Py_DECREF(v);
6327 return (PyObject *)w;
6328
6329onError:
6330 Py_XDECREF(u);
6331 Py_XDECREF(v);
6332 return NULL;
6333}
6334
Walter Dörwald1ab83302007-05-18 17:15:44 +00006335void
6336PyUnicode_Append(PyObject **pleft, PyObject *right)
6337{
6338 PyObject *new;
6339 if (*pleft == NULL)
6340 return;
6341 if (right == NULL || !PyUnicode_Check(*pleft)) {
6342 Py_DECREF(*pleft);
6343 *pleft = NULL;
6344 return;
6345 }
6346 new = PyUnicode_Concat(*pleft, right);
6347 Py_DECREF(*pleft);
6348 *pleft = new;
6349}
6350
6351void
6352PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6353{
6354 PyUnicode_Append(pleft, right);
6355 Py_XDECREF(right);
6356}
6357
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006358PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006359"S.count(sub[, start[, end]]) -> int\n\
6360\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006361Return the number of non-overlapping occurrences of substring sub in\n\
6362Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006363interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006364
6365static PyObject *
6366unicode_count(PyUnicodeObject *self, PyObject *args)
6367{
6368 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006369 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006370 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006371 PyObject *result;
6372
Guido van Rossumb8872e62000-05-09 14:14:27 +00006373 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6374 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006375 return NULL;
6376
6377 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006378 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006379 if (substring == NULL)
6380 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006381
Thomas Wouters477c8d52006-05-27 19:21:47 +00006382 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006383
Thomas Wouters477c8d52006-05-27 19:21:47 +00006384 result = PyInt_FromSsize_t(
6385 stringlib_count(self->str + start, end - start,
6386 substring->str, substring->length)
6387 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006388
6389 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006390
Guido van Rossumd57fd912000-03-10 22:53:23 +00006391 return result;
6392}
6393
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006394PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006395"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006396\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006397Encodes S using the codec registered for encoding. encoding defaults\n\
6398to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006399handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006400a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6401'xmlcharrefreplace' as well as any other name registered with\n\
6402codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006403
6404static PyObject *
6405unicode_encode(PyUnicodeObject *self, PyObject *args)
6406{
6407 char *encoding = NULL;
6408 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006409 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00006410
Guido van Rossumd57fd912000-03-10 22:53:23 +00006411 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6412 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006413 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006414 if (v == NULL)
6415 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006416 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006417 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006418 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006419 "(type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00006420 Py_Type(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006421 Py_DECREF(v);
6422 return NULL;
6423 }
6424 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006425
6426 onError:
6427 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006428}
6429
6430PyDoc_STRVAR(decode__doc__,
6431"S.decode([encoding[,errors]]) -> string or unicode\n\
6432\n\
6433Decodes S using the codec registered for encoding. encoding defaults\n\
6434to the default encoding. errors may be given to set a different error\n\
6435handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6436a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6437as well as any other name registerd with codecs.register_error that is\n\
6438able to handle UnicodeDecodeErrors.");
6439
6440static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00006441unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006442{
6443 char *encoding = NULL;
6444 char *errors = NULL;
6445 PyObject *v;
6446
6447 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
6448 return NULL;
6449 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006450 if (v == NULL)
6451 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006452 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6453 PyErr_Format(PyExc_TypeError,
6454 "decoder did not return a string/unicode object "
6455 "(type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00006456 Py_Type(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006457 Py_DECREF(v);
6458 return NULL;
6459 }
6460 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006461
6462 onError:
6463 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006464}
6465
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006466PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006467"S.expandtabs([tabsize]) -> unicode\n\
6468\n\
6469Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006470If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006471
6472static PyObject*
6473unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6474{
6475 Py_UNICODE *e;
6476 Py_UNICODE *p;
6477 Py_UNICODE *q;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006478 Py_ssize_t i, j, old_j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006479 PyUnicodeObject *u;
6480 int tabsize = 8;
6481
6482 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6483 return NULL;
6484
Thomas Wouters7e474022000-07-16 12:04:32 +00006485 /* First pass: determine size of output string */
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006486 i = j = old_j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006487 e = self->str + self->length;
6488 for (p = self->str; p < e; p++)
6489 if (*p == '\t') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006490 if (tabsize > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006491 j += tabsize - (j % tabsize);
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006492 if (old_j > j) {
6493 PyErr_SetString(PyExc_OverflowError,
6494 "new string is too long");
6495 return NULL;
6496 }
6497 old_j = j;
6498 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006499 }
6500 else {
6501 j++;
6502 if (*p == '\n' || *p == '\r') {
6503 i += j;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006504 old_j = j = 0;
6505 if (i < 0) {
6506 PyErr_SetString(PyExc_OverflowError,
6507 "new string is too long");
6508 return NULL;
6509 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006510 }
6511 }
6512
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006513 if ((i + j) < 0) {
6514 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6515 return NULL;
6516 }
6517
Guido van Rossumd57fd912000-03-10 22:53:23 +00006518 /* Second pass: create output string and fill it */
6519 u = _PyUnicode_New(i + j);
6520 if (!u)
6521 return NULL;
6522
6523 j = 0;
6524 q = u->str;
6525
6526 for (p = self->str; p < e; p++)
6527 if (*p == '\t') {
6528 if (tabsize > 0) {
6529 i = tabsize - (j % tabsize);
6530 j += i;
6531 while (i--)
6532 *q++ = ' ';
6533 }
6534 }
6535 else {
6536 j++;
6537 *q++ = *p;
6538 if (*p == '\n' || *p == '\r')
6539 j = 0;
6540 }
6541
6542 return (PyObject*) u;
6543}
6544
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006545PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006546"S.find(sub [,start [,end]]) -> int\n\
6547\n\
6548Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00006549such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006550arguments start and end are interpreted as in slice notation.\n\
6551\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006552Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006553
6554static PyObject *
6555unicode_find(PyUnicodeObject *self, PyObject *args)
6556{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006557 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006558 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006559 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006560 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006561
Guido van Rossumb8872e62000-05-09 14:14:27 +00006562 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
6563 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006564 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006565 substring = PyUnicode_FromObject(substring);
6566 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006567 return NULL;
6568
Thomas Wouters477c8d52006-05-27 19:21:47 +00006569 result = stringlib_find_slice(
6570 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6571 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6572 start, end
6573 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006574
6575 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006576
6577 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006578}
6579
6580static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006581unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006582{
6583 if (index < 0 || index >= self->length) {
6584 PyErr_SetString(PyExc_IndexError, "string index out of range");
6585 return NULL;
6586 }
6587
6588 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6589}
6590
6591static long
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006592unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006593{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006594 /* Since Unicode objects compare equal to their UTF-8 string
6595 counterparts, we hash the UTF-8 string. */
6596 PyObject *v = _PyUnicode_AsDefaultEncodedString(self, NULL);
6597 return PyObject_Hash(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006598}
6599
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006600PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006601"S.index(sub [,start [,end]]) -> int\n\
6602\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006603Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006604
6605static PyObject *
6606unicode_index(PyUnicodeObject *self, PyObject *args)
6607{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006608 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006609 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006610 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006611 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006612
Guido van Rossumb8872e62000-05-09 14:14:27 +00006613 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
6614 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006615 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006616 substring = PyUnicode_FromObject(substring);
6617 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006618 return NULL;
6619
Thomas Wouters477c8d52006-05-27 19:21:47 +00006620 result = stringlib_find_slice(
6621 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6622 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6623 start, end
6624 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006625
6626 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006627
Guido van Rossumd57fd912000-03-10 22:53:23 +00006628 if (result < 0) {
6629 PyErr_SetString(PyExc_ValueError, "substring not found");
6630 return NULL;
6631 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006632
Martin v. Löwis18e16552006-02-15 17:27:45 +00006633 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006634}
6635
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006636PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006637"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006638\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006639Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006640at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006641
6642static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006643unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006644{
6645 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6646 register const Py_UNICODE *e;
6647 int cased;
6648
Guido van Rossumd57fd912000-03-10 22:53:23 +00006649 /* Shortcut for single character strings */
6650 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006651 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006652
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006653 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006654 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006655 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006656
Guido van Rossumd57fd912000-03-10 22:53:23 +00006657 e = p + PyUnicode_GET_SIZE(self);
6658 cased = 0;
6659 for (; p < e; p++) {
6660 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006661
Guido van Rossumd57fd912000-03-10 22:53:23 +00006662 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006663 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006664 else if (!cased && Py_UNICODE_ISLOWER(ch))
6665 cased = 1;
6666 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006667 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006668}
6669
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006670PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006671"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006672\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006673Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006674at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006675
6676static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006677unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006678{
6679 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6680 register const Py_UNICODE *e;
6681 int cased;
6682
Guido van Rossumd57fd912000-03-10 22:53:23 +00006683 /* Shortcut for single character strings */
6684 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006685 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006686
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006687 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006688 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006689 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006690
Guido van Rossumd57fd912000-03-10 22:53:23 +00006691 e = p + PyUnicode_GET_SIZE(self);
6692 cased = 0;
6693 for (; p < e; p++) {
6694 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006695
Guido van Rossumd57fd912000-03-10 22:53:23 +00006696 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006697 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006698 else if (!cased && Py_UNICODE_ISUPPER(ch))
6699 cased = 1;
6700 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006701 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006702}
6703
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006704PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006705"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006706\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006707Return True if S is a titlecased string and there is at least one\n\
6708character in S, i.e. upper- and titlecase characters may only\n\
6709follow uncased characters and lowercase characters only cased ones.\n\
6710Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006711
6712static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006713unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006714{
6715 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6716 register const Py_UNICODE *e;
6717 int cased, previous_is_cased;
6718
Guido van Rossumd57fd912000-03-10 22:53:23 +00006719 /* Shortcut for single character strings */
6720 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006721 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6722 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006723
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006724 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006725 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006726 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006727
Guido van Rossumd57fd912000-03-10 22:53:23 +00006728 e = p + PyUnicode_GET_SIZE(self);
6729 cased = 0;
6730 previous_is_cased = 0;
6731 for (; p < e; p++) {
6732 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006733
Guido van Rossumd57fd912000-03-10 22:53:23 +00006734 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6735 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006736 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006737 previous_is_cased = 1;
6738 cased = 1;
6739 }
6740 else if (Py_UNICODE_ISLOWER(ch)) {
6741 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006742 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006743 previous_is_cased = 1;
6744 cased = 1;
6745 }
6746 else
6747 previous_is_cased = 0;
6748 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006749 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006750}
6751
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006752PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006753"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006754\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006755Return True if all characters in S are whitespace\n\
6756and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006757
6758static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006759unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006760{
6761 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6762 register const Py_UNICODE *e;
6763
Guido van Rossumd57fd912000-03-10 22:53:23 +00006764 /* Shortcut for single character strings */
6765 if (PyUnicode_GET_SIZE(self) == 1 &&
6766 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006767 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006768
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006769 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006770 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006771 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006772
Guido van Rossumd57fd912000-03-10 22:53:23 +00006773 e = p + PyUnicode_GET_SIZE(self);
6774 for (; p < e; p++) {
6775 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006776 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006777 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006778 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006779}
6780
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006781PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006782"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006783\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006784Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006785and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006786
6787static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006788unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006789{
6790 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6791 register const Py_UNICODE *e;
6792
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006793 /* Shortcut for single character strings */
6794 if (PyUnicode_GET_SIZE(self) == 1 &&
6795 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006796 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006797
6798 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006799 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006800 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006801
6802 e = p + PyUnicode_GET_SIZE(self);
6803 for (; p < e; p++) {
6804 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006805 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006806 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006807 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006808}
6809
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006810PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006811"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006812\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006813Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006814and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006815
6816static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006817unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006818{
6819 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6820 register const Py_UNICODE *e;
6821
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006822 /* Shortcut for single character strings */
6823 if (PyUnicode_GET_SIZE(self) == 1 &&
6824 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006825 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006826
6827 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006828 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006829 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006830
6831 e = p + PyUnicode_GET_SIZE(self);
6832 for (; p < e; p++) {
6833 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006834 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006835 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006836 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006837}
6838
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006839PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006840"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006841\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006842Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006843False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006844
6845static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006846unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006847{
6848 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6849 register const Py_UNICODE *e;
6850
Guido van Rossumd57fd912000-03-10 22:53:23 +00006851 /* Shortcut for single character strings */
6852 if (PyUnicode_GET_SIZE(self) == 1 &&
6853 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006854 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006855
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006856 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006857 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006858 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006859
Guido van Rossumd57fd912000-03-10 22:53:23 +00006860 e = p + PyUnicode_GET_SIZE(self);
6861 for (; p < e; p++) {
6862 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006863 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006864 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006865 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006866}
6867
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006868PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006869"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006870\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006871Return True if all characters in S are digits\n\
6872and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006873
6874static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006875unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006876{
6877 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6878 register const Py_UNICODE *e;
6879
Guido van Rossumd57fd912000-03-10 22:53:23 +00006880 /* Shortcut for single character strings */
6881 if (PyUnicode_GET_SIZE(self) == 1 &&
6882 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006883 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006884
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006885 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006886 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006887 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006888
Guido van Rossumd57fd912000-03-10 22:53:23 +00006889 e = p + PyUnicode_GET_SIZE(self);
6890 for (; p < e; p++) {
6891 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006892 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006893 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006894 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006895}
6896
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006897PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006898"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006899\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006900Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006901False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006902
6903static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006904unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006905{
6906 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6907 register const Py_UNICODE *e;
6908
Guido van Rossumd57fd912000-03-10 22:53:23 +00006909 /* Shortcut for single character strings */
6910 if (PyUnicode_GET_SIZE(self) == 1 &&
6911 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006912 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006913
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006914 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006915 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006916 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006917
Guido van Rossumd57fd912000-03-10 22:53:23 +00006918 e = p + PyUnicode_GET_SIZE(self);
6919 for (; p < e; p++) {
6920 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006921 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006922 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006923 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006924}
6925
Martin v. Löwis47383402007-08-15 07:32:56 +00006926int
6927PyUnicode_IsIdentifier(PyObject *self)
6928{
6929 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
6930 register const Py_UNICODE *e;
6931
6932 /* Special case for empty strings */
6933 if (PyUnicode_GET_SIZE(self) == 0)
6934 return 0;
6935
6936 /* PEP 3131 says that the first character must be in
6937 XID_Start and subsequent characters in XID_Continue,
6938 and for the ASCII range, the 2.x rules apply (i.e
6939 start with letters and underscore, continue with
6940 letters, digits, underscore). However, given the current
6941 definition of XID_Start and XID_Continue, it is sufficient
6942 to check just for these, except that _ must be allowed
6943 as starting an identifier. */
6944 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
6945 return 0;
6946
6947 e = p + PyUnicode_GET_SIZE(self);
6948 for (p++; p < e; p++) {
6949 if (!_PyUnicode_IsXidContinue(*p))
6950 return 0;
6951 }
6952 return 1;
6953}
6954
6955PyDoc_STRVAR(isidentifier__doc__,
6956"S.isidentifier() -> bool\n\
6957\n\
6958Return True if S is a valid identifier according\n\
6959to the language definition.");
6960
6961static PyObject*
6962unicode_isidentifier(PyObject *self)
6963{
6964 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
6965}
6966
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006967PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006968"S.join(sequence) -> unicode\n\
6969\n\
6970Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006971sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006972
6973static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006974unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006975{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006976 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006977}
6978
Martin v. Löwis18e16552006-02-15 17:27:45 +00006979static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006980unicode_length(PyUnicodeObject *self)
6981{
6982 return self->length;
6983}
6984
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006985PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006986"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006987\n\
6988Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006989done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006990
6991static PyObject *
6992unicode_ljust(PyUnicodeObject *self, PyObject *args)
6993{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006994 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006995 Py_UNICODE fillchar = ' ';
6996
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006997 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006998 return NULL;
6999
Tim Peters7a29bd52001-09-12 03:03:31 +00007000 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007001 Py_INCREF(self);
7002 return (PyObject*) self;
7003 }
7004
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007005 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007006}
7007
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007008PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007009"S.lower() -> unicode\n\
7010\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007011Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007012
7013static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007014unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007015{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007016 return fixup(self, fixlower);
7017}
7018
/* Strip modes understood by the strip helpers below. */
#define LEFTSTRIP  0
#define RIGHTSTRIP 1
#define BOTHSTRIP  2

/* Argument-parsing format strings, indexed by the strip modes above. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string without its
   three-character "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
7027
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007028/* externally visible for str.strip(unicode) */
7029PyObject *
7030_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7031{
7032 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007033 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007034 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007035 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7036 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007037
Thomas Wouters477c8d52006-05-27 19:21:47 +00007038 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7039
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007040 i = 0;
7041 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007042 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7043 i++;
7044 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007045 }
7046
7047 j = len;
7048 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007049 do {
7050 j--;
7051 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7052 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007053 }
7054
7055 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007056 Py_INCREF(self);
7057 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007058 }
7059 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00007060 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007061}
7062
Guido van Rossumd57fd912000-03-10 22:53:23 +00007063
7064static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007065do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007066{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007067 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007068 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007069
7070 i = 0;
7071 if (striptype != RIGHTSTRIP) {
7072 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7073 i++;
7074 }
7075 }
7076
7077 j = len;
7078 if (striptype != LEFTSTRIP) {
7079 do {
7080 j--;
7081 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7082 j++;
7083 }
7084
7085 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7086 Py_INCREF(self);
7087 return (PyObject*)self;
7088 }
7089 else
7090 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007091}
7092
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007093
7094static PyObject *
7095do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7096{
7097 PyObject *sep = NULL;
7098
7099 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7100 return NULL;
7101
7102 if (sep != NULL && sep != Py_None) {
7103 if (PyUnicode_Check(sep))
7104 return _PyUnicode_XStrip(self, striptype, sep);
7105 else if (PyString_Check(sep)) {
7106 PyObject *res;
7107 sep = PyUnicode_FromObject(sep);
7108 if (sep==NULL)
7109 return NULL;
7110 res = _PyUnicode_XStrip(self, striptype, sep);
7111 Py_DECREF(sep);
7112 return res;
7113 }
7114 else {
7115 PyErr_Format(PyExc_TypeError,
7116 "%s arg must be None, unicode or str",
7117 STRIPNAME(striptype));
7118 return NULL;
7119 }
7120 }
7121
7122 return do_strip(self, striptype);
7123}
7124
7125
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007126PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007127"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007128\n\
7129Return a copy of the string S with leading and trailing\n\
7130whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007131If chars is given and not None, remove characters in chars instead.\n\
7132If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007133
7134static PyObject *
7135unicode_strip(PyUnicodeObject *self, PyObject *args)
7136{
7137 if (PyTuple_GET_SIZE(args) == 0)
7138 return do_strip(self, BOTHSTRIP); /* Common case */
7139 else
7140 return do_argstrip(self, BOTHSTRIP, args);
7141}
7142
7143
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007144PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007145"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007146\n\
7147Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007148If chars is given and not None, remove characters in chars instead.\n\
7149If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007150
7151static PyObject *
7152unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7153{
7154 if (PyTuple_GET_SIZE(args) == 0)
7155 return do_strip(self, LEFTSTRIP); /* Common case */
7156 else
7157 return do_argstrip(self, LEFTSTRIP, args);
7158}
7159
7160
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007161PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007162"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007163\n\
7164Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007165If chars is given and not None, remove characters in chars instead.\n\
7166If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007167
7168static PyObject *
7169unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7170{
7171 if (PyTuple_GET_SIZE(args) == 0)
7172 return do_strip(self, RIGHTSTRIP); /* Common case */
7173 else
7174 return do_argstrip(self, RIGHTSTRIP, args);
7175}
7176
7177
Guido van Rossumd57fd912000-03-10 22:53:23 +00007178static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007179unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007180{
7181 PyUnicodeObject *u;
7182 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007183 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007184 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007185
7186 if (len < 0)
7187 len = 0;
7188
Tim Peters7a29bd52001-09-12 03:03:31 +00007189 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007190 /* no repeat, return original string */
7191 Py_INCREF(str);
7192 return (PyObject*) str;
7193 }
Tim Peters8f422462000-09-09 06:13:41 +00007194
7195 /* ensure # of chars needed doesn't overflow int and # of bytes
7196 * needed doesn't overflow size_t
7197 */
7198 nchars = len * str->length;
7199 if (len && nchars / len != str->length) {
7200 PyErr_SetString(PyExc_OverflowError,
7201 "repeated string is too long");
7202 return NULL;
7203 }
7204 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7205 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7206 PyErr_SetString(PyExc_OverflowError,
7207 "repeated string is too long");
7208 return NULL;
7209 }
7210 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007211 if (!u)
7212 return NULL;
7213
7214 p = u->str;
7215
Thomas Wouters477c8d52006-05-27 19:21:47 +00007216 if (str->length == 1 && len > 0) {
7217 Py_UNICODE_FILL(p, str->str[0], len);
7218 } else {
7219 Py_ssize_t done = 0; /* number of characters copied this far */
7220 if (done < nchars) {
7221 Py_UNICODE_COPY(p, str->str, str->length);
7222 done = str->length;
7223 }
7224 while (done < nchars) {
7225 int n = (done <= nchars-done) ? done : nchars-done;
7226 Py_UNICODE_COPY(p+done, p, n);
7227 done += n;
7228 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007229 }
7230
7231 return (PyObject*) u;
7232}
7233
7234PyObject *PyUnicode_Replace(PyObject *obj,
7235 PyObject *subobj,
7236 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007237 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007238{
7239 PyObject *self;
7240 PyObject *str1;
7241 PyObject *str2;
7242 PyObject *result;
7243
7244 self = PyUnicode_FromObject(obj);
7245 if (self == NULL)
7246 return NULL;
7247 str1 = PyUnicode_FromObject(subobj);
7248 if (str1 == NULL) {
7249 Py_DECREF(self);
7250 return NULL;
7251 }
7252 str2 = PyUnicode_FromObject(replobj);
7253 if (str2 == NULL) {
7254 Py_DECREF(self);
7255 Py_DECREF(str1);
7256 return NULL;
7257 }
Tim Petersced69f82003-09-16 20:30:58 +00007258 result = replace((PyUnicodeObject *)self,
7259 (PyUnicodeObject *)str1,
7260 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007261 maxcount);
7262 Py_DECREF(self);
7263 Py_DECREF(str1);
7264 Py_DECREF(str2);
7265 return result;
7266}
7267
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007268PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007269"S.replace (old, new[, maxsplit]) -> unicode\n\
7270\n\
7271Return a copy of S with all occurrences of substring\n\
7272old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007273given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007274
7275static PyObject*
7276unicode_replace(PyUnicodeObject *self, PyObject *args)
7277{
7278 PyUnicodeObject *str1;
7279 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007280 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007281 PyObject *result;
7282
Martin v. Löwis18e16552006-02-15 17:27:45 +00007283 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007284 return NULL;
7285 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7286 if (str1 == NULL)
7287 return NULL;
7288 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007289 if (str2 == NULL) {
7290 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007291 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007292 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007293
7294 result = replace(self, str1, str2, maxcount);
7295
7296 Py_DECREF(str1);
7297 Py_DECREF(str2);
7298 return result;
7299}
7300
7301static
7302PyObject *unicode_repr(PyObject *unicode)
7303{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007304 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007305 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007306 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7307 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7308
7309 /* XXX(nnorwitz): rather than over-allocating, it would be
7310 better to choose a different scheme. Perhaps scan the
7311 first N-chars of the string and allocate based on that size.
7312 */
7313 /* Initial allocation is based on the longest-possible unichr
7314 escape.
7315
7316 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7317 unichr, so in this case it's the longest unichr escape. In
7318 narrow (UTF-16) builds this is five chars per source unichr
7319 since there are two unichrs in the surrogate pair, so in narrow
7320 (UTF-16) builds it's not the longest unichr escape.
7321
7322 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7323 so in the narrow (UTF-16) build case it's the longest unichr
7324 escape.
7325 */
7326
Walter Dörwald1ab83302007-05-18 17:15:44 +00007327 repr = PyUnicode_FromUnicode(NULL,
Walter Dörwald79e913e2007-05-12 11:08:06 +00007328 2 /* quotes */
7329#ifdef Py_UNICODE_WIDE
7330 + 10*size
7331#else
7332 + 6*size
7333#endif
7334 + 1);
7335 if (repr == NULL)
7336 return NULL;
7337
Walter Dörwald1ab83302007-05-18 17:15:44 +00007338 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007339
7340 /* Add quote */
7341 *p++ = (findchar(s, size, '\'') &&
7342 !findchar(s, size, '"')) ? '"' : '\'';
7343 while (size-- > 0) {
7344 Py_UNICODE ch = *s++;
7345
7346 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007347 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007348 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007349 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007350 continue;
7351 }
7352
7353#ifdef Py_UNICODE_WIDE
7354 /* Map 21-bit characters to '\U00xxxxxx' */
7355 else if (ch >= 0x10000) {
7356 *p++ = '\\';
7357 *p++ = 'U';
7358 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
7359 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
7360 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
7361 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
7362 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
7363 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
7364 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
7365 *p++ = hexdigits[ch & 0x0000000F];
7366 continue;
7367 }
7368#else
7369 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
7370 else if (ch >= 0xD800 && ch < 0xDC00) {
7371 Py_UNICODE ch2;
7372 Py_UCS4 ucs;
7373
7374 ch2 = *s++;
7375 size--;
7376 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
7377 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
7378 *p++ = '\\';
7379 *p++ = 'U';
7380 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7381 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7382 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7383 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7384 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7385 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7386 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7387 *p++ = hexdigits[ucs & 0x0000000F];
7388 continue;
7389 }
7390 /* Fall through: isolated surrogates are copied as-is */
7391 s--;
7392 size++;
7393 }
7394#endif
7395
7396 /* Map 16-bit characters to '\uxxxx' */
7397 if (ch >= 256) {
7398 *p++ = '\\';
7399 *p++ = 'u';
7400 *p++ = hexdigits[(ch >> 12) & 0x000F];
7401 *p++ = hexdigits[(ch >> 8) & 0x000F];
7402 *p++ = hexdigits[(ch >> 4) & 0x000F];
7403 *p++ = hexdigits[ch & 0x000F];
7404 }
7405
7406 /* Map special whitespace to '\t', \n', '\r' */
7407 else if (ch == '\t') {
7408 *p++ = '\\';
7409 *p++ = 't';
7410 }
7411 else if (ch == '\n') {
7412 *p++ = '\\';
7413 *p++ = 'n';
7414 }
7415 else if (ch == '\r') {
7416 *p++ = '\\';
7417 *p++ = 'r';
7418 }
7419
7420 /* Map non-printable US ASCII to '\xhh' */
7421 else if (ch < ' ' || ch >= 0x7F) {
7422 *p++ = '\\';
7423 *p++ = 'x';
7424 *p++ = hexdigits[(ch >> 4) & 0x000F];
7425 *p++ = hexdigits[ch & 0x000F];
7426 }
7427
7428 /* Copy everything else as-is */
7429 else
7430 *p++ = (char) ch;
7431 }
7432 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007433 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00007434
7435 *p = '\0';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007436 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00007437 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007438}
7439
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007440PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007441"S.rfind(sub [,start [,end]]) -> int\n\
7442\n\
7443Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007444such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007445arguments start and end are interpreted as in slice notation.\n\
7446\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007447Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007448
7449static PyObject *
7450unicode_rfind(PyUnicodeObject *self, PyObject *args)
7451{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007452 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007453 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007454 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007455 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007456
Guido van Rossumb8872e62000-05-09 14:14:27 +00007457 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
7458 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007459 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007460 substring = PyUnicode_FromObject(substring);
7461 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007462 return NULL;
7463
Thomas Wouters477c8d52006-05-27 19:21:47 +00007464 result = stringlib_rfind_slice(
7465 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7466 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7467 start, end
7468 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007469
7470 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007471
7472 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007473}
7474
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007475PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007476"S.rindex(sub [,start [,end]]) -> int\n\
7477\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007478Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007479
7480static PyObject *
7481unicode_rindex(PyUnicodeObject *self, PyObject *args)
7482{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007483 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007484 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007485 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007486 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007487
Guido van Rossumb8872e62000-05-09 14:14:27 +00007488 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
7489 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007490 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007491 substring = PyUnicode_FromObject(substring);
7492 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007493 return NULL;
7494
Thomas Wouters477c8d52006-05-27 19:21:47 +00007495 result = stringlib_rfind_slice(
7496 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7497 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7498 start, end
7499 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007500
7501 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007502
Guido van Rossumd57fd912000-03-10 22:53:23 +00007503 if (result < 0) {
7504 PyErr_SetString(PyExc_ValueError, "substring not found");
7505 return NULL;
7506 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007507 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007508}
7509
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007510PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007511"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007512\n\
7513Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007514done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007515
7516static PyObject *
7517unicode_rjust(PyUnicodeObject *self, PyObject *args)
7518{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007519 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007520 Py_UNICODE fillchar = ' ';
7521
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007522 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007523 return NULL;
7524
Tim Peters7a29bd52001-09-12 03:03:31 +00007525 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007526 Py_INCREF(self);
7527 return (PyObject*) self;
7528 }
7529
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007530 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007531}
7532
Guido van Rossumd57fd912000-03-10 22:53:23 +00007533static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007534unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007535{
7536 /* standard clamping */
7537 if (start < 0)
7538 start = 0;
7539 if (end < 0)
7540 end = 0;
7541 if (end > self->length)
7542 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007543 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007544 /* full slice, return original string */
7545 Py_INCREF(self);
7546 return (PyObject*) self;
7547 }
7548 if (start > end)
7549 start = end;
7550 /* copy slice */
7551 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7552 end - start);
7553}
7554
7555PyObject *PyUnicode_Split(PyObject *s,
7556 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007557 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007558{
7559 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007560
Guido van Rossumd57fd912000-03-10 22:53:23 +00007561 s = PyUnicode_FromObject(s);
7562 if (s == NULL)
7563 return NULL;
7564 if (sep != NULL) {
7565 sep = PyUnicode_FromObject(sep);
7566 if (sep == NULL) {
7567 Py_DECREF(s);
7568 return NULL;
7569 }
7570 }
7571
7572 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7573
7574 Py_DECREF(s);
7575 Py_XDECREF(sep);
7576 return result;
7577}
7578
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007579PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007580"S.split([sep [,maxsplit]]) -> list of strings\n\
7581\n\
7582Return a list of the words in S, using sep as the\n\
7583delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00007584splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00007585any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007586
7587static PyObject*
7588unicode_split(PyUnicodeObject *self, PyObject *args)
7589{
7590 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007591 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007592
Martin v. Löwis18e16552006-02-15 17:27:45 +00007593 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007594 return NULL;
7595
7596 if (substring == Py_None)
7597 return split(self, NULL, maxcount);
7598 else if (PyUnicode_Check(substring))
7599 return split(self, (PyUnicodeObject *)substring, maxcount);
7600 else
7601 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7602}
7603
Thomas Wouters477c8d52006-05-27 19:21:47 +00007604PyObject *
7605PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7606{
7607 PyObject* str_obj;
7608 PyObject* sep_obj;
7609 PyObject* out;
7610
7611 str_obj = PyUnicode_FromObject(str_in);
7612 if (!str_obj)
7613 return NULL;
7614 sep_obj = PyUnicode_FromObject(sep_in);
7615 if (!sep_obj) {
7616 Py_DECREF(str_obj);
7617 return NULL;
7618 }
7619
7620 out = stringlib_partition(
7621 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7622 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7623 );
7624
7625 Py_DECREF(sep_obj);
7626 Py_DECREF(str_obj);
7627
7628 return out;
7629}
7630
7631
7632PyObject *
7633PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7634{
7635 PyObject* str_obj;
7636 PyObject* sep_obj;
7637 PyObject* out;
7638
7639 str_obj = PyUnicode_FromObject(str_in);
7640 if (!str_obj)
7641 return NULL;
7642 sep_obj = PyUnicode_FromObject(sep_in);
7643 if (!sep_obj) {
7644 Py_DECREF(str_obj);
7645 return NULL;
7646 }
7647
7648 out = stringlib_rpartition(
7649 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7650 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7651 );
7652
7653 Py_DECREF(sep_obj);
7654 Py_DECREF(str_obj);
7655
7656 return out;
7657}
7658
7659PyDoc_STRVAR(partition__doc__,
7660"S.partition(sep) -> (head, sep, tail)\n\
7661\n\
7662Searches for the separator sep in S, and returns the part before it,\n\
7663the separator itself, and the part after it. If the separator is not\n\
7664found, returns S and two empty strings.");
7665
7666static PyObject*
7667unicode_partition(PyUnicodeObject *self, PyObject *separator)
7668{
7669 return PyUnicode_Partition((PyObject *)self, separator);
7670}
7671
7672PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007673"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007674\n\
7675Searches for the separator sep in S, starting at the end of S, and returns\n\
7676the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00007677separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007678
7679static PyObject*
7680unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7681{
7682 return PyUnicode_RPartition((PyObject *)self, separator);
7683}
7684
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007685PyObject *PyUnicode_RSplit(PyObject *s,
7686 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007687 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007688{
7689 PyObject *result;
7690
7691 s = PyUnicode_FromObject(s);
7692 if (s == NULL)
7693 return NULL;
7694 if (sep != NULL) {
7695 sep = PyUnicode_FromObject(sep);
7696 if (sep == NULL) {
7697 Py_DECREF(s);
7698 return NULL;
7699 }
7700 }
7701
7702 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7703
7704 Py_DECREF(s);
7705 Py_XDECREF(sep);
7706 return result;
7707}
7708
7709PyDoc_STRVAR(rsplit__doc__,
7710"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7711\n\
7712Return a list of the words in S, using sep as the\n\
7713delimiter string, starting at the end of the string and\n\
7714working to the front. If maxsplit is given, at most maxsplit\n\
7715splits are done. If sep is not specified, any whitespace string\n\
7716is a separator.");
7717
7718static PyObject*
7719unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7720{
7721 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007722 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007723
Martin v. Löwis18e16552006-02-15 17:27:45 +00007724 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007725 return NULL;
7726
7727 if (substring == Py_None)
7728 return rsplit(self, NULL, maxcount);
7729 else if (PyUnicode_Check(substring))
7730 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7731 else
7732 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7733}
7734
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007735PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007736"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007737\n\
7738Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007739Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007740is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007741
7742static PyObject*
7743unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7744{
Guido van Rossum86662912000-04-11 15:38:46 +00007745 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007746
Guido van Rossum86662912000-04-11 15:38:46 +00007747 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007748 return NULL;
7749
Guido van Rossum86662912000-04-11 15:38:46 +00007750 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007751}
7752
7753static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007754PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007755{
Walter Dörwald346737f2007-05-31 10:44:43 +00007756 if (PyUnicode_CheckExact(self)) {
7757 Py_INCREF(self);
7758 return self;
7759 } else
7760 /* Subtype -- return genuine unicode string with the same value. */
7761 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
7762 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007763}
7764
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007765PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007766"S.swapcase() -> unicode\n\
7767\n\
7768Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007769and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007770
7771static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007772unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007773{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007774 return fixup(self, fixswapcase);
7775}
7776
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007777PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007778"S.translate(table) -> unicode\n\
7779\n\
7780Return a copy of the string S, where all characters have been mapped\n\
7781through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007782Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7783Unmapped characters are left untouched. Characters mapped to None\n\
7784are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007785
7786static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007787unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007788{
Tim Petersced69f82003-09-16 20:30:58 +00007789 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007790 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00007791 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007792 "ignore");
7793}
7794
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007795PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007796"S.upper() -> unicode\n\
7797\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007798Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007799
7800static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007801unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007802{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007803 return fixup(self, fixupper);
7804}
7805
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007806PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007807"S.zfill(width) -> unicode\n\
7808\n\
7809Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007810of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007811
7812static PyObject *
7813unicode_zfill(PyUnicodeObject *self, PyObject *args)
7814{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007815 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007816 PyUnicodeObject *u;
7817
Martin v. Löwis18e16552006-02-15 17:27:45 +00007818 Py_ssize_t width;
7819 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007820 return NULL;
7821
7822 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007823 if (PyUnicode_CheckExact(self)) {
7824 Py_INCREF(self);
7825 return (PyObject*) self;
7826 }
7827 else
7828 return PyUnicode_FromUnicode(
7829 PyUnicode_AS_UNICODE(self),
7830 PyUnicode_GET_SIZE(self)
7831 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007832 }
7833
7834 fill = width - self->length;
7835
7836 u = pad(self, fill, 0, '0');
7837
Walter Dörwald068325e2002-04-15 13:36:47 +00007838 if (u == NULL)
7839 return NULL;
7840
Guido van Rossumd57fd912000-03-10 22:53:23 +00007841 if (u->str[fill] == '+' || u->str[fill] == '-') {
7842 /* move sign to beginning of string */
7843 u->str[0] = u->str[fill];
7844 u->str[fill] = '0';
7845 }
7846
7847 return (PyObject*) u;
7848}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007849
#if 0
/* Debugging aid, compiled out: returns the value of unicode_freelist_size
   as a Python int. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    return PyInt_FromLong(unicode_freelist_size);
}
#endif
7857
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007858PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007859"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007860\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007861Return True if S starts with the specified prefix, False otherwise.\n\
7862With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007863With optional end, stop comparing S at that position.\n\
7864prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007865
7866static PyObject *
7867unicode_startswith(PyUnicodeObject *self,
7868 PyObject *args)
7869{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007870 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007871 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007872 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007873 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007874 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007875
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007876 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007877 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007878 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007879 if (PyTuple_Check(subobj)) {
7880 Py_ssize_t i;
7881 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7882 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7883 PyTuple_GET_ITEM(subobj, i));
7884 if (substring == NULL)
7885 return NULL;
7886 result = tailmatch(self, substring, start, end, -1);
7887 Py_DECREF(substring);
7888 if (result) {
7889 Py_RETURN_TRUE;
7890 }
7891 }
7892 /* nothing matched */
7893 Py_RETURN_FALSE;
7894 }
7895 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007896 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007897 return NULL;
7898 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007899 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007900 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007901}
7902
7903
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007904PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007905"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007906\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007907Return True if S ends with the specified suffix, False otherwise.\n\
7908With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007909With optional end, stop comparing S at that position.\n\
7910suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007911
7912static PyObject *
7913unicode_endswith(PyUnicodeObject *self,
7914 PyObject *args)
7915{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007916 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007917 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007918 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007919 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007920 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007921
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007922 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7923 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007924 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007925 if (PyTuple_Check(subobj)) {
7926 Py_ssize_t i;
7927 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7928 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7929 PyTuple_GET_ITEM(subobj, i));
7930 if (substring == NULL)
7931 return NULL;
7932 result = tailmatch(self, substring, start, end, +1);
7933 Py_DECREF(substring);
7934 if (result) {
7935 Py_RETURN_TRUE;
7936 }
7937 }
7938 Py_RETURN_FALSE;
7939 }
7940 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007941 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007942 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007943
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007944 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007945 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007946 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007947}
7948
Eric Smith8c663262007-08-25 02:26:07 +00007949#include "stringlib/string_format.h"
7950
7951PyDoc_STRVAR(format__doc__,
7952"S.format(*args, **kwargs) -> unicode\n\
7953\n\
7954");
7955
7956static PyObject *
7957unicode_format(PyObject *self, PyObject *args, PyObject *kwds)
7958{
7959 /* this calls into stringlib/string_format.h because it can be
7960 included for either string or unicode. this is needed for
7961 python 2.6. */
7962 return do_string_format(self, args, kwds);
7963}
7964
7965
7966PyDoc_STRVAR(p_format__doc__,
7967"S.__format__(format_spec) -> unicode\n\
7968\n\
7969");
7970
7971static PyObject *
7972unicode__format__(PyObject *self, PyObject *args)
7973{
7974 return unicode_unicode__format__(self, args);
7975}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007976
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007977
7978static PyObject *
7979unicode_getnewargs(PyUnicodeObject *v)
7980{
7981 return Py_BuildValue("(u#)", v->str, v->length);
7982}
7983
7984
Guido van Rossumd57fd912000-03-10 22:53:23 +00007985static PyMethodDef unicode_methods[] = {
7986
7987 /* Order is according to common usage: often used methods should
7988 appear first, since lookup is done sequentially. */
7989
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007990 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
7991 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7992 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007993 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007994 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7995 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7996 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7997 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7998 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7999 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8000 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008001 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008002 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8003 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8004 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008005 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00008006 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008007/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
8008 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8009 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8010 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008011 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008012 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008013 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008014 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008015 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8016 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8017 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8018 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8019 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8020 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8021 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8022 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8023 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8024 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8025 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8026 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8027 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8028 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008029 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008030 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith8c663262007-08-25 02:26:07 +00008031 {"format", (PyCFunction) unicode_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
8032 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008033 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8034 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Walter Dörwald068325e2002-04-15 13:36:47 +00008035#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008036 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008037#endif
8038
8039#if 0
8040 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008041 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008042#endif
8043
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008044 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008045 {NULL, NULL}
8046};
8047
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008048static PyObject *
8049unicode_mod(PyObject *v, PyObject *w)
8050{
8051 if (!PyUnicode_Check(v)) {
8052 Py_INCREF(Py_NotImplemented);
8053 return Py_NotImplemented;
8054 }
8055 return PyUnicode_Format(v, w);
8056}
8057
8058static PyNumberMethods unicode_as_number = {
8059 0, /*nb_add*/
8060 0, /*nb_subtract*/
8061 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008062 unicode_mod, /*nb_remainder*/
8063};
8064
Guido van Rossumd57fd912000-03-10 22:53:23 +00008065static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008066 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008067 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008068 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8069 (ssizeargfunc) unicode_getitem, /* sq_item */
8070 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008071 0, /* sq_ass_item */
8072 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008073 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008074};
8075
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008076static PyObject*
8077unicode_subscript(PyUnicodeObject* self, PyObject* item)
8078{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008079 if (PyIndex_Check(item)) {
8080 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008081 if (i == -1 && PyErr_Occurred())
8082 return NULL;
8083 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008084 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008085 return unicode_getitem(self, i);
8086 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008087 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008088 Py_UNICODE* source_buf;
8089 Py_UNICODE* result_buf;
8090 PyObject* result;
8091
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008092 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008093 &start, &stop, &step, &slicelength) < 0) {
8094 return NULL;
8095 }
8096
8097 if (slicelength <= 0) {
8098 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008099 } else if (start == 0 && step == 1 && slicelength == self->length &&
8100 PyUnicode_CheckExact(self)) {
8101 Py_INCREF(self);
8102 return (PyObject *)self;
8103 } else if (step == 1) {
8104 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008105 } else {
8106 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008107 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
8108 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008109
8110 if (result_buf == NULL)
8111 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008112
8113 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8114 result_buf[i] = source_buf[cur];
8115 }
Tim Petersced69f82003-09-16 20:30:58 +00008116
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008117 result = PyUnicode_FromUnicode(result_buf, slicelength);
8118 PyMem_FREE(result_buf);
8119 return result;
8120 }
8121 } else {
8122 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8123 return NULL;
8124 }
8125}
8126
8127static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008128 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008129 (binaryfunc)unicode_subscript, /* mp_subscript */
8130 (objobjargproc)0, /* mp_ass_subscript */
8131};
8132
Guido van Rossumd57fd912000-03-10 22:53:23 +00008133
8134static int
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00008135unicode_buffer_getbuffer(PyUnicodeObject *self, PyBuffer *view, int flags)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008136{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008137
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00008138 if (flags & PyBUF_CHARACTER) {
8139 PyObject *str;
8140
8141 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
8142 if (str == NULL) return -1;
8143 return PyBuffer_FillInfo(view, (void *)PyString_AS_STRING(str),
8144 PyString_GET_SIZE(str), 1, flags);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008145 }
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00008146 else {
8147 return PyBuffer_FillInfo(view, (void *)self->str,
8148 PyUnicode_GET_DATA_SIZE(self), 1, flags);
8149 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008150}
8151
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00008152
Guido van Rossumd57fd912000-03-10 22:53:23 +00008153/* Helpers for PyUnicode_Format() */
8154
8155static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008156getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008157{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008158 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008159 if (argidx < arglen) {
8160 (*p_argidx)++;
8161 if (arglen < 0)
8162 return args;
8163 else
8164 return PyTuple_GetItem(args, argidx);
8165 }
8166 PyErr_SetString(PyExc_TypeError,
8167 "not enough arguments for format string");
8168 return NULL;
8169}
8170
/* Bit flags describing a conversion specification.  F_ALT selects the
   '#' alternate form (see formatfloat/formatint); the remaining names
   presumably correspond to the '-', '+', ' ' and '0' printf-style flags
   -- confirm against the parser in PyUnicode_Format(). */
#define F_LJUST (1<<0)
#define F_SIGN  (1<<1)
#define F_BLANK (1<<2)
#define F_ALT   (1<<3)
#define F_ZERO  (1<<4)
8176
Martin v. Löwis18e16552006-02-15 17:27:45 +00008177static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008178strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008179{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008180 register Py_ssize_t i;
8181 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008182 for (i = len - 1; i >= 0; i--)
8183 buffer[i] = (Py_UNICODE) charbuffer[i];
8184
Guido van Rossumd57fd912000-03-10 22:53:23 +00008185 return len;
8186}
8187
Neal Norwitzfc76d632006-01-10 06:03:13 +00008188static int
8189doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8190{
Tim Peters15231542006-02-16 01:08:01 +00008191 Py_ssize_t result;
8192
Neal Norwitzfc76d632006-01-10 06:03:13 +00008193 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008194 result = strtounicode(buffer, (char *)buffer);
8195 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008196}
8197
8198static int
8199longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8200{
Tim Peters15231542006-02-16 01:08:01 +00008201 Py_ssize_t result;
8202
Neal Norwitzfc76d632006-01-10 06:03:13 +00008203 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008204 result = strtounicode(buffer, (char *)buffer);
8205 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008206}
8207
Guido van Rossum078151d2002-08-11 04:24:12 +00008208/* XXX To save some code duplication, formatfloat/long/int could have been
8209 shared with stringobject.c, converting from 8-bit to Unicode after the
8210 formatting is done. */
8211
Guido van Rossumd57fd912000-03-10 22:53:23 +00008212static int
8213formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008214 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008215 int flags,
8216 int prec,
8217 int type,
8218 PyObject *v)
8219{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008220 /* fmt = '%#.' + `prec` + `type`
8221 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00008222 char fmt[20];
8223 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008224
Guido van Rossumd57fd912000-03-10 22:53:23 +00008225 x = PyFloat_AsDouble(v);
8226 if (x == -1.0 && PyErr_Occurred())
8227 return -1;
8228 if (prec < 0)
8229 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008230 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8231 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008232 /* Worst case length calc to ensure no buffer overrun:
8233
8234 'g' formats:
8235 fmt = %#.<prec>g
8236 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8237 for any double rep.)
8238 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8239
8240 'f' formats:
8241 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8242 len = 1 + 50 + 1 + prec = 52 + prec
8243
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008244 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008245 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008246
8247 */
Guido van Rossumb5a755e2007-07-18 18:15:48 +00008248 if (((type == 'g' || type == 'G') &&
8249 buflen <= (size_t)10 + (size_t)prec) ||
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008250 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008251 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008252 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008253 return -1;
8254 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008255 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8256 (flags&F_ALT) ? "#" : "",
8257 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008258 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008259}
8260
Tim Peters38fd5b62000-09-21 05:43:11 +00008261static PyObject*
8262formatlong(PyObject *val, int flags, int prec, int type)
8263{
8264 char *buf;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008265 int len;
Tim Peters38fd5b62000-09-21 05:43:11 +00008266 PyObject *str; /* temporary string object. */
Walter Dörwald63a28be2007-06-20 15:11:12 +00008267 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008268
8269 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8270 if (!str)
8271 return NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008272 result = PyUnicode_FromStringAndSize(buf, len);
Tim Peters38fd5b62000-09-21 05:43:11 +00008273 Py_DECREF(str);
Walter Dörwald63a28be2007-06-20 15:11:12 +00008274 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008275}
8276
Guido van Rossumd57fd912000-03-10 22:53:23 +00008277static int
8278formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008279 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008280 int flags,
8281 int prec,
8282 int type,
8283 PyObject *v)
8284{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008285 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008286 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8287 * + 1 + 1
8288 * = 24
8289 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008290 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008291 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008292 long x;
8293
8294 x = PyInt_AsLong(v);
8295 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008296 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008297 if (x < 0 && type == 'u') {
8298 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008299 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008300 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8301 sign = "-";
8302 else
8303 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008304 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008305 prec = 1;
8306
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008307 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8308 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008309 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008310 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008311 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008312 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008313 return -1;
8314 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008315
8316 if ((flags & F_ALT) &&
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008317 (type == 'x' || type == 'X' || type == 'o')) {
8318 /* When converting under %#o, %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008319 * of issues that cause pain:
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008320 * - for %#o, we want a different base marker than C
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008321 * - when 0 is being converted, the C standard leaves off
8322 * the '0x' or '0X', which is inconsistent with other
8323 * %#x/%#X conversions and inconsistent with Python's
8324 * hex() function
8325 * - there are platforms that violate the standard and
8326 * convert 0 with the '0x' or '0X'
8327 * (Metrowerks, Compaq Tru64)
8328 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008329 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008330 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008331 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008332 * We can achieve the desired consistency by inserting our
8333 * own '0x' or '0X' prefix, and substituting %x/%X in place
8334 * of %#x/%#X.
8335 *
8336 * Note that this is the same approach as used in
8337 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008338 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008339 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8340 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008341 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008342 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008343 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8344 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008345 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008346 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008347 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008348 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008349 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008350 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008351}
8352
8353static int
8354formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008355 size_t buflen,
8356 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008357{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008358 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008359 if (PyUnicode_Check(v)) {
8360 if (PyUnicode_GET_SIZE(v) != 1)
8361 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008362 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008363 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008364
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008365 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00008366 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008367 goto onError;
8368 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
8369 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008370
8371 else {
8372 /* Integer input truncated to a character */
8373 long x;
8374 x = PyInt_AsLong(v);
8375 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008376 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008377#ifdef Py_UNICODE_WIDE
8378 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008379 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008380 "%c arg not in range(0x110000) "
8381 "(wide Python build)");
8382 return -1;
8383 }
8384#else
8385 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008386 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008387 "%c arg not in range(0x10000) "
8388 "(narrow Python build)");
8389 return -1;
8390 }
8391#endif
8392 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008393 }
8394 buf[1] = '\0';
8395 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008396
8397 onError:
8398 PyErr_SetString(PyExc_TypeError,
8399 "%c requires int or char");
8400 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008401}
8402
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the size of the scratch buffer in which floats, ints
   and chars are formatted.  XXX This is a magic number.  Every
   formatting routine bounds-checks against it so that no overflow can
   happen, but allocating a buffer of the right size for each format
   might be a better solution.  For now, the current approach is
   sufficient.
*/
#define FORMATBUFLEN (size_t)120
8412
Guido van Rossumd57fd912000-03-10 22:53:23 +00008413PyObject *PyUnicode_Format(PyObject *format,
8414 PyObject *args)
8415{
8416 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008417 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008418 int args_owned = 0;
8419 PyUnicodeObject *result = NULL;
8420 PyObject *dict = NULL;
8421 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008422
Guido van Rossumd57fd912000-03-10 22:53:23 +00008423 if (format == NULL || args == NULL) {
8424 PyErr_BadInternalCall();
8425 return NULL;
8426 }
8427 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008428 if (uformat == NULL)
8429 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008430 fmt = PyUnicode_AS_UNICODE(uformat);
8431 fmtcnt = PyUnicode_GET_SIZE(uformat);
8432
8433 reslen = rescnt = fmtcnt + 100;
8434 result = _PyUnicode_New(reslen);
8435 if (result == NULL)
8436 goto onError;
8437 res = PyUnicode_AS_UNICODE(result);
8438
8439 if (PyTuple_Check(args)) {
8440 arglen = PyTuple_Size(args);
8441 argidx = 0;
8442 }
8443 else {
8444 arglen = -1;
8445 argidx = -2;
8446 }
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00008447 if (Py_Type(args)->tp_as_mapping && !PyTuple_Check(args) &&
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008448 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008449 dict = args;
8450
8451 while (--fmtcnt >= 0) {
8452 if (*fmt != '%') {
8453 if (--rescnt < 0) {
8454 rescnt = fmtcnt + 100;
8455 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008456 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008457 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008458 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8459 --rescnt;
8460 }
8461 *res++ = *fmt++;
8462 }
8463 else {
8464 /* Got a format specifier */
8465 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008466 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008467 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008468 Py_UNICODE c = '\0';
8469 Py_UNICODE fill;
8470 PyObject *v = NULL;
8471 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008472 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008473 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008474 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008475 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008476
8477 fmt++;
8478 if (*fmt == '(') {
8479 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008480 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008481 PyObject *key;
8482 int pcount = 1;
8483
8484 if (dict == NULL) {
8485 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008486 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008487 goto onError;
8488 }
8489 ++fmt;
8490 --fmtcnt;
8491 keystart = fmt;
8492 /* Skip over balanced parentheses */
8493 while (pcount > 0 && --fmtcnt >= 0) {
8494 if (*fmt == ')')
8495 --pcount;
8496 else if (*fmt == '(')
8497 ++pcount;
8498 fmt++;
8499 }
8500 keylen = fmt - keystart - 1;
8501 if (fmtcnt < 0 || pcount > 0) {
8502 PyErr_SetString(PyExc_ValueError,
8503 "incomplete format key");
8504 goto onError;
8505 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008506#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008507 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008508 then looked up since Python uses strings to hold
8509 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008510 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008511 key = PyUnicode_EncodeUTF8(keystart,
8512 keylen,
8513 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008514#else
8515 key = PyUnicode_FromUnicode(keystart, keylen);
8516#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008517 if (key == NULL)
8518 goto onError;
8519 if (args_owned) {
8520 Py_DECREF(args);
8521 args_owned = 0;
8522 }
8523 args = PyObject_GetItem(dict, key);
8524 Py_DECREF(key);
8525 if (args == NULL) {
8526 goto onError;
8527 }
8528 args_owned = 1;
8529 arglen = -1;
8530 argidx = -2;
8531 }
8532 while (--fmtcnt >= 0) {
8533 switch (c = *fmt++) {
8534 case '-': flags |= F_LJUST; continue;
8535 case '+': flags |= F_SIGN; continue;
8536 case ' ': flags |= F_BLANK; continue;
8537 case '#': flags |= F_ALT; continue;
8538 case '0': flags |= F_ZERO; continue;
8539 }
8540 break;
8541 }
8542 if (c == '*') {
8543 v = getnextarg(args, arglen, &argidx);
8544 if (v == NULL)
8545 goto onError;
8546 if (!PyInt_Check(v)) {
8547 PyErr_SetString(PyExc_TypeError,
8548 "* wants int");
8549 goto onError;
8550 }
8551 width = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008552 if (width == -1 && PyErr_Occurred())
8553 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008554 if (width < 0) {
8555 flags |= F_LJUST;
8556 width = -width;
8557 }
8558 if (--fmtcnt >= 0)
8559 c = *fmt++;
8560 }
8561 else if (c >= '0' && c <= '9') {
8562 width = c - '0';
8563 while (--fmtcnt >= 0) {
8564 c = *fmt++;
8565 if (c < '0' || c > '9')
8566 break;
8567 if ((width*10) / 10 != width) {
8568 PyErr_SetString(PyExc_ValueError,
8569 "width too big");
8570 goto onError;
8571 }
8572 width = width*10 + (c - '0');
8573 }
8574 }
8575 if (c == '.') {
8576 prec = 0;
8577 if (--fmtcnt >= 0)
8578 c = *fmt++;
8579 if (c == '*') {
8580 v = getnextarg(args, arglen, &argidx);
8581 if (v == NULL)
8582 goto onError;
8583 if (!PyInt_Check(v)) {
8584 PyErr_SetString(PyExc_TypeError,
8585 "* wants int");
8586 goto onError;
8587 }
8588 prec = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008589 if (prec == -1 && PyErr_Occurred())
8590 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008591 if (prec < 0)
8592 prec = 0;
8593 if (--fmtcnt >= 0)
8594 c = *fmt++;
8595 }
8596 else if (c >= '0' && c <= '9') {
8597 prec = c - '0';
8598 while (--fmtcnt >= 0) {
8599 c = Py_CHARMASK(*fmt++);
8600 if (c < '0' || c > '9')
8601 break;
8602 if ((prec*10) / 10 != prec) {
8603 PyErr_SetString(PyExc_ValueError,
8604 "prec too big");
8605 goto onError;
8606 }
8607 prec = prec*10 + (c - '0');
8608 }
8609 }
8610 } /* prec */
8611 if (fmtcnt >= 0) {
8612 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008613 if (--fmtcnt >= 0)
8614 c = *fmt++;
8615 }
8616 }
8617 if (fmtcnt < 0) {
8618 PyErr_SetString(PyExc_ValueError,
8619 "incomplete format");
8620 goto onError;
8621 }
8622 if (c != '%') {
8623 v = getnextarg(args, arglen, &argidx);
8624 if (v == NULL)
8625 goto onError;
8626 }
8627 sign = 0;
8628 fill = ' ';
8629 switch (c) {
8630
8631 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008632 pbuf = formatbuf;
8633 /* presume that buffer length is at least 1 */
8634 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008635 len = 1;
8636 break;
8637
8638 case 's':
8639 case 'r':
8640 if (PyUnicode_Check(v) && c == 's') {
8641 temp = v;
8642 Py_INCREF(temp);
8643 }
8644 else {
8645 PyObject *unicode;
8646 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008647 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008648 else
8649 temp = PyObject_Repr(v);
8650 if (temp == NULL)
8651 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008652 if (PyUnicode_Check(temp))
8653 /* nothing to do */;
8654 else if (PyString_Check(temp)) {
8655 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00008656 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00008657 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00008658 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008659 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008660 Py_DECREF(temp);
8661 temp = unicode;
8662 if (temp == NULL)
8663 goto onError;
8664 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008665 else {
8666 Py_DECREF(temp);
8667 PyErr_SetString(PyExc_TypeError,
8668 "%s argument has non-string str()");
8669 goto onError;
8670 }
8671 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008672 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008673 len = PyUnicode_GET_SIZE(temp);
8674 if (prec >= 0 && len > prec)
8675 len = prec;
8676 break;
8677
8678 case 'i':
8679 case 'd':
8680 case 'u':
8681 case 'o':
8682 case 'x':
8683 case 'X':
8684 if (c == 'i')
8685 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00008686 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008687 temp = formatlong(v, flags, prec, c);
8688 if (!temp)
8689 goto onError;
8690 pbuf = PyUnicode_AS_UNICODE(temp);
8691 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00008692 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008693 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008694 else {
8695 pbuf = formatbuf;
8696 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8697 flags, prec, c, v);
8698 if (len < 0)
8699 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008700 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008701 }
8702 if (flags & F_ZERO)
8703 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008704 break;
8705
8706 case 'e':
8707 case 'E':
8708 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008709 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008710 case 'g':
8711 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008712 if (c == 'F')
8713 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008714 pbuf = formatbuf;
8715 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8716 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008717 if (len < 0)
8718 goto onError;
8719 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008720 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008721 fill = '0';
8722 break;
8723
8724 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008725 pbuf = formatbuf;
8726 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008727 if (len < 0)
8728 goto onError;
8729 break;
8730
8731 default:
8732 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008733 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00008734 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008735 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008736 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00008737 (Py_ssize_t)(fmt - 1 -
8738 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008739 goto onError;
8740 }
8741 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008742 if (*pbuf == '-' || *pbuf == '+') {
8743 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008744 len--;
8745 }
8746 else if (flags & F_SIGN)
8747 sign = '+';
8748 else if (flags & F_BLANK)
8749 sign = ' ';
8750 else
8751 sign = 0;
8752 }
8753 if (width < len)
8754 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008755 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008756 reslen -= rescnt;
8757 rescnt = width + fmtcnt + 100;
8758 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008759 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008760 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008761 PyErr_NoMemory();
8762 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008763 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008764 if (_PyUnicode_Resize(&result, reslen) < 0) {
8765 Py_XDECREF(temp);
8766 goto onError;
8767 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008768 res = PyUnicode_AS_UNICODE(result)
8769 + reslen - rescnt;
8770 }
8771 if (sign) {
8772 if (fill != ' ')
8773 *res++ = sign;
8774 rescnt--;
8775 if (width > len)
8776 width--;
8777 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008778 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008779 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008780 assert(pbuf[1] == c);
8781 if (fill != ' ') {
8782 *res++ = *pbuf++;
8783 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008784 }
Tim Petersfff53252001-04-12 18:38:48 +00008785 rescnt -= 2;
8786 width -= 2;
8787 if (width < 0)
8788 width = 0;
8789 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008790 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008791 if (width > len && !(flags & F_LJUST)) {
8792 do {
8793 --rescnt;
8794 *res++ = fill;
8795 } while (--width > len);
8796 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008797 if (fill == ' ') {
8798 if (sign)
8799 *res++ = sign;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008800 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008801 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008802 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008803 *res++ = *pbuf++;
8804 *res++ = *pbuf++;
8805 }
8806 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008807 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008808 res += len;
8809 rescnt -= len;
8810 while (--width >= len) {
8811 --rescnt;
8812 *res++ = ' ';
8813 }
8814 if (dict && (argidx < arglen) && c != '%') {
8815 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008816 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008817 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008818 goto onError;
8819 }
8820 Py_XDECREF(temp);
8821 } /* '%' */
8822 } /* until end */
8823 if (argidx < arglen && !dict) {
8824 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008825 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008826 goto onError;
8827 }
8828
Thomas Woutersa96affe2006-03-12 00:29:36 +00008829 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8830 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008831 if (args_owned) {
8832 Py_DECREF(args);
8833 }
8834 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008835 return (PyObject *)result;
8836
8837 onError:
8838 Py_XDECREF(result);
8839 Py_DECREF(uformat);
8840 if (args_owned) {
8841 Py_DECREF(args);
8842 }
8843 return NULL;
8844}
8845
/* Buffer interface table for unicode objects: only the first slot is
   provided (unicode_buffer_getbuffer); the second is left NULL. */
static PyBufferProcs unicode_as_buffer = {
    (getbufferproc) unicode_buffer_getbuffer,
    NULL,
};
8850
Jeremy Hylton938ace62002-07-17 16:30:39 +00008851static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008852unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8853
Tim Peters6d6c1a32001-08-02 04:15:00 +00008854static PyObject *
8855unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8856{
8857 PyObject *x = NULL;
Guido van Rossum55b4a7b2007-07-11 09:28:11 +00008858 static char *kwlist[] = {"object", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008859 char *encoding = NULL;
8860 char *errors = NULL;
8861
Guido van Rossume023fe02001-08-30 03:12:59 +00008862 if (type != &PyUnicode_Type)
8863 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008864 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8865 kwlist, &x, &encoding, &errors))
8866 return NULL;
8867 if (x == NULL)
8868 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008869 if (encoding == NULL && errors == NULL)
8870 return PyObject_Unicode(x);
8871 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008872 return PyUnicode_FromEncodedObject(x, encoding, errors);
8873}
8874
Guido van Rossume023fe02001-08-30 03:12:59 +00008875static PyObject *
8876unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8877{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008878 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008879 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008880
8881 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8882 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8883 if (tmp == NULL)
8884 return NULL;
8885 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008886 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008887 if (pnew == NULL) {
8888 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008889 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008890 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008891 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
8892 if (pnew->str == NULL) {
8893 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008894 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008895 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008896 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008897 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008898 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8899 pnew->length = n;
8900 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008901 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008902 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008903}
8904
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008905PyDoc_STRVAR(unicode_doc,
Collin Winterd474ce82007-08-07 19:42:11 +00008906"str(string [, encoding[, errors]]) -> object\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00008907\n\
Collin Winterd474ce82007-08-07 19:42:11 +00008908Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008909encoding defaults to the current default string encoding.\n\
8910errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008911
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008912static PyObject *unicode_iter(PyObject *seq);
8913
/* Type object for the unicode string type (exposed under the name "str").
   Slots are listed in declaration order; a 0 entry leaves the slot
   unset. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                              /* tp_name */
    sizeof(PyUnicodeObject),            /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,        /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_compare */
    unicode_repr,                       /* tp_repr */
    &unicode_as_number,                 /* tp_as_number */
    &unicode_as_sequence,               /* tp_as_sequence */
    &unicode_as_mapping,                /* tp_as_mapping */
    (hashfunc) unicode_hash,            /* tp_hash*/
    0,                                  /* tp_call*/
    (reprfunc) unicode_str,             /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    &unicode_as_buffer,                 /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
        Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
    unicode_doc,                        /* tp_doc */
    0,                                  /* tp_traverse */
    0,                                  /* tp_clear */
    PyUnicode_RichCompare,              /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    unicode_iter,                       /* tp_iter */
    0,                                  /* tp_iternext */
    unicode_methods,                    /* tp_methods */
    0,                                  /* tp_members */
    0,                                  /* tp_getset */
    &PyBaseString_Type,                 /* tp_base */
    0,                                  /* tp_dict */
    0,                                  /* tp_descr_get */
    0,                                  /* tp_descr_set */
    0,                                  /* tp_dictoffset */
    0,                                  /* tp_init */
    0,                                  /* tp_alloc */
    unicode_new,                        /* tp_new */
    PyObject_Del,                       /* tp_free */
};
8957
8958/* Initialize the Unicode implementation */
8959
Thomas Wouters78890102000-07-22 19:25:51 +00008960void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008961{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008962 int i;
8963
Thomas Wouters477c8d52006-05-27 19:21:47 +00008964 /* XXX - move this array to unicodectype.c ? */
8965 Py_UNICODE linebreak[] = {
8966 0x000A, /* LINE FEED */
8967 0x000D, /* CARRIAGE RETURN */
8968 0x001C, /* FILE SEPARATOR */
8969 0x001D, /* GROUP SEPARATOR */
8970 0x001E, /* RECORD SEPARATOR */
8971 0x0085, /* NEXT LINE */
8972 0x2028, /* LINE SEPARATOR */
8973 0x2029, /* PARAGRAPH SEPARATOR */
8974 };
8975
Fred Drakee4315f52000-05-09 19:53:39 +00008976 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008977 unicode_freelist = NULL;
8978 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008979 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008980 if (!unicode_empty)
8981 return;
8982
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008983 for (i = 0; i < 256; i++)
8984 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008985 if (PyType_Ready(&PyUnicode_Type) < 0)
8986 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008987
8988 /* initialize the linebreak bloom filter */
8989 bloom_linebreak = make_bloom_mask(
8990 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8991 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008992
8993 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008994}
8995
8996/* Finalize the Unicode implementation */
8997
8998void
Thomas Wouters78890102000-07-22 19:25:51 +00008999_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009000{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009001 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009002 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009003
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009004 Py_XDECREF(unicode_empty);
9005 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009006
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009007 for (i = 0; i < 256; i++) {
9008 if (unicode_latin1[i]) {
9009 Py_DECREF(unicode_latin1[i]);
9010 unicode_latin1[i] = NULL;
9011 }
9012 }
9013
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009014 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009015 PyUnicodeObject *v = u;
9016 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00009017 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00009018 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00009019 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00009020 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009021 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009022 unicode_freelist = NULL;
9023 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009024}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009025
Walter Dörwald16807132007-05-25 13:52:07 +00009026void
9027PyUnicode_InternInPlace(PyObject **p)
9028{
9029 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9030 PyObject *t;
9031 if (s == NULL || !PyUnicode_Check(s))
9032 Py_FatalError(
9033 "PyUnicode_InternInPlace: unicode strings only please!");
9034 /* If it's a subclass, we don't really know what putting
9035 it in the interned dict might do. */
9036 if (!PyUnicode_CheckExact(s))
9037 return;
9038 if (PyUnicode_CHECK_INTERNED(s))
9039 return;
9040 if (interned == NULL) {
9041 interned = PyDict_New();
9042 if (interned == NULL) {
9043 PyErr_Clear(); /* Don't leave an exception */
9044 return;
9045 }
9046 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009047 /* It might be that the GetItem call fails even
9048 though the key is present in the dictionary,
9049 namely when this happens during a stack overflow. */
9050 Py_ALLOW_RECURSION
Walter Dörwald16807132007-05-25 13:52:07 +00009051 t = PyDict_GetItem(interned, (PyObject *)s);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009052 Py_END_ALLOW_RECURSION
9053
Walter Dörwald16807132007-05-25 13:52:07 +00009054 if (t) {
9055 Py_INCREF(t);
9056 Py_DECREF(*p);
9057 *p = t;
9058 return;
9059 }
9060
Martin v. Löwis5b222132007-06-10 09:51:05 +00009061 PyThreadState_GET()->recursion_critical = 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009062 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9063 PyErr_Clear();
Martin v. Löwis5b222132007-06-10 09:51:05 +00009064 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009065 return;
9066 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009067 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009068 /* The two references in interned are not counted by refcnt.
9069 The deallocator will take care of this */
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00009070 Py_Refcnt(s) -= 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009071 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
9072}
9073
9074void
9075PyUnicode_InternImmortal(PyObject **p)
9076{
9077 PyUnicode_InternInPlace(p);
9078 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9079 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9080 Py_INCREF(*p);
9081 }
9082}
9083
9084PyObject *
9085PyUnicode_InternFromString(const char *cp)
9086{
9087 PyObject *s = PyUnicode_FromString(cp);
9088 if (s == NULL)
9089 return NULL;
9090 PyUnicode_InternInPlace(&s);
9091 return s;
9092}
9093
9094void _Py_ReleaseInternedUnicodeStrings(void)
9095{
9096 PyObject *keys;
9097 PyUnicodeObject *s;
9098 Py_ssize_t i, n;
9099 Py_ssize_t immortal_size = 0, mortal_size = 0;
9100
9101 if (interned == NULL || !PyDict_Check(interned))
9102 return;
9103 keys = PyDict_Keys(interned);
9104 if (keys == NULL || !PyList_Check(keys)) {
9105 PyErr_Clear();
9106 return;
9107 }
9108
9109 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9110 detector, interned unicode strings are not forcibly deallocated;
9111 rather, we give them their stolen references back, and then clear
9112 and DECREF the interned dict. */
9113
9114 n = PyList_GET_SIZE(keys);
9115 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
9116 n);
9117 for (i = 0; i < n; i++) {
9118 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9119 switch (s->state) {
9120 case SSTATE_NOT_INTERNED:
9121 /* XXX Shouldn't happen */
9122 break;
9123 case SSTATE_INTERNED_IMMORTAL:
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00009124 Py_Refcnt(s) += 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009125 immortal_size += s->length;
9126 break;
9127 case SSTATE_INTERNED_MORTAL:
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00009128 Py_Refcnt(s) += 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009129 mortal_size += s->length;
9130 break;
9131 default:
9132 Py_FatalError("Inconsistent interned string state.");
9133 }
9134 s->state = SSTATE_NOT_INTERNED;
9135 }
9136 fprintf(stderr, "total size of all interned strings: "
9137 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9138 "mortal/immortal\n", mortal_size, immortal_size);
9139 Py_DECREF(keys);
9140 PyDict_Clear(interned);
9141 Py_DECREF(interned);
9142 interned = NULL;
9143}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009144
9145
9146/********************* Unicode Iterator **************************/
9147
/* State of an iterator over the characters of a unicode string. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;      /* Index of the next character to yield */
    PyUnicodeObject *it_seq;  /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
9153
/* Deallocator for unicode iterators: untrack the iterator from the GC,
   drop the reference to the iterated string (if any is still held) and
   free the iterator object. */
static void
unicodeiter_dealloc(unicodeiterobject *it)
{
    _PyObject_GC_UNTRACK(it);
    Py_XDECREF(it->it_seq);
    PyObject_GC_Del(it);
}
9161
/* GC traversal for unicode iterators: the iterated string is the only
   object reference visited. */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    Py_VISIT(it->it_seq);
    return 0;
}
9168
9169static PyObject *
9170unicodeiter_next(unicodeiterobject *it)
9171{
9172 PyUnicodeObject *seq;
9173 PyObject *item;
9174
9175 assert(it != NULL);
9176 seq = it->it_seq;
9177 if (seq == NULL)
9178 return NULL;
9179 assert(PyUnicode_Check(seq));
9180
9181 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009182 item = PyUnicode_FromUnicode(
9183 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009184 if (item != NULL)
9185 ++it->it_index;
9186 return item;
9187 }
9188
9189 Py_DECREF(seq);
9190 it->it_seq = NULL;
9191 return NULL;
9192}
9193
9194static PyObject *
9195unicodeiter_len(unicodeiterobject *it)
9196{
9197 Py_ssize_t len = 0;
9198 if (it->it_seq)
9199 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9200 return PyInt_FromSsize_t(len);
9201}
9202
9203PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9204
/* Method table of the unicode iterator type: only __length_hint__. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {NULL,      NULL}       /* sentinel */
};
9210
/* Type object of the iterator returned by unicode_iter().  The iterator
   takes part in garbage collection (Py_TPFLAGS_HAVE_GC with a traverse
   function) and returns itself from tp_iter. */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "unicodeiterator",                      /* tp_name */
    sizeof(unicodeiterobject),              /* tp_basicsize */
    0,                                      /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,        /* tp_dealloc */
    0,                                      /* tp_print */
    0,                                      /* tp_getattr */
    0,                                      /* tp_setattr */
    0,                                      /* tp_compare */
    0,                                      /* tp_repr */
    0,                                      /* tp_as_number */
    0,                                      /* tp_as_sequence */
    0,                                      /* tp_as_mapping */
    0,                                      /* tp_hash */
    0,                                      /* tp_call */
    0,                                      /* tp_str */
    PyObject_GenericGetAttr,                /* tp_getattro */
    0,                                      /* tp_setattro */
    0,                                      /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                                      /* tp_doc */
    (traverseproc)unicodeiter_traverse,     /* tp_traverse */
    0,                                      /* tp_clear */
    0,                                      /* tp_richcompare */
    0,                                      /* tp_weaklistoffset */
    PyObject_SelfIter,                      /* tp_iter */
    (iternextfunc)unicodeiter_next,         /* tp_iternext */
    unicodeiter_methods,                    /* tp_methods */
    0,                                      /* tp_members */
};
9243
9244static PyObject *
9245unicode_iter(PyObject *seq)
9246{
9247 unicodeiterobject *it;
9248
9249 if (!PyUnicode_Check(seq)) {
9250 PyErr_BadInternalCall();
9251 return NULL;
9252 }
9253 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9254 if (it == NULL)
9255 return NULL;
9256 it->it_index = 0;
9257 Py_INCREF(seq);
9258 it->it_seq = (PyUnicodeObject *)seq;
9259 _PyObject_GC_TRACK(it);
9260 return (PyObject *)it;
9261}
9262
Martin v. Löwis5b222132007-06-10 09:51:05 +00009263size_t
9264Py_UNICODE_strlen(const Py_UNICODE *u)
9265{
9266 int res = 0;
9267 while(*u++)
9268 res++;
9269 return res;
9270}
9271
9272Py_UNICODE*
9273Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9274{
9275 Py_UNICODE *u = s1;
9276 while ((*u++ = *s2++));
9277 return s1;
9278}
9279
9280Py_UNICODE*
9281Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9282{
9283 Py_UNICODE *u = s1;
9284 while ((*u++ = *s2++))
9285 if (n-- == 0)
9286 break;
9287 return s1;
9288}
9289
9290int
9291Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9292{
9293 while (*s1 && *s2 && *s1 == *s2)
9294 s1++, s2++;
9295 if (*s1 && *s2)
9296 return (*s1 < *s2) ? -1 : +1;
9297 if (*s1)
9298 return 1;
9299 if (*s2)
9300 return -1;
9301 return 0;
9302}
9303
9304Py_UNICODE*
9305Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9306{
9307 const Py_UNICODE *p;
9308 for (p = s; *p; p++)
9309 if (*p == c)
9310 return (Py_UNICODE*)p;
9311 return NULL;
9312}
9313
9314
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009315#ifdef __cplusplus
9316}
9317#endif
9318
9319
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009320/*
9321Local variables:
9322c-basic-offset: 4
9323indent-tabs-mode: nil
9324End:
9325*/