blob: 3052ebd4c91e1036836a57cba54b95d5af317241 [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Unicode Integration Proposal (see file Misc/unicode.txt).

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Eric Smith8c663262007-08-25 02:26:07 +000048#include "formatter_unicode.h"
49
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000050#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000051#include <windows.h>
52#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000053
/* Upper bound on the number of Unicode objects parked on the free list. */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Threshold for the free list "keep alive" optimization.

   When a Unicode object is put on the free list, its character buffer
   is kept allocated as long as the object's length is below this
   limit; longer buffers are released.  This saves malloc() calls for
   small Unicode objects.

   Worst case, this leaves MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused memory lying around.

   A limit of 0 effectively disables the optimization.

   Note: This is an experimental feature!  If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Byte order switches; little endian is assumed unless the build
   defines WORDS_BIGENDIAN. */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
84
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000085/* --- Globals ------------------------------------------------------------
86
87 The globals are initialized by the _PyUnicode_Init() API and should
88 not be used before calling that API.
89
90*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000091
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000092
93#ifdef __cplusplus
94extern "C" {
95#endif
96
Walter Dörwald16807132007-05-25 13:52:07 +000097/* This dictionary holds all interned unicode strings. Note that references
98 to strings in this dictionary are *not* counted in the string's ob_refcnt.
99 When the interned string reaches a refcnt of 0 the string deallocation
100 function will delete the reference from this dictionary.
101
102 Another way to look at this is that to say that the actual reference
103 count of a string is: s->ob_refcnt + (s->ob_sstate?2:0)
104*/
105static PyObject *interned;
106
Guido van Rossumd57fd912000-03-10 22:53:23 +0000107/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000108static PyUnicodeObject *unicode_freelist;
109static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000110
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000111/* The empty Unicode object is shared to improve performance. */
112static PyUnicodeObject *unicode_empty;
113
114/* Single character Unicode strings in the Latin-1 range are being
115 shared as well. */
116static PyUnicodeObject *unicode_latin1[256];
117
Fred Drakee4315f52000-05-09 19:53:39 +0000118/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000119 parameter; it is fixed to "utf-8". Always use the
120 PyUnicode_GetDefaultEncoding() API to access this global. */
121static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000122
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000123Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000124PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000125{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000126#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000127 return 0x10FFFF;
128#else
129 /* This is actually an illegal character, so it should
130 not be passed to unichr. */
131 return 0xFFFF;
132#endif
133}
134
/* --- Bloom Filters ----------------------------------------------------- */

/* Simple "bloom filters" for Unicode characters.  To keep things
   simple, a single bitmask is used, with the least significant 5 bits
   of each character selecting the bit index (0..31). */

/* the linebreak mask is set up by Unicode_Init below */

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by ch is set in mask.  The shifted
   constant is unsigned long: shifting a plain int 1 by 31 overflows a
   32-bit signed int, and the result sign-extends when widened to
   BLOOM_MASK. */
#define BLOOM(mask, ch) (((mask) & (1UL << ((ch) & 0x1F))))

/* Cheap pre-check via the bloom mask, then the exact line break test. */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
151
152Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
153{
154 /* calculate simple bloom-style bitmask for a given unicode string */
155
156 long mask;
157 Py_ssize_t i;
158
159 mask = 0;
160 for (i = 0; i < len; i++)
161 mask |= (1 << (ptr[i] & 0x1F));
162
163 return mask;
164}
165
166Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
167{
168 Py_ssize_t i;
169
170 for (i = 0; i < setlen; i++)
171 if (set[i] == chr)
172 return 1;
173
174 return 0;
175}
176
/* Non-zero if chr is a member of set: the bloom mask is checked first
   and the exact (linear) search only runs when the mask matches.  The
   whole expansion is parenthesized so the macro behaves as a single
   operand inside larger expressions (e.g. after `!`). */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
179
Guido van Rossumd57fd912000-03-10 22:53:23 +0000180/* --- Unicode Object ----------------------------------------------------- */
181
182static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000183int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000184 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000185{
186 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000187
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000188 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000190 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000191
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000192 /* Resizing shared object (unicode_empty or single character
193 objects) in-place is not allowed. Use PyUnicode_Resize()
194 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000195
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000196 if (unicode == unicode_empty ||
197 (unicode->length == 1 &&
198 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000200 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000201 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 return -1;
203 }
204
Thomas Wouters477c8d52006-05-27 19:21:47 +0000205 /* We allocate one more byte to make sure the string is Ux0000 terminated.
206 The overallocation is also used by fastsearch, which assumes that it's
207 safe to look at str[length] (without making any assumptions about what
208 it contains). */
209
Guido van Rossumd57fd912000-03-10 22:53:23 +0000210 oldstr = unicode->str;
211 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
212 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000213 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000214 PyErr_NoMemory();
215 return -1;
216 }
217 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000218 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000219
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000220 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000221 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000222 if (unicode->defenc) {
223 Py_DECREF(unicode->defenc);
224 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000225 }
226 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000227
Guido van Rossumd57fd912000-03-10 22:53:23 +0000228 return 0;
229}
230
231/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000232 Ux0000 terminated; some code (e.g. new_identifier)
233 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000234
235 XXX This allocator could further be enhanced by assuring that the
236 free list never reduces its size below 1.
237
238*/
239
240static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000241PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242{
243 register PyUnicodeObject *unicode;
244
Thomas Wouters477c8d52006-05-27 19:21:47 +0000245 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000246 if (length == 0 && unicode_empty != NULL) {
247 Py_INCREF(unicode_empty);
248 return unicode_empty;
249 }
250
251 /* Unicode freelist & memory allocation */
252 if (unicode_freelist) {
253 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000254 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000255 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000256 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000257 /* Keep-Alive optimization: we only upsize the buffer,
258 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000259 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000260 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000261 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000262 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 }
264 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000265 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000267 }
268 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000269 }
270 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000271 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000272 if (unicode == NULL)
273 return NULL;
274 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
275 }
276
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000277 if (!unicode->str) {
278 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000279 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000280 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000281 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000282 * the caller fails before initializing str -- unicode_resize()
283 * reads str[0], and the Keep-Alive optimization can keep memory
284 * allocated for str alive across a call to unicode_dealloc(unicode).
285 * We don't want unicode_resize to read uninitialized memory in
286 * that case.
287 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000288 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000289 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000290 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000292 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000293 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000294 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000295
296 onError:
297 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000298 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000299 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000300}
301
302static
Guido van Rossum9475a232001-10-05 20:51:39 +0000303void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000304{
Walter Dörwald16807132007-05-25 13:52:07 +0000305 switch (PyUnicode_CHECK_INTERNED(unicode)) {
306 case SSTATE_NOT_INTERNED:
307 break;
308
309 case SSTATE_INTERNED_MORTAL:
310 /* revive dead object temporarily for DelItem */
Martin v. Löwis5d7428b2007-07-21 18:47:48 +0000311 Py_Refcnt(unicode) = 3;
Walter Dörwald16807132007-05-25 13:52:07 +0000312 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
313 Py_FatalError(
314 "deletion of interned unicode string failed");
315 break;
316
317 case SSTATE_INTERNED_IMMORTAL:
318 Py_FatalError("Immortal interned unicode string died.");
319
320 default:
321 Py_FatalError("Inconsistent interned unicode string state.");
322 }
323
Guido van Rossum604ddf82001-12-06 20:03:56 +0000324 if (PyUnicode_CheckExact(unicode) &&
325 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000326 /* Keep-Alive optimization */
327 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000328 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000329 unicode->str = NULL;
330 unicode->length = 0;
331 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000332 if (unicode->defenc) {
333 Py_DECREF(unicode->defenc);
334 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000335 }
336 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000337 *(PyUnicodeObject **)unicode = unicode_freelist;
338 unicode_freelist = unicode;
339 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000340 }
341 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000342 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000343 Py_XDECREF(unicode->defenc);
Martin v. Löwis9f2e3462007-07-21 17:22:18 +0000344 Py_Type(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000345 }
346}
347
Martin v. Löwis18e16552006-02-15 17:27:45 +0000348int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000349{
350 register PyUnicodeObject *v;
351
352 /* Argument checks */
353 if (unicode == NULL) {
354 PyErr_BadInternalCall();
355 return -1;
356 }
357 v = (PyUnicodeObject *)*unicode;
Martin v. Löwis9f2e3462007-07-21 17:22:18 +0000358 if (v == NULL || !PyUnicode_Check(v) || Py_Refcnt(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000359 PyErr_BadInternalCall();
360 return -1;
361 }
362
363 /* Resizing unicode_empty and single character objects is not
364 possible since these are being shared. We simply return a fresh
365 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000366 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000367 (v == unicode_empty || v->length == 1)) {
368 PyUnicodeObject *w = _PyUnicode_New(length);
369 if (w == NULL)
370 return -1;
371 Py_UNICODE_COPY(w->str, v->str,
372 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000373 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000374 *unicode = (PyObject *)w;
375 return 0;
376 }
377
378 /* Note that we don't have to modify *unicode for unshared Unicode
379 objects, since we can modify them in-place. */
380 return unicode_resize(v, length);
381}
382
/* Internal API for use in unicodeobject.c only !
   Convenience wrapper that casts the address of a PyUnicodeObject *
   variable to the PyObject ** expected by PyUnicode_Resize().  Both
   arguments are parenthesized so arbitrary expressions can be passed. */
#define _PyUnicode_Resize(unicodevar, length) \
        PyUnicode_Resize(((PyObject **)(unicodevar)), (length))
386
Guido van Rossumd57fd912000-03-10 22:53:23 +0000387PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000388 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389{
390 PyUnicodeObject *unicode;
391
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000392 /* If the Unicode data is known at construction time, we can apply
393 some optimizations which share commonly used objects. */
394 if (u != NULL) {
395
396 /* Optimization for empty strings */
397 if (size == 0 && unicode_empty != NULL) {
398 Py_INCREF(unicode_empty);
399 return (PyObject *)unicode_empty;
400 }
401
402 /* Single character Unicode objects in the Latin-1 range are
403 shared when using this constructor */
404 if (size == 1 && *u < 256) {
405 unicode = unicode_latin1[*u];
406 if (!unicode) {
407 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000408 if (!unicode)
409 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000410 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000411 unicode_latin1[*u] = unicode;
412 }
413 Py_INCREF(unicode);
414 return (PyObject *)unicode;
415 }
416 }
Tim Petersced69f82003-09-16 20:30:58 +0000417
Guido van Rossumd57fd912000-03-10 22:53:23 +0000418 unicode = _PyUnicode_New(size);
419 if (!unicode)
420 return NULL;
421
422 /* Copy the Unicode data into the new object */
423 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000424 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000425
426 return (PyObject *)unicode;
427}
428
Walter Dörwaldd2034312007-05-18 16:29:38 +0000429PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000430{
431 PyUnicodeObject *unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000432 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000433 some optimizations which share commonly used objects.
434 Also, this means the input must be UTF-8, so fall back to the
435 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000436 if (u != NULL) {
437
438 /* Optimization for empty strings */
439 if (size == 0 && unicode_empty != NULL) {
440 Py_INCREF(unicode_empty);
441 return (PyObject *)unicode_empty;
442 }
443
Martin v. Löwis9c121062007-08-05 20:26:11 +0000444 /* Single characters are shared when using this constructor.
445 Restrict to ASCII, since the input must be UTF-8. */
446 if (size == 1 && Py_CHARMASK(*u) < 128) {
Guido van Rossum00058aa2007-07-19 18:21:28 +0000447 unicode = unicode_latin1[Py_CHARMASK(*u)];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000448 if (!unicode) {
449 unicode = _PyUnicode_New(1);
450 if (!unicode)
451 return NULL;
Guido van Rossum00058aa2007-07-19 18:21:28 +0000452 unicode->str[0] = Py_CHARMASK(*u);
453 unicode_latin1[Py_CHARMASK(*u)] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000454 }
455 Py_INCREF(unicode);
456 return (PyObject *)unicode;
457 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000458
459 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000460 }
461
Walter Dörwald55507312007-05-18 13:12:10 +0000462 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000463 if (!unicode)
464 return NULL;
465
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000466 return (PyObject *)unicode;
467}
468
Walter Dörwaldd2034312007-05-18 16:29:38 +0000469PyObject *PyUnicode_FromString(const char *u)
470{
471 size_t size = strlen(u);
472 if (size > PY_SSIZE_T_MAX) {
473 PyErr_SetString(PyExc_OverflowError, "input too long");
474 return NULL;
475 }
476
477 return PyUnicode_FromStringAndSize(u, size);
478}
479
Guido van Rossumd57fd912000-03-10 22:53:23 +0000480#ifdef HAVE_WCHAR_H
481
482PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000483 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000484{
485 PyUnicodeObject *unicode;
486
487 if (w == NULL) {
488 PyErr_BadInternalCall();
489 return NULL;
490 }
491
492 unicode = _PyUnicode_New(size);
493 if (!unicode)
494 return NULL;
495
496 /* Copy the wchar_t data into the new object */
497#ifdef HAVE_USABLE_WCHAR_T
498 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000499#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000500 {
501 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000502 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000503 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000504 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000505 *u++ = *w++;
506 }
507#endif
508
509 return (PyObject *)unicode;
510}
511
Walter Dörwald346737f2007-05-31 10:44:43 +0000512static void
513makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
514{
515 *fmt++ = '%';
516 if (width) {
517 if (zeropad)
518 *fmt++ = '0';
519 fmt += sprintf(fmt, "%d", width);
520 }
521 if (precision)
522 fmt += sprintf(fmt, ".%d", precision);
523 if (longflag)
524 *fmt++ = 'l';
525 else if (size_tflag) {
526 char *f = PY_FORMAT_SIZE_T;
527 while (*f)
528 *fmt++ = *f++;
529 }
530 *fmt++ = c;
531 *fmt = '\0';
532}
533
/* Copy the NUL-terminated char string `string` element by element to
   the output position s, advancing s past the copied characters (the
   terminator is not copied).  Relies on the variables `copy` and `s`
   being in scope at the point of use. */
#define appendstring(string) \
    {for (copy = (string); *copy != '\0'; ++copy) *s++ = *copy;}
535
536PyObject *
537PyUnicode_FromFormatV(const char *format, va_list vargs)
538{
539 va_list count;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000540 Py_ssize_t callcount = 0;
541 PyObject **callresults = NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +0000542 PyObject **callresult = NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000543 Py_ssize_t n = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000544 int width = 0;
545 int precision = 0;
546 int zeropad;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000547 const char* f;
548 Py_UNICODE *s;
549 PyObject *string;
550 /* used by sprintf */
551 char buffer[21];
Walter Dörwald346737f2007-05-31 10:44:43 +0000552 /* use abuffer instead of buffer, if we need more space
553 * (which can happen if there's a format specifier with width). */
554 char *abuffer = NULL;
555 char *realbuffer;
556 Py_ssize_t abuffersize = 0;
557 char fmt[60]; /* should be enough for %0width.precisionld */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000558 const char *copy;
559
560#ifdef VA_LIST_IS_ARRAY
561 Py_MEMCPY(count, vargs, sizeof(va_list));
562#else
563#ifdef __va_copy
564 __va_copy(count, vargs);
565#else
566 count = vargs;
567#endif
568#endif
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000569 /* step 1: count the number of %S/%R format specifications
570 * (we call PyObject_Unicode()/PyObject_Repr() for these objects
571 * once during step 3 and put the result in an array) */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000572 for (f = format; *f; f++) {
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000573 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000574 ++callcount;
575 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000576 /* step 2: allocate memory for the results of
577 * PyObject_Unicode()/PyObject_Repr() calls */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000578 if (callcount) {
579 callresults = PyMem_Malloc(sizeof(PyObject *)*callcount);
580 if (!callresults) {
581 PyErr_NoMemory();
582 return NULL;
583 }
584 callresult = callresults;
585 }
586 /* step 3: figure out how large a buffer we need */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000587 for (f = format; *f; f++) {
588 if (*f == '%') {
589 const char* p = f;
Walter Dörwald346737f2007-05-31 10:44:43 +0000590 width = 0;
591 while (isdigit(Py_CHARMASK(*f)))
592 width = (width*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000593 while (*++f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
594 ;
595
596 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
597 * they don't affect the amount of space we reserve.
598 */
599 if ((*f == 'l' || *f == 'z') &&
600 (f[1] == 'd' || f[1] == 'u'))
601 ++f;
602
603 switch (*f) {
604 case 'c':
605 (void)va_arg(count, int);
606 /* fall through... */
607 case '%':
608 n++;
609 break;
610 case 'd': case 'u': case 'i': case 'x':
611 (void) va_arg(count, int);
612 /* 20 bytes is enough to hold a 64-bit
613 integer. Decimal takes the most space.
Walter Dörwald346737f2007-05-31 10:44:43 +0000614 This isn't enough for octal.
615 If a width is specified we need more
616 (which we allocate later). */
617 if (width < 20)
618 width = 20;
619 n += width;
620 if (abuffersize < width)
621 abuffersize = width;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000622 break;
623 case 's':
624 n += strlen(va_arg(count, char*));
625 break;
626 case 'U':
627 {
628 PyObject *obj = va_arg(count, PyObject *);
629 assert(obj && PyUnicode_Check(obj));
630 n += PyUnicode_GET_SIZE(obj);
631 break;
632 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000633 case 'V':
634 {
635 PyObject *obj = va_arg(count, PyObject *);
636 const char *str = va_arg(count, const char *);
637 assert(obj || str);
638 assert(!obj || PyUnicode_Check(obj));
639 if (obj)
640 n += PyUnicode_GET_SIZE(obj);
641 else
642 n += strlen(str);
643 break;
644 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000645 case 'S':
646 {
647 PyObject *obj = va_arg(count, PyObject *);
648 PyObject *str;
649 assert(obj);
650 str = PyObject_Unicode(obj);
651 if (!str)
652 goto fail;
653 n += PyUnicode_GET_SIZE(str);
654 /* Remember the str and switch to the next slot */
655 *callresult++ = str;
656 break;
657 }
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000658 case 'R':
659 {
660 PyObject *obj = va_arg(count, PyObject *);
661 PyObject *repr;
662 assert(obj);
663 repr = PyObject_Repr(obj);
664 if (!repr)
665 goto fail;
666 n += PyUnicode_GET_SIZE(repr);
667 /* Remember the repr and switch to the next slot */
668 *callresult++ = repr;
669 break;
670 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000671 case 'p':
672 (void) va_arg(count, int);
673 /* maximum 64-bit pointer representation:
674 * 0xffffffffffffffff
675 * so 19 characters is enough.
676 * XXX I count 18 -- what's the extra for?
677 */
678 n += 19;
679 break;
680 default:
681 /* if we stumble upon an unknown
682 formatting code, copy the rest of
683 the format string to the output
684 string. (we cannot just skip the
685 code, since there's no way to know
686 what's in the argument list) */
687 n += strlen(p);
688 goto expand;
689 }
690 } else
691 n++;
692 }
693 expand:
Walter Dörwald346737f2007-05-31 10:44:43 +0000694 if (abuffersize > 20) {
695 abuffer = PyMem_Malloc(abuffersize);
696 if (!abuffer) {
697 PyErr_NoMemory();
698 goto fail;
699 }
700 realbuffer = abuffer;
701 }
702 else
703 realbuffer = buffer;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000704 /* step 4: fill the buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000705 /* Since we've analyzed how much space we need for the worst case,
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000706 we don't have to resize the string.
707 There can be no errors beyond this point. */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000708 string = PyUnicode_FromUnicode(NULL, n);
709 if (!string)
Walter Dörwald346737f2007-05-31 10:44:43 +0000710 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000711
712 s = PyUnicode_AS_UNICODE(string);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000713 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000714
715 for (f = format; *f; f++) {
716 if (*f == '%') {
717 const char* p = f++;
718 int longflag = 0;
719 int size_tflag = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000720 zeropad = (*f == '0');
721 /* parse the width.precision part */
722 width = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000723 while (isdigit(Py_CHARMASK(*f)))
Walter Dörwald346737f2007-05-31 10:44:43 +0000724 width = (width*10) + *f++ - '0';
725 precision = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000726 if (*f == '.') {
727 f++;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000728 while (isdigit(Py_CHARMASK(*f)))
Walter Dörwald346737f2007-05-31 10:44:43 +0000729 precision = (precision*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000730 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000731 /* handle the long flag, but only for %ld and %lu.
732 others can be added when necessary. */
733 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
734 longflag = 1;
735 ++f;
736 }
737 /* handle the size_t flag. */
738 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
739 size_tflag = 1;
740 ++f;
741 }
742
743 switch (*f) {
744 case 'c':
745 *s++ = va_arg(vargs, int);
746 break;
747 case 'd':
Walter Dörwald346737f2007-05-31 10:44:43 +0000748 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000749 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000750 sprintf(realbuffer, fmt, va_arg(vargs, long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000751 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000752 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000753 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000754 sprintf(realbuffer, fmt, va_arg(vargs, int));
755 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000756 break;
757 case 'u':
Walter Dörwald346737f2007-05-31 10:44:43 +0000758 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000759 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000760 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000761 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000762 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000763 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000764 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
765 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000766 break;
767 case 'i':
Walter Dörwald346737f2007-05-31 10:44:43 +0000768 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
769 sprintf(realbuffer, fmt, va_arg(vargs, int));
770 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000771 break;
772 case 'x':
Walter Dörwald346737f2007-05-31 10:44:43 +0000773 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
774 sprintf(realbuffer, fmt, va_arg(vargs, int));
775 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000776 break;
777 case 's':
778 p = va_arg(vargs, char*);
779 appendstring(p);
780 break;
781 case 'U':
782 {
783 PyObject *obj = va_arg(vargs, PyObject *);
Walter Dörwald5c2fab62007-05-24 19:51:02 +0000784 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
785 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
786 s += size;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000787 break;
788 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000789 case 'V':
790 {
791 PyObject *obj = va_arg(vargs, PyObject *);
792 const char *str = va_arg(vargs, const char *);
793 if (obj) {
794 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
795 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
796 s += size;
797 } else {
798 appendstring(str);
799 }
800 break;
801 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000802 case 'S':
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000803 case 'R':
804 {
Guido van Rossum755114a2007-06-13 01:04:27 +0000805 Py_UNICODE *ucopy;
806 Py_ssize_t usize;
807 Py_ssize_t upos;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000808 /* unused, since we already have the result */
809 (void) va_arg(vargs, PyObject *);
Guido van Rossum755114a2007-06-13 01:04:27 +0000810 ucopy = PyUnicode_AS_UNICODE(*callresult);
811 usize = PyUnicode_GET_SIZE(*callresult);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000812 for (upos = 0; upos<usize;)
813 *s++ = ucopy[upos++];
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000814 /* We're done with the unicode()/repr() => forget it */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000815 Py_DECREF(*callresult);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000816 /* switch to next unicode()/repr() result */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000817 ++callresult;
818 break;
819 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000820 case 'p':
821 sprintf(buffer, "%p", va_arg(vargs, void*));
822 /* %p is ill-defined: ensure leading 0x. */
823 if (buffer[1] == 'X')
824 buffer[1] = 'x';
825 else if (buffer[1] != 'x') {
826 memmove(buffer+2, buffer, strlen(buffer)+1);
827 buffer[0] = '0';
828 buffer[1] = 'x';
829 }
830 appendstring(buffer);
831 break;
832 case '%':
833 *s++ = '%';
834 break;
835 default:
836 appendstring(p);
837 goto end;
838 }
839 } else
840 *s++ = *f;
841 }
842
843 end:
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000844 if (callresults)
845 PyMem_Free(callresults);
Walter Dörwald346737f2007-05-31 10:44:43 +0000846 if (abuffer)
847 PyMem_Free(abuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000848 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
849 return string;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000850 fail:
851 if (callresults) {
852 PyObject **callresult2 = callresults;
Guido van Rossum307fa8c2007-07-16 20:46:27 +0000853 while (callresult2 < callresult) {
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000854 Py_DECREF(*callresult2);
855 ++callresult2;
856 }
857 PyMem_Free(callresults);
858 }
Walter Dörwald346737f2007-05-31 10:44:43 +0000859 if (abuffer)
860 PyMem_Free(abuffer);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000861 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000862}
863
864#undef appendstring
865
866PyObject *
867PyUnicode_FromFormat(const char *format, ...)
868{
869 PyObject* ret;
870 va_list vargs;
871
872#ifdef HAVE_STDARG_PROTOTYPES
873 va_start(vargs, format);
874#else
875 va_start(vargs);
876#endif
877 ret = PyUnicode_FromFormatV(format, vargs);
878 va_end(vargs);
879 return ret;
880}
881
Martin v. Löwis18e16552006-02-15 17:27:45 +0000882Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
883 wchar_t *w,
884 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000885{
886 if (unicode == NULL) {
887 PyErr_BadInternalCall();
888 return -1;
889 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000890
891 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000892 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000893 size = PyUnicode_GET_SIZE(unicode) + 1;
894
Guido van Rossumd57fd912000-03-10 22:53:23 +0000895#ifdef HAVE_USABLE_WCHAR_T
896 memcpy(w, unicode->str, size * sizeof(wchar_t));
897#else
898 {
899 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000900 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000901 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000902 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000903 *w++ = *u++;
904 }
905#endif
906
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000907 if (size > PyUnicode_GET_SIZE(unicode))
908 return PyUnicode_GET_SIZE(unicode);
909 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000910 return size;
911}
912
913#endif
914
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000915PyObject *PyUnicode_FromOrdinal(int ordinal)
916{
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000917 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000918
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000919 if (ordinal < 0 || ordinal > 0x10ffff) {
920 PyErr_SetString(PyExc_ValueError,
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000921 "chr() arg not in range(0x110000)");
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000922 return NULL;
923 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000924
925#ifndef Py_UNICODE_WIDE
926 if (ordinal > 0xffff) {
927 ordinal -= 0x10000;
928 s[0] = 0xD800 | (ordinal >> 10);
929 s[1] = 0xDC00 | (ordinal & 0x3FF);
930 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000931 }
932#endif
933
Hye-Shik Chang40574832004-04-06 07:24:51 +0000934 s[0] = (Py_UNICODE)ordinal;
935 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000936}
937
Guido van Rossumd57fd912000-03-10 22:53:23 +0000938PyObject *PyUnicode_FromObject(register PyObject *obj)
939{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000940 /* XXX Perhaps we should make this API an alias of
941 PyObject_Unicode() instead ?! */
942 if (PyUnicode_CheckExact(obj)) {
943 Py_INCREF(obj);
944 return obj;
945 }
946 if (PyUnicode_Check(obj)) {
947 /* For a Unicode subtype that's not a Unicode object,
948 return a true Unicode object with the same data. */
949 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
950 PyUnicode_GET_SIZE(obj));
951 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000952 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
953}
954
955PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
956 const char *encoding,
957 const char *errors)
958{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000959 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000960 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000961 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000962
Guido van Rossumd57fd912000-03-10 22:53:23 +0000963 if (obj == NULL) {
964 PyErr_BadInternalCall();
965 return NULL;
966 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000967
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000968#if 0
969 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000970 that no encodings is given and then redirect to
971 PyObject_Unicode() which then applies the additional logic for
972 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000973
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000974 NOTE: This API should really only be used for object which
975 represent *encoded* Unicode !
976
977 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000978 if (PyUnicode_Check(obj)) {
979 if (encoding) {
980 PyErr_SetString(PyExc_TypeError,
981 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000982 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000983 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000984 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000985 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000986#else
987 if (PyUnicode_Check(obj)) {
988 PyErr_SetString(PyExc_TypeError,
989 "decoding Unicode is not supported");
990 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000991 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000992#endif
993
994 /* Coerce object */
995 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000996 s = PyString_AS_STRING(obj);
997 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000998 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000999 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1000 /* Overwrite the error message with something more useful in
1001 case of a TypeError. */
1002 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001003 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001004 "coercing to Unicode: need string or buffer, "
1005 "%.80s found",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00001006 Py_Type(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001007 goto onError;
1008 }
Tim Petersced69f82003-09-16 20:30:58 +00001009
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001010 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001011 if (len == 0) {
1012 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001013 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001014 }
Tim Petersced69f82003-09-16 20:30:58 +00001015 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001016 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001017
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001018 return v;
1019
1020 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001021 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001022}
1023
1024PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001025 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001026 const char *encoding,
1027 const char *errors)
1028{
1029 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001030
1031 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001032 encoding = PyUnicode_GetDefaultEncoding();
1033
1034 /* Shortcuts for common default encodings */
1035 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001036 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001037 else if (strcmp(encoding, "latin-1") == 0)
1038 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001039#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1040 else if (strcmp(encoding, "mbcs") == 0)
1041 return PyUnicode_DecodeMBCS(s, size, errors);
1042#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001043 else if (strcmp(encoding, "ascii") == 0)
1044 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001045
1046 /* Decode via the codec registry */
1047 buffer = PyBuffer_FromMemory((void *)s, size);
1048 if (buffer == NULL)
1049 goto onError;
1050 unicode = PyCodec_Decode(buffer, encoding, errors);
1051 if (unicode == NULL)
1052 goto onError;
1053 if (!PyUnicode_Check(unicode)) {
1054 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001055 "decoder did not return an unicode object (type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00001056 Py_Type(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001057 Py_DECREF(unicode);
1058 goto onError;
1059 }
1060 Py_DECREF(buffer);
1061 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001062
Guido van Rossumd57fd912000-03-10 22:53:23 +00001063 onError:
1064 Py_XDECREF(buffer);
1065 return NULL;
1066}
1067
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001068PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1069 const char *encoding,
1070 const char *errors)
1071{
1072 PyObject *v;
1073
1074 if (!PyUnicode_Check(unicode)) {
1075 PyErr_BadArgument();
1076 goto onError;
1077 }
1078
1079 if (encoding == NULL)
1080 encoding = PyUnicode_GetDefaultEncoding();
1081
1082 /* Decode via the codec registry */
1083 v = PyCodec_Decode(unicode, encoding, errors);
1084 if (v == NULL)
1085 goto onError;
1086 return v;
1087
1088 onError:
1089 return NULL;
1090}
1091
Guido van Rossumd57fd912000-03-10 22:53:23 +00001092PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001093 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001094 const char *encoding,
1095 const char *errors)
1096{
1097 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001098
Guido van Rossumd57fd912000-03-10 22:53:23 +00001099 unicode = PyUnicode_FromUnicode(s, size);
1100 if (unicode == NULL)
1101 return NULL;
1102 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1103 Py_DECREF(unicode);
1104 return v;
1105}
1106
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001107PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1108 const char *encoding,
1109 const char *errors)
1110{
1111 PyObject *v;
1112
1113 if (!PyUnicode_Check(unicode)) {
1114 PyErr_BadArgument();
1115 goto onError;
1116 }
1117
1118 if (encoding == NULL)
1119 encoding = PyUnicode_GetDefaultEncoding();
1120
1121 /* Encode via the codec registry */
1122 v = PyCodec_Encode(unicode, encoding, errors);
1123 if (v == NULL)
1124 goto onError;
1125 return v;
1126
1127 onError:
1128 return NULL;
1129}
1130
Guido van Rossumd57fd912000-03-10 22:53:23 +00001131PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1132 const char *encoding,
1133 const char *errors)
1134{
1135 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001136
Guido van Rossumd57fd912000-03-10 22:53:23 +00001137 if (!PyUnicode_Check(unicode)) {
1138 PyErr_BadArgument();
1139 goto onError;
1140 }
Fred Drakee4315f52000-05-09 19:53:39 +00001141
Tim Petersced69f82003-09-16 20:30:58 +00001142 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001143 encoding = PyUnicode_GetDefaultEncoding();
1144
1145 /* Shortcuts for common default encodings */
1146 if (errors == NULL) {
1147 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001148 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001149 else if (strcmp(encoding, "latin-1") == 0)
1150 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001151#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1152 else if (strcmp(encoding, "mbcs") == 0)
1153 return PyUnicode_AsMBCSString(unicode);
1154#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001155 else if (strcmp(encoding, "ascii") == 0)
1156 return PyUnicode_AsASCIIString(unicode);
1157 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001158
1159 /* Encode via the codec registry */
1160 v = PyCodec_Encode(unicode, encoding, errors);
1161 if (v == NULL)
1162 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001163 if (!PyBytes_Check(v)) {
1164 if (PyString_Check(v)) {
1165 /* Old codec, turn it into bytes */
1166 PyObject *b = PyBytes_FromObject(v);
1167 Py_DECREF(v);
1168 return b;
1169 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001170 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001171 "encoder did not return a bytes object "
1172 "(type=%.400s, encoding=%.20s, errors=%.20s)",
1173 v->ob_type->tp_name,
1174 encoding ? encoding : "NULL",
1175 errors ? errors : "NULL");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001176 Py_DECREF(v);
1177 goto onError;
1178 }
1179 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001180
Guido van Rossumd57fd912000-03-10 22:53:23 +00001181 onError:
1182 return NULL;
1183}
1184
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001185PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1186 const char *errors)
1187{
1188 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001189 PyObject *b;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001190 if (v)
1191 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001192 if (errors != NULL)
1193 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum06610092007-08-16 21:02:22 +00001194 b = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1195 PyUnicode_GET_SIZE(unicode),
1196 NULL);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001197 if (!b)
1198 return NULL;
1199 v = PyString_FromStringAndSize(PyBytes_AsString(b),
1200 PyBytes_Size(b));
1201 Py_DECREF(b);
Guido van Rossume7a0d392007-07-12 07:53:00 +00001202 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001203 return v;
1204}
1205
Martin v. Löwis5b222132007-06-10 09:51:05 +00001206char*
1207PyUnicode_AsString(PyObject *unicode)
1208{
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001209 if (!PyUnicode_Check(unicode)) {
1210 PyErr_BadArgument();
1211 return NULL;
1212 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00001213 unicode = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1214 if (!unicode)
1215 return NULL;
1216 return PyString_AsString(unicode);
1217}
1218
Guido van Rossumd57fd912000-03-10 22:53:23 +00001219Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1220{
1221 if (!PyUnicode_Check(unicode)) {
1222 PyErr_BadArgument();
1223 goto onError;
1224 }
1225 return PyUnicode_AS_UNICODE(unicode);
1226
1227 onError:
1228 return NULL;
1229}
1230
Martin v. Löwis18e16552006-02-15 17:27:45 +00001231Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001232{
1233 if (!PyUnicode_Check(unicode)) {
1234 PyErr_BadArgument();
1235 goto onError;
1236 }
1237 return PyUnicode_GET_SIZE(unicode);
1238
1239 onError:
1240 return -1;
1241}
1242
Thomas Wouters78890102000-07-22 19:25:51 +00001243const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001244{
1245 return unicode_default_encoding;
1246}
1247
1248int PyUnicode_SetDefaultEncoding(const char *encoding)
1249{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001250 if (strcmp(encoding, unicode_default_encoding) != 0) {
1251 PyErr_Format(PyExc_ValueError,
1252 "Can only set default encoding to %s",
1253 unicode_default_encoding);
1254 return -1;
1255 }
Fred Drakee4315f52000-05-09 19:53:39 +00001256 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001257}
1258
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001259/* error handling callback helper:
1260 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001261 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001262 and adjust various state variables.
1263 return 0 on success, -1 on error
1264*/
1265
1266static
1267int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1268 const char *encoding, const char *reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001269 const char **input, const char **inend, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001270 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001271{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001272 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001273
1274 PyObject *restuple = NULL;
1275 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001276 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001277 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001278 Py_ssize_t requiredsize;
1279 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001280 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001281 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001282 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001283 int res = -1;
1284
1285 if (*errorHandler == NULL) {
1286 *errorHandler = PyCodec_LookupError(errors);
1287 if (*errorHandler == NULL)
1288 goto onError;
1289 }
1290
1291 if (*exceptionObject == NULL) {
1292 *exceptionObject = PyUnicodeDecodeError_Create(
Walter Dörwalde78178e2007-07-30 13:31:40 +00001293 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001294 if (*exceptionObject == NULL)
1295 goto onError;
1296 }
1297 else {
1298 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1299 goto onError;
1300 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1301 goto onError;
1302 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1303 goto onError;
1304 }
1305
1306 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1307 if (restuple == NULL)
1308 goto onError;
1309 if (!PyTuple_Check(restuple)) {
1310 PyErr_Format(PyExc_TypeError, &argparse[4]);
1311 goto onError;
1312 }
1313 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1314 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001315
1316 /* Copy back the bytes variables, which might have been modified by the
1317 callback */
1318 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1319 if (!inputobj)
1320 goto onError;
1321 if (!PyBytes_Check(inputobj)) {
1322 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
1323 }
1324 *input = PyBytes_AS_STRING(inputobj);
1325 insize = PyBytes_GET_SIZE(inputobj);
1326 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001327 /* we can DECREF safely, as the exception has another reference,
1328 so the object won't go away. */
1329 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001330
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001331 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001332 newpos = insize+newpos;
1333 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001334 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001335 goto onError;
1336 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001337
1338 /* need more space? (at least enough for what we
1339 have+the replacement+the rest of the string (starting
1340 at the new input position), so we won't have to check space
1341 when there are no errors in the rest of the string) */
1342 repptr = PyUnicode_AS_UNICODE(repunicode);
1343 repsize = PyUnicode_GET_SIZE(repunicode);
1344 requiredsize = *outpos + repsize + insize-newpos;
1345 if (requiredsize > outsize) {
1346 if (requiredsize<2*outsize)
1347 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001348 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001349 goto onError;
1350 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1351 }
1352 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001353 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001354 Py_UNICODE_COPY(*outptr, repptr, repsize);
1355 *outptr += repsize;
1356 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001357
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001358 /* we made it! */
1359 res = 0;
1360
1361 onError:
1362 Py_XDECREF(restuple);
1363 return res;
1364}
1365
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001366/* --- UTF-7 Codec -------------------------------------------------------- */
1367
1368/* see RFC2152 for details */
1369
static
char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
           0 - not special
           1 - special
           2 - whitespace (optional)
           3 - RFC2152 Set O (optional) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};

/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true  */

/* True when character c must be encoded in a shift sequence: anything
   outside 1..127, any class-1 character, and optionally whitespace
   (encodeWS) and RFC2152 Set O characters (encodeO). */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* Map the low 6 bits of n to its base64 alphabet character. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True when c belongs to the base64 alphabet.  Spelled out as ASCII
   range tests rather than isalnum(): c may be a Py_UNICODE value outside
   the range isalnum() accepts, and the answer must not depend on the
   current locale. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* Inverse of B64(); only meaningful for characters accepted by B64CHAR(). */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* Emit base64 characters to out while at least 6 pending bits remain in
   ch; bits is decremented by 6 for every character written. */
#define ENCODE(out, ch, bits)                   \
    while (bits >= 6) {                         \
        *out++ = B64(ch >> (bits-6));           \
        bits -= 6;                              \
    }

/* Emit 16-bit units to out while at least 16 pending bits remain in ch.
   Expects `errmsg` and a `utf7Error` label in the expanding function. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0;                                              \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */                              \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *out++ = outCh;                                             \
        }                                                               \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001431
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001432PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001433 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001434 const char *errors)
1435{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001436 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001437 Py_ssize_t startinpos;
1438 Py_ssize_t endinpos;
1439 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001440 const char *e;
1441 PyUnicodeObject *unicode;
1442 Py_UNICODE *p;
1443 const char *errmsg = "";
1444 int inShift = 0;
1445 unsigned int bitsleft = 0;
1446 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001447 int surrogate = 0;
1448 PyObject *errorHandler = NULL;
1449 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001450
1451 unicode = _PyUnicode_New(size);
1452 if (!unicode)
1453 return NULL;
1454 if (size == 0)
1455 return (PyObject *)unicode;
1456
1457 p = unicode->str;
1458 e = s + size;
1459
1460 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001461 Py_UNICODE ch;
1462 restart:
1463 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001464
1465 if (inShift) {
1466 if ((ch == '-') || !B64CHAR(ch)) {
1467 inShift = 0;
1468 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001469
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001470 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1471 if (bitsleft >= 6) {
1472 /* The shift sequence has a partial character in it. If
1473 bitsleft < 6 then we could just classify it as padding
1474 but that is not the case here */
1475
1476 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001477 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001478 }
1479 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001480 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001481 here so indicate the potential of a misencoded character. */
1482
1483 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1484 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1485 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001486 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001487 }
1488
1489 if (ch == '-') {
1490 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001491 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001492 inShift = 1;
1493 }
1494 } else if (SPECIAL(ch,0,0)) {
1495 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001496 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001497 } else {
1498 *p++ = ch;
1499 }
1500 } else {
1501 charsleft = (charsleft << 6) | UB64(ch);
1502 bitsleft += 6;
1503 s++;
1504 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1505 }
1506 }
1507 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001508 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001509 s++;
1510 if (s < e && *s == '-') {
1511 s++;
1512 *p++ = '+';
1513 } else
1514 {
1515 inShift = 1;
1516 bitsleft = 0;
1517 }
1518 }
1519 else if (SPECIAL(ch,0,0)) {
1520 errmsg = "unexpected special character";
1521 s++;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001522 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001523 }
1524 else {
1525 *p++ = ch;
1526 s++;
1527 }
1528 continue;
1529 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001530 outpos = p-PyUnicode_AS_UNICODE(unicode);
1531 endinpos = s-starts;
1532 if (unicode_decode_call_errorhandler(
1533 errors, &errorHandler,
1534 "utf7", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001535 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001536 (PyObject **)&unicode, &outpos, &p))
1537 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001538 }
1539
1540 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001541 outpos = p-PyUnicode_AS_UNICODE(unicode);
1542 endinpos = size;
1543 if (unicode_decode_call_errorhandler(
1544 errors, &errorHandler,
1545 "utf7", "unterminated shift sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00001546 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001547 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001548 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001549 if (s < e)
1550 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001551 }
1552
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001553 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001554 goto onError;
1555
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001556 Py_XDECREF(errorHandler);
1557 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001558 return (PyObject *)unicode;
1559
1560onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001561 Py_XDECREF(errorHandler);
1562 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001563 Py_DECREF(unicode);
1564 return NULL;
1565}
1566
1567
1568PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001569 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001570 int encodeSetO,
1571 int encodeWhiteSpace,
1572 const char *errors)
1573{
1574 PyObject *v;
1575 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001576 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001577 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001578 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001579 unsigned int bitsleft = 0;
1580 unsigned long charsleft = 0;
1581 char * out;
1582 char * start;
1583
1584 if (size == 0)
Walter Dörwald51ab4142007-05-05 14:43:36 +00001585 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001586
Walter Dörwald51ab4142007-05-05 14:43:36 +00001587 v = PyBytes_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001588 if (v == NULL)
1589 return NULL;
1590
Walter Dörwald51ab4142007-05-05 14:43:36 +00001591 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001592 for (;i < size; ++i) {
1593 Py_UNICODE ch = s[i];
1594
1595 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001596 if (ch == '+') {
1597 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001598 *out++ = '-';
1599 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1600 charsleft = ch;
1601 bitsleft = 16;
1602 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001603 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001604 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001605 } else {
1606 *out++ = (char) ch;
1607 }
1608 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001609 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1610 *out++ = B64(charsleft << (6-bitsleft));
1611 charsleft = 0;
1612 bitsleft = 0;
1613 /* Characters not in the BASE64 set implicitly unshift the sequence
1614 so no '-' is required, except if the character is itself a '-' */
1615 if (B64CHAR(ch) || ch == '-') {
1616 *out++ = '-';
1617 }
1618 inShift = 0;
1619 *out++ = (char) ch;
1620 } else {
1621 bitsleft += 16;
1622 charsleft = (charsleft << 16) | ch;
1623 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1624
1625 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001626 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001627 or '-' then the shift sequence will be terminated implicitly and we
1628 don't have to insert a '-'. */
1629
1630 if (bitsleft == 0) {
1631 if (i + 1 < size) {
1632 Py_UNICODE ch2 = s[i+1];
1633
1634 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001635
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001636 } else if (B64CHAR(ch2) || ch2 == '-') {
1637 *out++ = '-';
1638 inShift = 0;
1639 } else {
1640 inShift = 0;
1641 }
1642
1643 }
1644 else {
1645 *out++ = '-';
1646 inShift = 0;
1647 }
1648 }
Tim Petersced69f82003-09-16 20:30:58 +00001649 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001650 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001651 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001652 if (bitsleft) {
1653 *out++= B64(charsleft << (6-bitsleft) );
1654 *out++ = '-';
1655 }
1656
Walter Dörwald51ab4142007-05-05 14:43:36 +00001657 if (PyBytes_Resize(v, out - start)) {
1658 Py_DECREF(v);
1659 return NULL;
1660 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001661 return v;
1662}
1663
1664#undef SPECIAL
1665#undef B64
1666#undef B64CHAR
1667#undef UB64
1668#undef ENCODE
1669#undef DECODE
1670
Guido van Rossumd57fd912000-03-10 22:53:23 +00001671/* --- UTF-8 Codec -------------------------------------------------------- */
1672
/* Map UTF-8 encoded prefix byte to sequence length.  Zero means
   illegal prefix.  See RFC 2279 for details (the table still carries the
   5- and 6-byte lead bytes defined there). */
static
char utf8_code_length[256] = {
    /* 0x00-0x7F: single-byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xBF: continuation bytes, never a valid prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xF7: four; 0xF8-0xFB: five; 0xFC-0xFD: six; 0xFE-0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1694
Guido van Rossumd57fd912000-03-10 22:53:23 +00001695PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001696 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001697 const char *errors)
1698{
Walter Dörwald69652032004-09-07 20:24:22 +00001699 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1700}
1701
1702PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001703 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001704 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001705 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001706{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001707 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001708 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001709 Py_ssize_t startinpos;
1710 Py_ssize_t endinpos;
1711 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001712 const char *e;
1713 PyUnicodeObject *unicode;
1714 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001715 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001716 PyObject *errorHandler = NULL;
1717 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001718
1719 /* Note: size will always be longer than the resulting Unicode
1720 character count */
1721 unicode = _PyUnicode_New(size);
1722 if (!unicode)
1723 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001724 if (size == 0) {
1725 if (consumed)
1726 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001727 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001728 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001729
1730 /* Unpack UTF-8 encoded data */
1731 p = unicode->str;
1732 e = s + size;
1733
1734 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001735 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001736
1737 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001738 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001739 s++;
1740 continue;
1741 }
1742
1743 n = utf8_code_length[ch];
1744
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001745 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001746 if (consumed)
1747 break;
1748 else {
1749 errmsg = "unexpected end of data";
1750 startinpos = s-starts;
1751 endinpos = size;
1752 goto utf8Error;
1753 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001754 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001755
1756 switch (n) {
1757
1758 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001759 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001760 startinpos = s-starts;
1761 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001762 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001763
1764 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001765 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001766 startinpos = s-starts;
1767 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001768 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001769
1770 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001771 if ((s[1] & 0xc0) != 0x80) {
1772 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001773 startinpos = s-starts;
1774 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001775 goto utf8Error;
1776 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001777 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001778 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001779 startinpos = s-starts;
1780 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001781 errmsg = "illegal encoding";
1782 goto utf8Error;
1783 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001784 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001785 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001786 break;
1787
1788 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001789 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001790 (s[2] & 0xc0) != 0x80) {
1791 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001792 startinpos = s-starts;
1793 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001794 goto utf8Error;
1795 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001796 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001797 if (ch < 0x0800) {
1798 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001799 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001800
1801 XXX For wide builds (UCS-4) we should probably try
1802 to recombine the surrogates into a single code
1803 unit.
1804 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001805 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001806 startinpos = s-starts;
1807 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001808 goto utf8Error;
1809 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001810 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001811 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001812 break;
1813
1814 case 4:
1815 if ((s[1] & 0xc0) != 0x80 ||
1816 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001817 (s[3] & 0xc0) != 0x80) {
1818 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001819 startinpos = s-starts;
1820 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001821 goto utf8Error;
1822 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001823 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1824 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1825 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001826 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001827 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001828 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001829 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001830 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001831 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001832 startinpos = s-starts;
1833 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001834 goto utf8Error;
1835 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001836#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001837 *p++ = (Py_UNICODE)ch;
1838#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001839 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001840
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001841 /* translate from 10000..10FFFF to 0..FFFF */
1842 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001843
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001844 /* high surrogate = top 10 bits added to D800 */
1845 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001846
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001847 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001848 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001849#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001850 break;
1851
1852 default:
1853 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001854 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001855 startinpos = s-starts;
1856 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001857 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001858 }
1859 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001860 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001861
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001862 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001863 outpos = p-PyUnicode_AS_UNICODE(unicode);
1864 if (unicode_decode_call_errorhandler(
1865 errors, &errorHandler,
1866 "utf8", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001867 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001868 (PyObject **)&unicode, &outpos, &p))
1869 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001870 }
Walter Dörwald69652032004-09-07 20:24:22 +00001871 if (consumed)
1872 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001873
1874 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001875 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001876 goto onError;
1877
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001878 Py_XDECREF(errorHandler);
1879 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001880 return (PyObject *)unicode;
1881
1882onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001883 Py_XDECREF(errorHandler);
1884 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001885 Py_DECREF(unicode);
1886 return NULL;
1887}
1888
Tim Peters602f7402002-04-27 18:03:26 +00001889/* Allocation strategy: if the string is short, convert into a stack buffer
1890 and allocate exactly as much space needed at the end. Else allocate the
1891 maximum possible needed (4 result bytes per Unicode character), and return
1892 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001893*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001894PyObject *
1895PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001896 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001897 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001898{
Tim Peters602f7402002-04-27 18:03:26 +00001899#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001900
Martin v. Löwis18e16552006-02-15 17:27:45 +00001901 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001902 PyObject *v; /* result string object */
1903 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001904 Py_ssize_t nallocated; /* number of result bytes allocated */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001905 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001906 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001907
Tim Peters602f7402002-04-27 18:03:26 +00001908 assert(s != NULL);
1909 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001910
Tim Peters602f7402002-04-27 18:03:26 +00001911 if (size <= MAX_SHORT_UNICHARS) {
1912 /* Write into the stack buffer; nallocated can't overflow.
1913 * At the end, we'll allocate exactly as much heap space as it
1914 * turns out we need.
1915 */
1916 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1917 v = NULL; /* will allocate after we're done */
1918 p = stackbuf;
1919 }
1920 else {
1921 /* Overallocate on the heap, and give the excess back at the end. */
1922 nallocated = size * 4;
1923 if (nallocated / 4 != size) /* overflow! */
1924 return PyErr_NoMemory();
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001925 v = PyBytes_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00001926 if (v == NULL)
1927 return NULL;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001928 p = PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001929 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001930
Tim Peters602f7402002-04-27 18:03:26 +00001931 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001932 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001933
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001934 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001935 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001936 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001937
Guido van Rossumd57fd912000-03-10 22:53:23 +00001938 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001939 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001940 *p++ = (char)(0xc0 | (ch >> 6));
1941 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001942 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001943 else {
Tim Peters602f7402002-04-27 18:03:26 +00001944 /* Encode UCS2 Unicode ordinals */
1945 if (ch < 0x10000) {
1946 /* Special case: check for high surrogate */
1947 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1948 Py_UCS4 ch2 = s[i];
1949 /* Check for low surrogate and combine the two to
1950 form a UCS4 value */
1951 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001952 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001953 i++;
1954 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001955 }
Tim Peters602f7402002-04-27 18:03:26 +00001956 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001957 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001958 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001959 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1960 *p++ = (char)(0x80 | (ch & 0x3f));
1961 continue;
1962 }
1963encodeUCS4:
1964 /* Encode UCS4 Unicode ordinals */
1965 *p++ = (char)(0xf0 | (ch >> 18));
1966 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1967 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1968 *p++ = (char)(0x80 | (ch & 0x3f));
1969 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001970 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001971
Tim Peters602f7402002-04-27 18:03:26 +00001972 if (v == NULL) {
1973 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001974 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001975 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001976 v = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001977 }
1978 else {
1979 /* Cut back to size actually needed. */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001980 nneeded = p - PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001981 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001982 PyBytes_Resize(v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001983 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001984 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001985
Tim Peters602f7402002-04-27 18:03:26 +00001986#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001987}
1988
Guido van Rossumd57fd912000-03-10 22:53:23 +00001989PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1990{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001991 if (!PyUnicode_Check(unicode)) {
1992 PyErr_BadArgument();
1993 return NULL;
1994 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001995 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1996 PyUnicode_GET_SIZE(unicode),
1997 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001998}
1999
Walter Dörwald41980ca2007-08-16 21:55:45 +00002000/* --- UTF-32 Codec ------------------------------------------------------- */
2001
2002PyObject *
2003PyUnicode_DecodeUTF32(const char *s,
2004 Py_ssize_t size,
2005 const char *errors,
2006 int *byteorder)
2007{
2008 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2009}
2010
2011PyObject *
2012PyUnicode_DecodeUTF32Stateful(const char *s,
2013 Py_ssize_t size,
2014 const char *errors,
2015 int *byteorder,
2016 Py_ssize_t *consumed)
2017{
2018 const char *starts = s;
2019 Py_ssize_t startinpos;
2020 Py_ssize_t endinpos;
2021 Py_ssize_t outpos;
2022 PyUnicodeObject *unicode;
2023 Py_UNICODE *p;
2024#ifndef Py_UNICODE_WIDE
2025 int i, pairs;
2026#else
2027 const int pairs = 0;
2028#endif
2029 const unsigned char *q, *e;
2030 int bo = 0; /* assume native ordering by default */
2031 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002032 /* Offsets from q for retrieving bytes in the right order. */
2033#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2034 int iorder[] = {0, 1, 2, 3};
2035#else
2036 int iorder[] = {3, 2, 1, 0};
2037#endif
2038 PyObject *errorHandler = NULL;
2039 PyObject *exc = NULL;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002040 /* On narrow builds we split characters outside the BMP into two
2041 codepoints => count how much extra space we need. */
2042#ifndef Py_UNICODE_WIDE
2043 for (i = pairs = 0; i < size/4; i++)
2044 if (((Py_UCS4 *)s)[i] >= 0x10000)
2045 pairs++;
2046#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00002047
2048 /* This might be one to much, because of a BOM */
2049 unicode = _PyUnicode_New((size+3)/4+pairs);
2050 if (!unicode)
2051 return NULL;
2052 if (size == 0)
2053 return (PyObject *)unicode;
2054
2055 /* Unpack UTF-32 encoded data */
2056 p = unicode->str;
2057 q = (unsigned char *)s;
2058 e = q + size;
2059
2060 if (byteorder)
2061 bo = *byteorder;
2062
2063 /* Check for BOM marks (U+FEFF) in the input and adjust current
2064 byte order setting accordingly. In native mode, the leading BOM
2065 mark is skipped, in all other modes, it is copied to the output
2066 stream as-is (giving a ZWNBSP character). */
2067 if (bo == 0) {
2068 if (size >= 4) {
2069 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2070 (q[iorder[1]] << 8) | q[iorder[0]];
2071#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2072 if (bom == 0x0000FEFF) {
2073 q += 4;
2074 bo = -1;
2075 }
2076 else if (bom == 0xFFFE0000) {
2077 q += 4;
2078 bo = 1;
2079 }
2080#else
2081 if (bom == 0x0000FEFF) {
2082 q += 4;
2083 bo = 1;
2084 }
2085 else if (bom == 0xFFFE0000) {
2086 q += 4;
2087 bo = -1;
2088 }
2089#endif
2090 }
2091 }
2092
2093 if (bo == -1) {
2094 /* force LE */
2095 iorder[0] = 0;
2096 iorder[1] = 1;
2097 iorder[2] = 2;
2098 iorder[3] = 3;
2099 }
2100 else if (bo == 1) {
2101 /* force BE */
2102 iorder[0] = 3;
2103 iorder[1] = 2;
2104 iorder[2] = 1;
2105 iorder[3] = 0;
2106 }
2107
2108 while (q < e) {
2109 Py_UCS4 ch;
2110 /* remaining bytes at the end? (size should be divisible by 4) */
2111 if (e-q<4) {
2112 if (consumed)
2113 break;
2114 errmsg = "truncated data";
2115 startinpos = ((const char *)q)-starts;
2116 endinpos = ((const char *)e)-starts;
2117 goto utf32Error;
2118 /* The remaining input chars are ignored if the callback
2119 chooses to skip the input */
2120 }
2121 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2122 (q[iorder[1]] << 8) | q[iorder[0]];
2123
2124 if (ch >= 0x110000)
2125 {
2126 errmsg = "codepoint not in range(0x110000)";
2127 startinpos = ((const char *)q)-starts;
2128 endinpos = startinpos+4;
2129 goto utf32Error;
2130 }
2131#ifndef Py_UNICODE_WIDE
2132 if (ch >= 0x10000)
2133 {
2134 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2135 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2136 }
2137 else
2138#endif
2139 *p++ = ch;
2140 q += 4;
2141 continue;
2142 utf32Error:
2143 outpos = p-PyUnicode_AS_UNICODE(unicode);
2144 if (unicode_decode_call_errorhandler(
2145 errors, &errorHandler,
2146 "utf32", errmsg,
2147 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2148 (PyObject **)&unicode, &outpos, &p))
2149 goto onError;
2150 }
2151
2152 if (byteorder)
2153 *byteorder = bo;
2154
2155 if (consumed)
2156 *consumed = (const char *)q-starts;
2157
2158 /* Adjust length */
2159 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2160 goto onError;
2161
2162 Py_XDECREF(errorHandler);
2163 Py_XDECREF(exc);
2164 return (PyObject *)unicode;
2165
2166onError:
2167 Py_DECREF(unicode);
2168 Py_XDECREF(errorHandler);
2169 Py_XDECREF(exc);
2170 return NULL;
2171}
2172
2173PyObject *
2174PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2175 Py_ssize_t size,
2176 const char *errors,
2177 int byteorder)
2178{
2179 PyObject *v;
2180 unsigned char *p;
2181#ifndef Py_UNICODE_WIDE
2182 int i, pairs;
2183#else
2184 const int pairs = 0;
2185#endif
2186 /* Offsets from p for storing byte pairs in the right order. */
2187#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2188 int iorder[] = {0, 1, 2, 3};
2189#else
2190 int iorder[] = {3, 2, 1, 0};
2191#endif
2192
2193#define STORECHAR(CH) \
2194 do { \
2195 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2196 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2197 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2198 p[iorder[0]] = (CH) & 0xff; \
2199 p += 4; \
2200 } while(0)
2201
2202 /* In narrow builds we can output surrogate pairs as one codepoint,
2203 so we need less space. */
2204#ifndef Py_UNICODE_WIDE
2205 for (i = pairs = 0; i < size-1; i++)
2206 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2207 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2208 pairs++;
2209#endif
2210 v = PyBytes_FromStringAndSize(NULL,
2211 4 * (size - pairs + (byteorder == 0)));
2212 if (v == NULL)
2213 return NULL;
2214
2215 p = (unsigned char *)PyBytes_AS_STRING(v);
2216 if (byteorder == 0)
2217 STORECHAR(0xFEFF);
2218 if (size == 0)
2219 return v;
2220
2221 if (byteorder == -1) {
2222 /* force LE */
2223 iorder[0] = 0;
2224 iorder[1] = 1;
2225 iorder[2] = 2;
2226 iorder[3] = 3;
2227 }
2228 else if (byteorder == 1) {
2229 /* force BE */
2230 iorder[0] = 3;
2231 iorder[1] = 2;
2232 iorder[2] = 1;
2233 iorder[3] = 0;
2234 }
2235
2236 while (size-- > 0) {
2237 Py_UCS4 ch = *s++;
2238#ifndef Py_UNICODE_WIDE
2239 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2240 Py_UCS4 ch2 = *s;
2241 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2242 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2243 s++;
2244 size--;
2245 }
2246 }
2247#endif
2248 STORECHAR(ch);
2249 }
2250 return v;
2251#undef STORECHAR
2252}
2253
2254PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2255{
2256 if (!PyUnicode_Check(unicode)) {
2257 PyErr_BadArgument();
2258 return NULL;
2259 }
2260 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2261 PyUnicode_GET_SIZE(unicode),
2262 NULL,
2263 0);
2264}
2265
Guido van Rossumd57fd912000-03-10 22:53:23 +00002266/* --- UTF-16 Codec ------------------------------------------------------- */
2267
Tim Peters772747b2001-08-09 22:21:55 +00002268PyObject *
2269PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002270 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002271 const char *errors,
2272 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002273{
Walter Dörwald69652032004-09-07 20:24:22 +00002274 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2275}
2276
2277PyObject *
2278PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002279 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002280 const char *errors,
2281 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002282 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002283{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002284 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002285 Py_ssize_t startinpos;
2286 Py_ssize_t endinpos;
2287 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002288 PyUnicodeObject *unicode;
2289 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002290 const unsigned char *q, *e;
2291 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002292 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002293 /* Offsets from q for retrieving byte pairs in the right order. */
2294#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2295 int ihi = 1, ilo = 0;
2296#else
2297 int ihi = 0, ilo = 1;
2298#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002299 PyObject *errorHandler = NULL;
2300 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002301
2302 /* Note: size will always be longer than the resulting Unicode
2303 character count */
2304 unicode = _PyUnicode_New(size);
2305 if (!unicode)
2306 return NULL;
2307 if (size == 0)
2308 return (PyObject *)unicode;
2309
2310 /* Unpack UTF-16 encoded data */
2311 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002312 q = (unsigned char *)s;
2313 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002314
2315 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002316 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002317
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002318 /* Check for BOM marks (U+FEFF) in the input and adjust current
2319 byte order setting accordingly. In native mode, the leading BOM
2320 mark is skipped, in all other modes, it is copied to the output
2321 stream as-is (giving a ZWNBSP character). */
2322 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002323 if (size >= 2) {
2324 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002325#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002326 if (bom == 0xFEFF) {
2327 q += 2;
2328 bo = -1;
2329 }
2330 else if (bom == 0xFFFE) {
2331 q += 2;
2332 bo = 1;
2333 }
Tim Petersced69f82003-09-16 20:30:58 +00002334#else
Walter Dörwald69652032004-09-07 20:24:22 +00002335 if (bom == 0xFEFF) {
2336 q += 2;
2337 bo = 1;
2338 }
2339 else if (bom == 0xFFFE) {
2340 q += 2;
2341 bo = -1;
2342 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002343#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002344 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002345 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002346
Tim Peters772747b2001-08-09 22:21:55 +00002347 if (bo == -1) {
2348 /* force LE */
2349 ihi = 1;
2350 ilo = 0;
2351 }
2352 else if (bo == 1) {
2353 /* force BE */
2354 ihi = 0;
2355 ilo = 1;
2356 }
2357
2358 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002359 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002360 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002361 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002362 if (consumed)
2363 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002364 errmsg = "truncated data";
2365 startinpos = ((const char *)q)-starts;
2366 endinpos = ((const char *)e)-starts;
2367 goto utf16Error;
2368 /* The remaining input chars are ignored if the callback
2369 chooses to skip the input */
2370 }
2371 ch = (q[ihi] << 8) | q[ilo];
2372
Tim Peters772747b2001-08-09 22:21:55 +00002373 q += 2;
2374
Guido van Rossumd57fd912000-03-10 22:53:23 +00002375 if (ch < 0xD800 || ch > 0xDFFF) {
2376 *p++ = ch;
2377 continue;
2378 }
2379
2380 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002381 if (q >= e) {
2382 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002383 startinpos = (((const char *)q)-2)-starts;
2384 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002385 goto utf16Error;
2386 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002387 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002388 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2389 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002390 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002391#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002392 *p++ = ch;
2393 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002394#else
2395 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002396#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002397 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002398 }
2399 else {
2400 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002401 startinpos = (((const char *)q)-4)-starts;
2402 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002403 goto utf16Error;
2404 }
2405
Guido van Rossumd57fd912000-03-10 22:53:23 +00002406 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002407 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002408 startinpos = (((const char *)q)-2)-starts;
2409 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002410 /* Fall through to report the error */
2411
2412 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002413 outpos = p-PyUnicode_AS_UNICODE(unicode);
2414 if (unicode_decode_call_errorhandler(
2415 errors, &errorHandler,
2416 "utf16", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002417 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002418 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002419 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002420 }
2421
2422 if (byteorder)
2423 *byteorder = bo;
2424
Walter Dörwald69652032004-09-07 20:24:22 +00002425 if (consumed)
2426 *consumed = (const char *)q-starts;
2427
Guido van Rossumd57fd912000-03-10 22:53:23 +00002428 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002429 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002430 goto onError;
2431
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002432 Py_XDECREF(errorHandler);
2433 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002434 return (PyObject *)unicode;
2435
2436onError:
2437 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002438 Py_XDECREF(errorHandler);
2439 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002440 return NULL;
2441}
2442
Tim Peters772747b2001-08-09 22:21:55 +00002443PyObject *
2444PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002445 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002446 const char *errors,
2447 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002448{
2449 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002450 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002451#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00002452 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002453#else
2454 const int pairs = 0;
2455#endif
Tim Peters772747b2001-08-09 22:21:55 +00002456 /* Offsets from p for storing byte pairs in the right order. */
2457#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2458 int ihi = 1, ilo = 0;
2459#else
2460 int ihi = 0, ilo = 1;
2461#endif
2462
2463#define STORECHAR(CH) \
2464 do { \
2465 p[ihi] = ((CH) >> 8) & 0xff; \
2466 p[ilo] = (CH) & 0xff; \
2467 p += 2; \
2468 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002469
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002470#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002471 for (i = pairs = 0; i < size; i++)
2472 if (s[i] >= 0x10000)
2473 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002474#endif
Walter Dörwald3cc34522007-05-04 10:48:27 +00002475 v = PyBytes_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002476 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002477 if (v == NULL)
2478 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002479
Walter Dörwald3cc34522007-05-04 10:48:27 +00002480 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002481 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002482 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002483 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002484 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002485
2486 if (byteorder == -1) {
2487 /* force LE */
2488 ihi = 1;
2489 ilo = 0;
2490 }
2491 else if (byteorder == 1) {
2492 /* force BE */
2493 ihi = 0;
2494 ilo = 1;
2495 }
2496
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002497 while (size-- > 0) {
2498 Py_UNICODE ch = *s++;
2499 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002500#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002501 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002502 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2503 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002504 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002505#endif
Tim Peters772747b2001-08-09 22:21:55 +00002506 STORECHAR(ch);
2507 if (ch2)
2508 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002509 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002510 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002511#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002512}
2513
2514PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2515{
2516 if (!PyUnicode_Check(unicode)) {
2517 PyErr_BadArgument();
2518 return NULL;
2519 }
2520 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2521 PyUnicode_GET_SIZE(unicode),
2522 NULL,
2523 0);
2524}
2525
2526/* --- Unicode Escape Codec ----------------------------------------------- */
2527
Fredrik Lundh06d12682001-01-24 07:59:11 +00002528static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002529
Guido van Rossumd57fd912000-03-10 22:53:23 +00002530PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002531 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002532 const char *errors)
2533{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002534 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002535 Py_ssize_t startinpos;
2536 Py_ssize_t endinpos;
2537 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002538 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002539 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002540 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002541 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002542 char* message;
2543 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002544 PyObject *errorHandler = NULL;
2545 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002546
Guido van Rossumd57fd912000-03-10 22:53:23 +00002547 /* Escaped strings will always be longer than the resulting
2548 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002549 length after conversion to the true value.
2550 (but if the error callback returns a long replacement string
2551 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002552 v = _PyUnicode_New(size);
2553 if (v == NULL)
2554 goto onError;
2555 if (size == 0)
2556 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002557
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002558 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002559 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002560
Guido van Rossumd57fd912000-03-10 22:53:23 +00002561 while (s < end) {
2562 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002563 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002564 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002565
2566 /* Non-escape characters are interpreted as Unicode ordinals */
2567 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002568 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002569 continue;
2570 }
2571
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002572 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002573 /* \ - Escapes */
2574 s++;
2575 switch (*s++) {
2576
2577 /* \x escapes */
2578 case '\n': break;
2579 case '\\': *p++ = '\\'; break;
2580 case '\'': *p++ = '\''; break;
2581 case '\"': *p++ = '\"'; break;
2582 case 'b': *p++ = '\b'; break;
2583 case 'f': *p++ = '\014'; break; /* FF */
2584 case 't': *p++ = '\t'; break;
2585 case 'n': *p++ = '\n'; break;
2586 case 'r': *p++ = '\r'; break;
2587 case 'v': *p++ = '\013'; break; /* VT */
2588 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2589
2590 /* \OOO (octal) escapes */
2591 case '0': case '1': case '2': case '3':
2592 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002593 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002594 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002595 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002596 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002597 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002598 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002599 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002600 break;
2601
Fredrik Lundhccc74732001-02-18 22:13:49 +00002602 /* hex escapes */
2603 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002604 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002605 digits = 2;
2606 message = "truncated \\xXX escape";
2607 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002608
Fredrik Lundhccc74732001-02-18 22:13:49 +00002609 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002610 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002611 digits = 4;
2612 message = "truncated \\uXXXX escape";
2613 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002614
Fredrik Lundhccc74732001-02-18 22:13:49 +00002615 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002616 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002617 digits = 8;
2618 message = "truncated \\UXXXXXXXX escape";
2619 hexescape:
2620 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002621 outpos = p-PyUnicode_AS_UNICODE(v);
2622 if (s+digits>end) {
2623 endinpos = size;
2624 if (unicode_decode_call_errorhandler(
2625 errors, &errorHandler,
2626 "unicodeescape", "end of string in escape sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002627 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002628 (PyObject **)&v, &outpos, &p))
2629 goto onError;
2630 goto nextByte;
2631 }
2632 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002633 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002634 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002635 endinpos = (s+i+1)-starts;
2636 if (unicode_decode_call_errorhandler(
2637 errors, &errorHandler,
2638 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002639 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002640 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002641 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002642 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002643 }
2644 chr = (chr<<4) & ~0xF;
2645 if (c >= '0' && c <= '9')
2646 chr += c - '0';
2647 else if (c >= 'a' && c <= 'f')
2648 chr += 10 + c - 'a';
2649 else
2650 chr += 10 + c - 'A';
2651 }
2652 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002653 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002654 /* _decoding_error will have already written into the
2655 target buffer. */
2656 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002657 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002658 /* when we get here, chr is a 32-bit unicode character */
2659 if (chr <= 0xffff)
2660 /* UCS-2 character */
2661 *p++ = (Py_UNICODE) chr;
2662 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002663 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002664 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002665#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002666 *p++ = chr;
2667#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002668 chr -= 0x10000L;
2669 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002670 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002671#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002672 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002673 endinpos = s-starts;
2674 outpos = p-PyUnicode_AS_UNICODE(v);
2675 if (unicode_decode_call_errorhandler(
2676 errors, &errorHandler,
2677 "unicodeescape", "illegal Unicode character",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002678 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002679 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002680 goto onError;
2681 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002682 break;
2683
2684 /* \N{name} */
2685 case 'N':
2686 message = "malformed \\N character escape";
2687 if (ucnhash_CAPI == NULL) {
2688 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002689 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002690 m = PyImport_ImportModule("unicodedata");
2691 if (m == NULL)
2692 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002693 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002694 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002695 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002696 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002697 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002698 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002699 if (ucnhash_CAPI == NULL)
2700 goto ucnhashError;
2701 }
2702 if (*s == '{') {
2703 const char *start = s+1;
2704 /* look for the closing brace */
2705 while (*s != '}' && s < end)
2706 s++;
2707 if (s > start && s < end && *s == '}') {
2708 /* found a name. look it up in the unicode database */
2709 message = "unknown Unicode character name";
2710 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002711 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002712 goto store;
2713 }
2714 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002715 endinpos = s-starts;
2716 outpos = p-PyUnicode_AS_UNICODE(v);
2717 if (unicode_decode_call_errorhandler(
2718 errors, &errorHandler,
2719 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002720 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002721 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002722 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002723 break;
2724
2725 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002726 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002727 message = "\\ at end of string";
2728 s--;
2729 endinpos = s-starts;
2730 outpos = p-PyUnicode_AS_UNICODE(v);
2731 if (unicode_decode_call_errorhandler(
2732 errors, &errorHandler,
2733 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002734 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002735 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002736 goto onError;
2737 }
2738 else {
2739 *p++ = '\\';
2740 *p++ = (unsigned char)s[-1];
2741 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002742 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002743 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002744 nextByte:
2745 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002746 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002747 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002748 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002749 Py_XDECREF(errorHandler);
2750 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002751 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002752
Fredrik Lundhccc74732001-02-18 22:13:49 +00002753ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002754 PyErr_SetString(
2755 PyExc_UnicodeError,
2756 "\\N escapes not supported (can't load unicodedata module)"
2757 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002758 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002759 Py_XDECREF(errorHandler);
2760 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002761 return NULL;
2762
Fredrik Lundhccc74732001-02-18 22:13:49 +00002763onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002764 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002765 Py_XDECREF(errorHandler);
2766 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002767 return NULL;
2768}
2769
/* Return a Unicode-Escape encoded version of the Unicode data.

   The result holds only the escaped text; no surrounding quotes are
   added.

*/
2776
Thomas Wouters477c8d52006-05-27 19:21:47 +00002777Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2778 Py_ssize_t size,
2779 Py_UNICODE ch)
2780{
2781 /* like wcschr, but doesn't stop at NULL characters */
2782
2783 while (size-- > 0) {
2784 if (*s == ch)
2785 return s;
2786 s++;
2787 }
2788
2789 return NULL;
2790}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002791
Walter Dörwald79e913e2007-05-12 11:08:06 +00002792static const char *hexdigits = "0123456789abcdef";
2793
2794PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2795 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002796{
2797 PyObject *repr;
2798 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002799
Thomas Wouters89f507f2006-12-13 04:49:30 +00002800 /* XXX(nnorwitz): rather than over-allocating, it would be
2801 better to choose a different scheme. Perhaps scan the
2802 first N-chars of the string and allocate based on that size.
2803 */
2804 /* Initial allocation is based on the longest-possible unichr
2805 escape.
2806
2807 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2808 unichr, so in this case it's the longest unichr escape. In
2809 narrow (UTF-16) builds this is five chars per source unichr
2810 since there are two unichrs in the surrogate pair, so in narrow
2811 (UTF-16) builds it's not the longest unichr escape.
2812
2813 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2814 so in the narrow (UTF-16) build case it's the longest unichr
2815 escape.
2816 */
2817
Walter Dörwald79e913e2007-05-12 11:08:06 +00002818 repr = PyBytes_FromStringAndSize(NULL,
Thomas Wouters89f507f2006-12-13 04:49:30 +00002819#ifdef Py_UNICODE_WIDE
2820 + 10*size
2821#else
2822 + 6*size
2823#endif
2824 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002825 if (repr == NULL)
2826 return NULL;
2827
Walter Dörwald79e913e2007-05-12 11:08:06 +00002828 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002829
Guido van Rossumd57fd912000-03-10 22:53:23 +00002830 while (size-- > 0) {
2831 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002832
Walter Dörwald79e913e2007-05-12 11:08:06 +00002833 /* Escape backslashes */
2834 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002835 *p++ = '\\';
2836 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00002837 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002838 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002839
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002840#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002841 /* Map 21-bit characters to '\U00xxxxxx' */
2842 else if (ch >= 0x10000) {
2843 *p++ = '\\';
2844 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002845 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
2846 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
2847 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
2848 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
2849 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
2850 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
2851 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
2852 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002853 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002854 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002855#else
2856 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002857 else if (ch >= 0xD800 && ch < 0xDC00) {
2858 Py_UNICODE ch2;
2859 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002860
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002861 ch2 = *s++;
2862 size--;
2863 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2864 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2865 *p++ = '\\';
2866 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002867 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
2868 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
2869 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
2870 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
2871 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
2872 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
2873 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
2874 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002875 continue;
2876 }
2877 /* Fall through: isolated surrogates are copied as-is */
2878 s--;
2879 size++;
2880 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002881#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002882
Guido van Rossumd57fd912000-03-10 22:53:23 +00002883 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002884 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002885 *p++ = '\\';
2886 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002887 *p++ = hexdigits[(ch >> 12) & 0x000F];
2888 *p++ = hexdigits[(ch >> 8) & 0x000F];
2889 *p++ = hexdigits[(ch >> 4) & 0x000F];
2890 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002891 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002892
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002893 /* Map special whitespace to '\t', \n', '\r' */
2894 else if (ch == '\t') {
2895 *p++ = '\\';
2896 *p++ = 't';
2897 }
2898 else if (ch == '\n') {
2899 *p++ = '\\';
2900 *p++ = 'n';
2901 }
2902 else if (ch == '\r') {
2903 *p++ = '\\';
2904 *p++ = 'r';
2905 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002906
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002907 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002908 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002909 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002910 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002911 *p++ = hexdigits[(ch >> 4) & 0x000F];
2912 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002913 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002914
Guido van Rossumd57fd912000-03-10 22:53:23 +00002915 /* Copy everything else as-is */
2916 else
2917 *p++ = (char) ch;
2918 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002919
2920 *p = '\0';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002921 if (PyBytes_Resize(repr, p - PyBytes_AS_STRING(repr))) {
2922 Py_DECREF(repr);
2923 return NULL;
2924 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002925 return repr;
2926}
2927
Guido van Rossumd57fd912000-03-10 22:53:23 +00002928PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2929{
Walter Dörwald79e913e2007-05-12 11:08:06 +00002930 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002931 if (!PyUnicode_Check(unicode)) {
2932 PyErr_BadArgument();
2933 return NULL;
2934 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00002935 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2936 PyUnicode_GET_SIZE(unicode));
2937
2938 if (!s)
2939 return NULL;
2940 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2941 PyBytes_GET_SIZE(s));
2942 Py_DECREF(s);
2943 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002944}
2945
2946/* --- Raw Unicode Escape Codec ------------------------------------------- */
2947
2948PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002949 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002950 const char *errors)
2951{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002952 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002953 Py_ssize_t startinpos;
2954 Py_ssize_t endinpos;
2955 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002956 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002957 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002958 const char *end;
2959 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002960 PyObject *errorHandler = NULL;
2961 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002962
Guido van Rossumd57fd912000-03-10 22:53:23 +00002963 /* Escaped strings will always be longer than the resulting
2964 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002965 length after conversion to the true value. (But decoding error
2966 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002967 v = _PyUnicode_New(size);
2968 if (v == NULL)
2969 goto onError;
2970 if (size == 0)
2971 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002972 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002973 end = s + size;
2974 while (s < end) {
2975 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002976 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002977 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002978 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002979
2980 /* Non-escape characters are interpreted as Unicode ordinals */
2981 if (*s != '\\') {
2982 *p++ = (unsigned char)*s++;
2983 continue;
2984 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002985 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002986
2987 /* \u-escapes are only interpreted iff the number of leading
2988 backslashes if odd */
2989 bs = s;
2990 for (;s < end;) {
2991 if (*s != '\\')
2992 break;
2993 *p++ = (unsigned char)*s++;
2994 }
2995 if (((s - bs) & 1) == 0 ||
2996 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002997 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002998 continue;
2999 }
3000 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003001 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003002 s++;
3003
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003004 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003005 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003006 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003007 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003008 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003009 endinpos = s-starts;
3010 if (unicode_decode_call_errorhandler(
3011 errors, &errorHandler,
3012 "rawunicodeescape", "truncated \\uXXXX",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003013 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003014 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003015 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003016 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003017 }
3018 x = (x<<4) & ~0xF;
3019 if (c >= '0' && c <= '9')
3020 x += c - '0';
3021 else if (c >= 'a' && c <= 'f')
3022 x += 10 + c - 'a';
3023 else
3024 x += 10 + c - 'A';
3025 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003026#ifndef Py_UNICODE_WIDE
3027 if (x > 0x10000) {
3028 if (unicode_decode_call_errorhandler(
3029 errors, &errorHandler,
3030 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003031 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003032 (PyObject **)&v, &outpos, &p))
3033 goto onError;
3034 }
3035#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003036 *p++ = x;
3037 nextByte:
3038 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003039 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003040 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003041 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003042 Py_XDECREF(errorHandler);
3043 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003044 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003045
Guido van Rossumd57fd912000-03-10 22:53:23 +00003046 onError:
3047 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003048 Py_XDECREF(errorHandler);
3049 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003050 return NULL;
3051}
3052
3053PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003054 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003055{
3056 PyObject *repr;
3057 char *p;
3058 char *q;
3059
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003060#ifdef Py_UNICODE_WIDE
Walter Dörwald711005d2007-05-12 12:03:26 +00003061 repr = PyBytes_FromStringAndSize(NULL, 10 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003062#else
Walter Dörwald711005d2007-05-12 12:03:26 +00003063 repr = PyBytes_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003064#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003065 if (repr == NULL)
3066 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003067 if (size == 0)
3068 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003069
Walter Dörwald711005d2007-05-12 12:03:26 +00003070 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003071 while (size-- > 0) {
3072 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003073#ifdef Py_UNICODE_WIDE
3074 /* Map 32-bit characters to '\Uxxxxxxxx' */
3075 if (ch >= 0x10000) {
3076 *p++ = '\\';
3077 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003078 *p++ = hexdigits[(ch >> 28) & 0xf];
3079 *p++ = hexdigits[(ch >> 24) & 0xf];
3080 *p++ = hexdigits[(ch >> 20) & 0xf];
3081 *p++ = hexdigits[(ch >> 16) & 0xf];
3082 *p++ = hexdigits[(ch >> 12) & 0xf];
3083 *p++ = hexdigits[(ch >> 8) & 0xf];
3084 *p++ = hexdigits[(ch >> 4) & 0xf];
3085 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003086 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003087 else
3088#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003089 /* Map 16-bit characters to '\uxxxx' */
3090 if (ch >= 256) {
3091 *p++ = '\\';
3092 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003093 *p++ = hexdigits[(ch >> 12) & 0xf];
3094 *p++ = hexdigits[(ch >> 8) & 0xf];
3095 *p++ = hexdigits[(ch >> 4) & 0xf];
3096 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003097 }
3098 /* Copy everything else as-is */
3099 else
3100 *p++ = (char) ch;
3101 }
3102 *p = '\0';
Walter Dörwald711005d2007-05-12 12:03:26 +00003103 if (PyBytes_Resize(repr, p - q)) {
3104 Py_DECREF(repr);
3105 return NULL;
3106 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003107 return repr;
3108}
3109
3110PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3111{
Walter Dörwald711005d2007-05-12 12:03:26 +00003112 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003113 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003114 PyErr_BadArgument();
3115 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003116 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003117 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3118 PyUnicode_GET_SIZE(unicode));
3119
3120 if (!s)
3121 return NULL;
3122 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
3123 PyBytes_GET_SIZE(s));
3124 Py_DECREF(s);
3125 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003126}
3127
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003128/* --- Unicode Internal Codec ------------------------------------------- */
3129
3130PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003131 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003132 const char *errors)
3133{
3134 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003135 Py_ssize_t startinpos;
3136 Py_ssize_t endinpos;
3137 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003138 PyUnicodeObject *v;
3139 Py_UNICODE *p;
3140 const char *end;
3141 const char *reason;
3142 PyObject *errorHandler = NULL;
3143 PyObject *exc = NULL;
3144
Neal Norwitzd43069c2006-01-08 01:12:10 +00003145#ifdef Py_UNICODE_WIDE
3146 Py_UNICODE unimax = PyUnicode_GetMax();
3147#endif
3148
Thomas Wouters89f507f2006-12-13 04:49:30 +00003149 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003150 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3151 if (v == NULL)
3152 goto onError;
3153 if (PyUnicode_GetSize((PyObject *)v) == 0)
3154 return (PyObject *)v;
3155 p = PyUnicode_AS_UNICODE(v);
3156 end = s + size;
3157
3158 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00003159 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003160 /* We have to sanity check the raw data, otherwise doom looms for
3161 some malformed UCS-4 data. */
3162 if (
3163 #ifdef Py_UNICODE_WIDE
3164 *p > unimax || *p < 0 ||
3165 #endif
3166 end-s < Py_UNICODE_SIZE
3167 )
3168 {
3169 startinpos = s - starts;
3170 if (end-s < Py_UNICODE_SIZE) {
3171 endinpos = end-starts;
3172 reason = "truncated input";
3173 }
3174 else {
3175 endinpos = s - starts + Py_UNICODE_SIZE;
3176 reason = "illegal code point (> 0x10FFFF)";
3177 }
3178 outpos = p - PyUnicode_AS_UNICODE(v);
3179 if (unicode_decode_call_errorhandler(
3180 errors, &errorHandler,
3181 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003182 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003183 (PyObject **)&v, &outpos, &p)) {
3184 goto onError;
3185 }
3186 }
3187 else {
3188 p++;
3189 s += Py_UNICODE_SIZE;
3190 }
3191 }
3192
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003193 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003194 goto onError;
3195 Py_XDECREF(errorHandler);
3196 Py_XDECREF(exc);
3197 return (PyObject *)v;
3198
3199 onError:
3200 Py_XDECREF(v);
3201 Py_XDECREF(errorHandler);
3202 Py_XDECREF(exc);
3203 return NULL;
3204}
3205
Guido van Rossumd57fd912000-03-10 22:53:23 +00003206/* --- Latin-1 Codec ------------------------------------------------------ */
3207
3208PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003209 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003210 const char *errors)
3211{
3212 PyUnicodeObject *v;
3213 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003214
Guido van Rossumd57fd912000-03-10 22:53:23 +00003215 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003216 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003217 Py_UNICODE r = *(unsigned char*)s;
3218 return PyUnicode_FromUnicode(&r, 1);
3219 }
3220
Guido van Rossumd57fd912000-03-10 22:53:23 +00003221 v = _PyUnicode_New(size);
3222 if (v == NULL)
3223 goto onError;
3224 if (size == 0)
3225 return (PyObject *)v;
3226 p = PyUnicode_AS_UNICODE(v);
3227 while (size-- > 0)
3228 *p++ = (unsigned char)*s++;
3229 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003230
Guido van Rossumd57fd912000-03-10 22:53:23 +00003231 onError:
3232 Py_XDECREF(v);
3233 return NULL;
3234}
3235
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003236/* create or adjust a UnicodeEncodeError */
3237static void make_encode_exception(PyObject **exceptionObject,
3238 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003239 const Py_UNICODE *unicode, Py_ssize_t size,
3240 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003241 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003242{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003243 if (*exceptionObject == NULL) {
3244 *exceptionObject = PyUnicodeEncodeError_Create(
3245 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003246 }
3247 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003248 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3249 goto onError;
3250 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3251 goto onError;
3252 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3253 goto onError;
3254 return;
3255 onError:
3256 Py_DECREF(*exceptionObject);
3257 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003258 }
3259}
3260
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003261/* raises a UnicodeEncodeError */
3262static void raise_encode_exception(PyObject **exceptionObject,
3263 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003264 const Py_UNICODE *unicode, Py_ssize_t size,
3265 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003266 const char *reason)
3267{
3268 make_encode_exception(exceptionObject,
3269 encoding, unicode, size, startpos, endpos, reason);
3270 if (*exceptionObject != NULL)
3271 PyCodec_StrictErrors(*exceptionObject);
3272}
3273
3274/* error handling callback helper:
3275 build arguments, call the callback and check the arguments,
3276 put the result into newpos and return the replacement string, which
3277 has to be freed by the caller */
3278static PyObject *unicode_encode_call_errorhandler(const char *errors,
3279 PyObject **errorHandler,
3280 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003281 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3282 Py_ssize_t startpos, Py_ssize_t endpos,
3283 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003284{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003285 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003286
3287 PyObject *restuple;
3288 PyObject *resunicode;
3289
3290 if (*errorHandler == NULL) {
3291 *errorHandler = PyCodec_LookupError(errors);
3292 if (*errorHandler == NULL)
3293 return NULL;
3294 }
3295
3296 make_encode_exception(exceptionObject,
3297 encoding, unicode, size, startpos, endpos, reason);
3298 if (*exceptionObject == NULL)
3299 return NULL;
3300
3301 restuple = PyObject_CallFunctionObjArgs(
3302 *errorHandler, *exceptionObject, NULL);
3303 if (restuple == NULL)
3304 return NULL;
3305 if (!PyTuple_Check(restuple)) {
3306 PyErr_Format(PyExc_TypeError, &argparse[4]);
3307 Py_DECREF(restuple);
3308 return NULL;
3309 }
3310 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3311 &resunicode, newpos)) {
3312 Py_DECREF(restuple);
3313 return NULL;
3314 }
3315 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003316 *newpos = size+*newpos;
3317 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003318 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003319 Py_DECREF(restuple);
3320 return NULL;
3321 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003322 Py_INCREF(resunicode);
3323 Py_DECREF(restuple);
3324 return resunicode;
3325}
3326
3327static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003328 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003329 const char *errors,
3330 int limit)
3331{
3332 /* output object */
3333 PyObject *res;
3334 /* pointers to the beginning and end+1 of input */
3335 const Py_UNICODE *startp = p;
3336 const Py_UNICODE *endp = p + size;
3337 /* pointer to the beginning of the unencodable characters */
3338 /* const Py_UNICODE *badp = NULL; */
3339 /* pointer into the output */
3340 char *str;
3341 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003342 Py_ssize_t respos = 0;
3343 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003344 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3345 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003346 PyObject *errorHandler = NULL;
3347 PyObject *exc = NULL;
3348 /* the following variable is used for caching string comparisons
3349 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3350 int known_errorHandler = -1;
3351
3352 /* allocate enough for a simple encoding without
3353 replacements, if we need more, we'll resize */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003354 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003355 if (res == NULL)
3356 goto onError;
3357 if (size == 0)
3358 return res;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003359 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003360 ressize = size;
3361
3362 while (p<endp) {
3363 Py_UNICODE c = *p;
3364
3365 /* can we encode this? */
3366 if (c<limit) {
3367 /* no overflow check, because we know that the space is enough */
3368 *str++ = (char)c;
3369 ++p;
3370 }
3371 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003372 Py_ssize_t unicodepos = p-startp;
3373 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003374 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003375 Py_ssize_t repsize;
3376 Py_ssize_t newpos;
3377 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003378 Py_UNICODE *uni2;
3379 /* startpos for collecting unencodable chars */
3380 const Py_UNICODE *collstart = p;
3381 const Py_UNICODE *collend = p;
3382 /* find all unecodable characters */
3383 while ((collend < endp) && ((*collend)>=limit))
3384 ++collend;
3385 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3386 if (known_errorHandler==-1) {
3387 if ((errors==NULL) || (!strcmp(errors, "strict")))
3388 known_errorHandler = 1;
3389 else if (!strcmp(errors, "replace"))
3390 known_errorHandler = 2;
3391 else if (!strcmp(errors, "ignore"))
3392 known_errorHandler = 3;
3393 else if (!strcmp(errors, "xmlcharrefreplace"))
3394 known_errorHandler = 4;
3395 else
3396 known_errorHandler = 0;
3397 }
3398 switch (known_errorHandler) {
3399 case 1: /* strict */
3400 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3401 goto onError;
3402 case 2: /* replace */
3403 while (collstart++<collend)
3404 *str++ = '?'; /* fall through */
3405 case 3: /* ignore */
3406 p = collend;
3407 break;
3408 case 4: /* xmlcharrefreplace */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003409 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003410 /* determine replacement size (temporarily (mis)uses p) */
3411 for (p = collstart, repsize = 0; p < collend; ++p) {
3412 if (*p<10)
3413 repsize += 2+1+1;
3414 else if (*p<100)
3415 repsize += 2+2+1;
3416 else if (*p<1000)
3417 repsize += 2+3+1;
3418 else if (*p<10000)
3419 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003420#ifndef Py_UNICODE_WIDE
3421 else
3422 repsize += 2+5+1;
3423#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003424 else if (*p<100000)
3425 repsize += 2+5+1;
3426 else if (*p<1000000)
3427 repsize += 2+6+1;
3428 else
3429 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003430#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003431 }
3432 requiredsize = respos+repsize+(endp-collend);
3433 if (requiredsize > ressize) {
3434 if (requiredsize<2*ressize)
3435 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003436 if (PyBytes_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003437 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003438 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003439 ressize = requiredsize;
3440 }
3441 /* generate replacement (temporarily (mis)uses p) */
3442 for (p = collstart; p < collend; ++p) {
3443 str += sprintf(str, "&#%d;", (int)*p);
3444 }
3445 p = collend;
3446 break;
3447 default:
3448 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3449 encoding, reason, startp, size, &exc,
3450 collstart-startp, collend-startp, &newpos);
3451 if (repunicode == NULL)
3452 goto onError;
3453 /* need more space? (at least enough for what we
3454 have+the replacement+the rest of the string, so
3455 we won't have to check space for encodable characters) */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003456 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003457 repsize = PyUnicode_GET_SIZE(repunicode);
3458 requiredsize = respos+repsize+(endp-collend);
3459 if (requiredsize > ressize) {
3460 if (requiredsize<2*ressize)
3461 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003462 if (PyBytes_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003463 Py_DECREF(repunicode);
3464 goto onError;
3465 }
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003466 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003467 ressize = requiredsize;
3468 }
3469 /* check if there is anything unencodable in the replacement
3470 and copy it to the output */
3471 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3472 c = *uni2;
3473 if (c >= limit) {
3474 raise_encode_exception(&exc, encoding, startp, size,
3475 unicodepos, unicodepos+1, reason);
3476 Py_DECREF(repunicode);
3477 goto onError;
3478 }
3479 *str = (char)c;
3480 }
3481 p = startp + newpos;
3482 Py_DECREF(repunicode);
3483 }
3484 }
3485 }
3486 /* Resize if we allocated to much */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003487 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003488 if (respos<ressize)
3489 /* If this falls res will be NULL */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003490 PyBytes_Resize(res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003491 Py_XDECREF(errorHandler);
3492 Py_XDECREF(exc);
3493 return res;
3494
3495 onError:
3496 Py_XDECREF(res);
3497 Py_XDECREF(errorHandler);
3498 Py_XDECREF(exc);
3499 return NULL;
3500}
3501
Guido van Rossumd57fd912000-03-10 22:53:23 +00003502PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003503 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003504 const char *errors)
3505{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003506 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003507}
3508
3509PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3510{
3511 if (!PyUnicode_Check(unicode)) {
3512 PyErr_BadArgument();
3513 return NULL;
3514 }
3515 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3516 PyUnicode_GET_SIZE(unicode),
3517 NULL);
3518}
3519
3520/* --- 7-bit ASCII Codec -------------------------------------------------- */
3521
Guido van Rossumd57fd912000-03-10 22:53:23 +00003522PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003523 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003524 const char *errors)
3525{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003526 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003527 PyUnicodeObject *v;
3528 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003529 Py_ssize_t startinpos;
3530 Py_ssize_t endinpos;
3531 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003532 const char *e;
3533 PyObject *errorHandler = NULL;
3534 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003535
Guido van Rossumd57fd912000-03-10 22:53:23 +00003536 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003537 if (size == 1 && *(unsigned char*)s < 128) {
3538 Py_UNICODE r = *(unsigned char*)s;
3539 return PyUnicode_FromUnicode(&r, 1);
3540 }
Tim Petersced69f82003-09-16 20:30:58 +00003541
Guido van Rossumd57fd912000-03-10 22:53:23 +00003542 v = _PyUnicode_New(size);
3543 if (v == NULL)
3544 goto onError;
3545 if (size == 0)
3546 return (PyObject *)v;
3547 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003548 e = s + size;
3549 while (s < e) {
3550 register unsigned char c = (unsigned char)*s;
3551 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003552 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003553 ++s;
3554 }
3555 else {
3556 startinpos = s-starts;
3557 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003558 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003559 if (unicode_decode_call_errorhandler(
3560 errors, &errorHandler,
3561 "ascii", "ordinal not in range(128)",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003562 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003563 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003564 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003565 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003566 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00003567 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003568 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003569 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003570 Py_XDECREF(errorHandler);
3571 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003572 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003573
Guido van Rossumd57fd912000-03-10 22:53:23 +00003574 onError:
3575 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003576 Py_XDECREF(errorHandler);
3577 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003578 return NULL;
3579}
3580
Guido van Rossumd57fd912000-03-10 22:53:23 +00003581PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003582 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003583 const char *errors)
3584{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003585 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003586}
3587
3588PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3589{
3590 if (!PyUnicode_Check(unicode)) {
3591 PyErr_BadArgument();
3592 return NULL;
3593 }
3594 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3595 PyUnicode_GET_SIZE(unicode),
3596 NULL);
3597}
3598
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003599#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003600
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003601/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003602
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003603#if SIZEOF_INT < SIZEOF_SSIZE_T
3604#define NEED_RETRY
3605#endif
3606
3607/* XXX This code is limited to "true" double-byte encodings, as
3608 a) it assumes an incomplete character consists of a single byte, and
3609 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3610 encodings, see IsDBCSLeadByteEx documentation. */
3611
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead
   byte, 0 otherwise.

   A byte that IsDBCSLeadByte() accepts is only counted when the character
   before it (as located by CharPrev()) does not claim it as a trail
   byte. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    if (curr - prev == 2)
        return 1;
    return 0;
}
3622
3623/*
3624 * Decode MBCS string into unicode object. If 'final' is set, converts
3625 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3626 */
3627static int decode_mbcs(PyUnicodeObject **v,
3628 const char *s, /* MBCS string */
3629 int size, /* sizeof MBCS string */
3630 int final)
3631{
3632 Py_UNICODE *p;
3633 Py_ssize_t n = 0;
3634 int usize = 0;
3635
3636 assert(size >= 0);
3637
3638 /* Skip trailing lead-byte unless 'final' is set */
3639 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3640 --size;
3641
3642 /* First get the size of the result */
3643 if (size > 0) {
3644 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3645 if (usize == 0) {
3646 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3647 return -1;
3648 }
3649 }
3650
3651 if (*v == NULL) {
3652 /* Create unicode object */
3653 *v = _PyUnicode_New(usize);
3654 if (*v == NULL)
3655 return -1;
3656 }
3657 else {
3658 /* Extend unicode object */
3659 n = PyUnicode_GET_SIZE(*v);
3660 if (_PyUnicode_Resize(v, n + usize) < 0)
3661 return -1;
3662 }
3663
3664 /* Do the conversion */
3665 if (size > 0) {
3666 p = PyUnicode_AS_UNICODE(*v) + n;
3667 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3668 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3669 return -1;
3670 }
3671 }
3672
3673 return size;
3674}
3675
3676PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3677 Py_ssize_t size,
3678 const char *errors,
3679 Py_ssize_t *consumed)
3680{
3681 PyUnicodeObject *v = NULL;
3682 int done;
3683
3684 if (consumed)
3685 *consumed = 0;
3686
3687#ifdef NEED_RETRY
3688 retry:
3689 if (size > INT_MAX)
3690 done = decode_mbcs(&v, s, INT_MAX, 0);
3691 else
3692#endif
3693 done = decode_mbcs(&v, s, (int)size, !consumed);
3694
3695 if (done < 0) {
3696 Py_XDECREF(v);
3697 return NULL;
3698 }
3699
3700 if (consumed)
3701 *consumed += done;
3702
3703#ifdef NEED_RETRY
3704 if (size > INT_MAX) {
3705 s += done;
3706 size -= done;
3707 goto retry;
3708 }
3709#endif
3710
3711 return (PyObject *)v;
3712}
3713
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003714PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003715 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003716 const char *errors)
3717{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003718 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3719}
3720
3721/*
3722 * Convert unicode into string object (MBCS).
3723 * Returns 0 if succeed, -1 otherwise.
3724 */
3725static int encode_mbcs(PyObject **repr,
3726 const Py_UNICODE *p, /* unicode */
3727 int size) /* size of unicode */
3728{
3729 int mbcssize = 0;
3730 Py_ssize_t n = 0;
3731
3732 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003733
3734 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003735 if (size > 0) {
3736 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3737 if (mbcssize == 0) {
3738 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3739 return -1;
3740 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003741 }
3742
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003743 if (*repr == NULL) {
3744 /* Create string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003745 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003746 if (*repr == NULL)
3747 return -1;
3748 }
3749 else {
3750 /* Extend string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003751 n = PyBytes_Size(*repr);
3752 if (PyBytes_Resize(*repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003753 return -1;
3754 }
3755
3756 /* Do the conversion */
3757 if (size > 0) {
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003758 char *s = PyBytes_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003759 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3760 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3761 return -1;
3762 }
3763 }
3764
3765 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003766}
3767
3768PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003769 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003770 const char *errors)
3771{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003772 PyObject *repr = NULL;
3773 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003774
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003775#ifdef NEED_RETRY
3776 retry:
3777 if (size > INT_MAX)
3778 ret = encode_mbcs(&repr, p, INT_MAX);
3779 else
3780#endif
3781 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003782
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003783 if (ret < 0) {
3784 Py_XDECREF(repr);
3785 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003786 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003787
3788#ifdef NEED_RETRY
3789 if (size > INT_MAX) {
3790 p += INT_MAX;
3791 size -= INT_MAX;
3792 goto retry;
3793 }
3794#endif
3795
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003796 return repr;
3797}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003798
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003799PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3800{
3801 if (!PyUnicode_Check(unicode)) {
3802 PyErr_BadArgument();
3803 return NULL;
3804 }
3805 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3806 PyUnicode_GET_SIZE(unicode),
3807 NULL);
3808}
3809
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003810#undef NEED_RETRY
3811
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003812#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003813
Guido van Rossumd57fd912000-03-10 22:53:23 +00003814/* --- Character Mapping Codec -------------------------------------------- */
3815
Guido van Rossumd57fd912000-03-10 22:53:23 +00003816PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003817 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003818 PyObject *mapping,
3819 const char *errors)
3820{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003821 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003822 Py_ssize_t startinpos;
3823 Py_ssize_t endinpos;
3824 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003825 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003826 PyUnicodeObject *v;
3827 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003828 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003829 PyObject *errorHandler = NULL;
3830 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003831 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003832 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003833
Guido van Rossumd57fd912000-03-10 22:53:23 +00003834 /* Default to Latin-1 */
3835 if (mapping == NULL)
3836 return PyUnicode_DecodeLatin1(s, size, errors);
3837
3838 v = _PyUnicode_New(size);
3839 if (v == NULL)
3840 goto onError;
3841 if (size == 0)
3842 return (PyObject *)v;
3843 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003844 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003845 if (PyUnicode_CheckExact(mapping)) {
3846 mapstring = PyUnicode_AS_UNICODE(mapping);
3847 maplen = PyUnicode_GET_SIZE(mapping);
3848 while (s < e) {
3849 unsigned char ch = *s;
3850 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003851
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003852 if (ch < maplen)
3853 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003854
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003855 if (x == 0xfffe) {
3856 /* undefined mapping */
3857 outpos = p-PyUnicode_AS_UNICODE(v);
3858 startinpos = s-starts;
3859 endinpos = startinpos+1;
3860 if (unicode_decode_call_errorhandler(
3861 errors, &errorHandler,
3862 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003863 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003864 (PyObject **)&v, &outpos, &p)) {
3865 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003866 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003867 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003868 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003869 *p++ = x;
3870 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003871 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003872 }
3873 else {
3874 while (s < e) {
3875 unsigned char ch = *s;
3876 PyObject *w, *x;
3877
3878 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3879 w = PyInt_FromLong((long)ch);
3880 if (w == NULL)
3881 goto onError;
3882 x = PyObject_GetItem(mapping, w);
3883 Py_DECREF(w);
3884 if (x == NULL) {
3885 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3886 /* No mapping found means: mapping is undefined. */
3887 PyErr_Clear();
3888 x = Py_None;
3889 Py_INCREF(x);
3890 } else
3891 goto onError;
3892 }
3893
3894 /* Apply mapping */
3895 if (PyInt_Check(x)) {
3896 long value = PyInt_AS_LONG(x);
3897 if (value < 0 || value > 65535) {
3898 PyErr_SetString(PyExc_TypeError,
3899 "character mapping must be in range(65536)");
3900 Py_DECREF(x);
3901 goto onError;
3902 }
3903 *p++ = (Py_UNICODE)value;
3904 }
3905 else if (x == Py_None) {
3906 /* undefined mapping */
3907 outpos = p-PyUnicode_AS_UNICODE(v);
3908 startinpos = s-starts;
3909 endinpos = startinpos+1;
3910 if (unicode_decode_call_errorhandler(
3911 errors, &errorHandler,
3912 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003913 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003914 (PyObject **)&v, &outpos, &p)) {
3915 Py_DECREF(x);
3916 goto onError;
3917 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003918 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003919 continue;
3920 }
3921 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003922 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003923
3924 if (targetsize == 1)
3925 /* 1-1 mapping */
3926 *p++ = *PyUnicode_AS_UNICODE(x);
3927
3928 else if (targetsize > 1) {
3929 /* 1-n mapping */
3930 if (targetsize > extrachars) {
3931 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003932 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3933 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003934 (targetsize << 2);
3935 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00003936 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003937 if (_PyUnicode_Resize(&v,
3938 PyUnicode_GET_SIZE(v) + needed) < 0) {
3939 Py_DECREF(x);
3940 goto onError;
3941 }
3942 p = PyUnicode_AS_UNICODE(v) + oldpos;
3943 }
3944 Py_UNICODE_COPY(p,
3945 PyUnicode_AS_UNICODE(x),
3946 targetsize);
3947 p += targetsize;
3948 extrachars -= targetsize;
3949 }
3950 /* 1-0 mapping: skip the character */
3951 }
3952 else {
3953 /* wrong return value */
3954 PyErr_SetString(PyExc_TypeError,
3955 "character mapping must return integer, None or unicode");
3956 Py_DECREF(x);
3957 goto onError;
3958 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003959 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003960 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003961 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003962 }
3963 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003964 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003965 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003966 Py_XDECREF(errorHandler);
3967 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003968 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003969
Guido van Rossumd57fd912000-03-10 22:53:23 +00003970 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003971 Py_XDECREF(errorHandler);
3972 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003973 Py_XDECREF(v);
3974 return NULL;
3975}
3976
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003977/* Charmap encoding: the lookup table */
3978
3979struct encoding_map{
3980 PyObject_HEAD
3981 unsigned char level1[32];
3982 int count2, count3;
3983 unsigned char level23[1];
3984};
3985
3986static PyObject*
3987encoding_map_size(PyObject *obj, PyObject* args)
3988{
3989 struct encoding_map *map = (struct encoding_map*)obj;
3990 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3991 128*map->count3);
3992}
3993
3994static PyMethodDef encoding_map_methods[] = {
3995 {"size", encoding_map_size, METH_NOARGS,
3996 PyDoc_STR("Return the size (in bytes) of this object") },
3997 { 0 }
3998};
3999
4000static void
4001encoding_map_dealloc(PyObject* o)
4002{
4003 PyObject_FREE(o);
4004}
4005
4006static PyTypeObject EncodingMapType = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004007 PyVarObject_HEAD_INIT(NULL, 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004008 "EncodingMap", /*tp_name*/
4009 sizeof(struct encoding_map), /*tp_basicsize*/
4010 0, /*tp_itemsize*/
4011 /* methods */
4012 encoding_map_dealloc, /*tp_dealloc*/
4013 0, /*tp_print*/
4014 0, /*tp_getattr*/
4015 0, /*tp_setattr*/
4016 0, /*tp_compare*/
4017 0, /*tp_repr*/
4018 0, /*tp_as_number*/
4019 0, /*tp_as_sequence*/
4020 0, /*tp_as_mapping*/
4021 0, /*tp_hash*/
4022 0, /*tp_call*/
4023 0, /*tp_str*/
4024 0, /*tp_getattro*/
4025 0, /*tp_setattro*/
4026 0, /*tp_as_buffer*/
4027 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4028 0, /*tp_doc*/
4029 0, /*tp_traverse*/
4030 0, /*tp_clear*/
4031 0, /*tp_richcompare*/
4032 0, /*tp_weaklistoffset*/
4033 0, /*tp_iter*/
4034 0, /*tp_iternext*/
4035 encoding_map_methods, /*tp_methods*/
4036 0, /*tp_members*/
4037 0, /*tp_getset*/
4038 0, /*tp_base*/
4039 0, /*tp_dict*/
4040 0, /*tp_descr_get*/
4041 0, /*tp_descr_set*/
4042 0, /*tp_dictoffset*/
4043 0, /*tp_init*/
4044 0, /*tp_alloc*/
4045 0, /*tp_new*/
4046 0, /*tp_free*/
4047 0, /*tp_is_gc*/
4048};
4049
4050PyObject*
4051PyUnicode_BuildEncodingMap(PyObject* string)
4052{
4053 Py_UNICODE *decode;
4054 PyObject *result;
4055 struct encoding_map *mresult;
4056 int i;
4057 int need_dict = 0;
4058 unsigned char level1[32];
4059 unsigned char level2[512];
4060 unsigned char *mlevel1, *mlevel2, *mlevel3;
4061 int count2 = 0, count3 = 0;
4062
4063 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4064 PyErr_BadArgument();
4065 return NULL;
4066 }
4067 decode = PyUnicode_AS_UNICODE(string);
4068 memset(level1, 0xFF, sizeof level1);
4069 memset(level2, 0xFF, sizeof level2);
4070
4071 /* If there isn't a one-to-one mapping of NULL to \0,
4072 or if there are non-BMP characters, we need to use
4073 a mapping dictionary. */
4074 if (decode[0] != 0)
4075 need_dict = 1;
4076 for (i = 1; i < 256; i++) {
4077 int l1, l2;
4078 if (decode[i] == 0
4079 #ifdef Py_UNICODE_WIDE
4080 || decode[i] > 0xFFFF
4081 #endif
4082 ) {
4083 need_dict = 1;
4084 break;
4085 }
4086 if (decode[i] == 0xFFFE)
4087 /* unmapped character */
4088 continue;
4089 l1 = decode[i] >> 11;
4090 l2 = decode[i] >> 7;
4091 if (level1[l1] == 0xFF)
4092 level1[l1] = count2++;
4093 if (level2[l2] == 0xFF)
4094 level2[l2] = count3++;
4095 }
4096
4097 if (count2 >= 0xFF || count3 >= 0xFF)
4098 need_dict = 1;
4099
4100 if (need_dict) {
4101 PyObject *result = PyDict_New();
4102 PyObject *key, *value;
4103 if (!result)
4104 return NULL;
4105 for (i = 0; i < 256; i++) {
4106 key = value = NULL;
4107 key = PyInt_FromLong(decode[i]);
4108 value = PyInt_FromLong(i);
4109 if (!key || !value)
4110 goto failed1;
4111 if (PyDict_SetItem(result, key, value) == -1)
4112 goto failed1;
4113 Py_DECREF(key);
4114 Py_DECREF(value);
4115 }
4116 return result;
4117 failed1:
4118 Py_XDECREF(key);
4119 Py_XDECREF(value);
4120 Py_DECREF(result);
4121 return NULL;
4122 }
4123
4124 /* Create a three-level trie */
4125 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4126 16*count2 + 128*count3 - 1);
4127 if (!result)
4128 return PyErr_NoMemory();
4129 PyObject_Init(result, &EncodingMapType);
4130 mresult = (struct encoding_map*)result;
4131 mresult->count2 = count2;
4132 mresult->count3 = count3;
4133 mlevel1 = mresult->level1;
4134 mlevel2 = mresult->level23;
4135 mlevel3 = mresult->level23 + 16*count2;
4136 memcpy(mlevel1, level1, 32);
4137 memset(mlevel2, 0xFF, 16*count2);
4138 memset(mlevel3, 0, 128*count3);
4139 count3 = 0;
4140 for (i = 1; i < 256; i++) {
4141 int o1, o2, o3, i2, i3;
4142 if (decode[i] == 0xFFFE)
4143 /* unmapped character */
4144 continue;
4145 o1 = decode[i]>>11;
4146 o2 = (decode[i]>>7) & 0xF;
4147 i2 = 16*mlevel1[o1] + o2;
4148 if (mlevel2[i2] == 0xFF)
4149 mlevel2[i2] = count3++;
4150 o3 = decode[i] & 0x7F;
4151 i3 = 128*mlevel2[i2] + o3;
4152 mlevel3[i3] = i;
4153 }
4154 return result;
4155}
4156
4157static int
4158encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4159{
4160 struct encoding_map *map = (struct encoding_map*)mapping;
4161 int l1 = c>>11;
4162 int l2 = (c>>7) & 0xF;
4163 int l3 = c & 0x7F;
4164 int i;
4165
4166#ifdef Py_UNICODE_WIDE
4167 if (c > 0xFFFF) {
4168 return -1;
4169 }
4170#endif
4171 if (c == 0)
4172 return 0;
4173 /* level 1*/
4174 i = map->level1[l1];
4175 if (i == 0xFF) {
4176 return -1;
4177 }
4178 /* level 2*/
4179 i = map->level23[16*i+l2];
4180 if (i == 0xFF) {
4181 return -1;
4182 }
4183 /* level 3 */
4184 i = map->level23[16*map->count2 + 128*i + l3];
4185 if (i == 0) {
4186 return -1;
4187 }
4188 return i;
4189}
4190
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004191/* Lookup the character ch in the mapping. If the character
4192 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004193 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004194static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004195{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004196 PyObject *w = PyInt_FromLong((long)c);
4197 PyObject *x;
4198
4199 if (w == NULL)
4200 return NULL;
4201 x = PyObject_GetItem(mapping, w);
4202 Py_DECREF(w);
4203 if (x == NULL) {
4204 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4205 /* No mapping found means: mapping is undefined. */
4206 PyErr_Clear();
4207 x = Py_None;
4208 Py_INCREF(x);
4209 return x;
4210 } else
4211 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004212 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004213 else if (x == Py_None)
4214 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004215 else if (PyInt_Check(x)) {
4216 long value = PyInt_AS_LONG(x);
4217 if (value < 0 || value > 255) {
4218 PyErr_SetString(PyExc_TypeError,
4219 "character mapping must be in range(256)");
4220 Py_DECREF(x);
4221 return NULL;
4222 }
4223 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004224 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004225 else if (PyString_Check(x))
4226 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004227 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004228 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00004229 PyErr_Format(PyExc_TypeError,
4230 "character mapping must return integer, None or str8, not %.400s",
4231 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004232 Py_DECREF(x);
4233 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004234 }
4235}
4236
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004237static int
Walter Dörwald827b0552007-05-12 13:23:53 +00004238charmapencode_resize(PyObject *outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004239{
Walter Dörwald827b0552007-05-12 13:23:53 +00004240 Py_ssize_t outsize = PyBytes_GET_SIZE( outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004241 /* exponentially overallocate to minimize reallocations */
4242 if (requiredsize < 2*outsize)
4243 requiredsize = 2*outsize;
Walter Dörwald827b0552007-05-12 13:23:53 +00004244 if (PyBytes_Resize(outobj, requiredsize)) {
4245 Py_DECREF(outobj);
4246 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004247 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004248 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004249}
4250
/* Outcome of trying to encode a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was written to the output */
    enc_FAILED,         /* the character has no mapping */
    enc_EXCEPTION       /* an error occurred while encoding */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004254/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00004255 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004256 space is available. Return a new reference to the object that
4257 was put in the output buffer, or Py_None, if the mapping was undefined
4258 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004259 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004260static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004261charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Walter Dörwald827b0552007-05-12 13:23:53 +00004262 PyObject *outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004263{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004264 PyObject *rep;
4265 char *outstart;
Walter Dörwald827b0552007-05-12 13:23:53 +00004266 Py_ssize_t outsize = PyBytes_GET_SIZE(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004267
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004268 if (Py_Type(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004269 int res = encoding_map_lookup(c, mapping);
4270 Py_ssize_t requiredsize = *outpos+1;
4271 if (res == -1)
4272 return enc_FAILED;
4273 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004274 if (charmapencode_resize(outobj, outpos, requiredsize))
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004275 return enc_EXCEPTION;
Walter Dörwald827b0552007-05-12 13:23:53 +00004276 outstart = PyBytes_AS_STRING(outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004277 outstart[(*outpos)++] = (char)res;
4278 return enc_SUCCESS;
4279 }
4280
4281 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004282 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004283 return enc_EXCEPTION;
4284 else if (rep==Py_None) {
4285 Py_DECREF(rep);
4286 return enc_FAILED;
4287 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004288 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004289 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004290 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004291 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004292 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004293 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004294 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004295 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004296 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4297 }
4298 else {
4299 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004300 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4301 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004302 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004303 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004304 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004305 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004306 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004307 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004308 memcpy(outstart + *outpos, repchars, repsize);
4309 *outpos += repsize;
4310 }
4311 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004312 Py_DECREF(rep);
4313 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004314}
4315
4316/* handle an error in PyUnicode_EncodeCharmap
4317 Return 0 on success, -1 on error */
4318static
4319int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004320 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004321 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004322 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004323 PyObject *res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004324{
4325 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004326 Py_ssize_t repsize;
4327 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004328 Py_UNICODE *uni2;
4329 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004330 Py_ssize_t collstartpos = *inpos;
4331 Py_ssize_t collendpos = *inpos+1;
4332 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004333 char *encoding = "charmap";
4334 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004335 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004336
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004337 /* find all unencodable characters */
4338 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004339 PyObject *rep;
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004340 if (Py_Type(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004341 int res = encoding_map_lookup(p[collendpos], mapping);
4342 if (res != -1)
4343 break;
4344 ++collendpos;
4345 continue;
4346 }
4347
4348 rep = charmapencode_lookup(p[collendpos], mapping);
4349 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004350 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004351 else if (rep!=Py_None) {
4352 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004353 break;
4354 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004355 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004356 ++collendpos;
4357 }
4358 /* cache callback name lookup
4359 * (if not done yet, i.e. it's the first error) */
4360 if (*known_errorHandler==-1) {
4361 if ((errors==NULL) || (!strcmp(errors, "strict")))
4362 *known_errorHandler = 1;
4363 else if (!strcmp(errors, "replace"))
4364 *known_errorHandler = 2;
4365 else if (!strcmp(errors, "ignore"))
4366 *known_errorHandler = 3;
4367 else if (!strcmp(errors, "xmlcharrefreplace"))
4368 *known_errorHandler = 4;
4369 else
4370 *known_errorHandler = 0;
4371 }
4372 switch (*known_errorHandler) {
4373 case 1: /* strict */
4374 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4375 return -1;
4376 case 2: /* replace */
4377 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4378 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004379 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004380 return -1;
4381 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004382 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004383 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4384 return -1;
4385 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004386 }
4387 /* fall through */
4388 case 3: /* ignore */
4389 *inpos = collendpos;
4390 break;
4391 case 4: /* xmlcharrefreplace */
4392 /* generate replacement (temporarily (mis)uses p) */
4393 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4394 char buffer[2+29+1+1];
4395 char *cp;
4396 sprintf(buffer, "&#%d;", (int)p[collpos]);
4397 for (cp = buffer; *cp; ++cp) {
4398 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004399 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004400 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004401 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004402 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4403 return -1;
4404 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004405 }
4406 }
4407 *inpos = collendpos;
4408 break;
4409 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004410 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004411 encoding, reason, p, size, exceptionObject,
4412 collstartpos, collendpos, &newpos);
4413 if (repunicode == NULL)
4414 return -1;
4415 /* generate replacement */
4416 repsize = PyUnicode_GET_SIZE(repunicode);
4417 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4418 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004419 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004420 return -1;
4421 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004422 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004423 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004424 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4425 return -1;
4426 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004427 }
4428 *inpos = newpos;
4429 Py_DECREF(repunicode);
4430 }
4431 return 0;
4432}
4433
Guido van Rossumd57fd912000-03-10 22:53:23 +00004434PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004435 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004436 PyObject *mapping,
4437 const char *errors)
4438{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004439 /* output object */
4440 PyObject *res = NULL;
4441 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004442 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004443 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004444 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004445 PyObject *errorHandler = NULL;
4446 PyObject *exc = NULL;
4447 /* the following variable is used for caching string comparisons
4448 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4449 * 3=ignore, 4=xmlcharrefreplace */
4450 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004451
4452 /* Default to Latin-1 */
4453 if (mapping == NULL)
4454 return PyUnicode_EncodeLatin1(p, size, errors);
4455
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004456 /* allocate enough for a simple encoding without
4457 replacements, if we need more, we'll resize */
Walter Dörwald827b0552007-05-12 13:23:53 +00004458 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004459 if (res == NULL)
4460 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004461 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004462 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004463
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004464 while (inpos<size) {
4465 /* try to encode it */
Walter Dörwald827b0552007-05-12 13:23:53 +00004466 charmapencode_result x = charmapencode_output(p[inpos], mapping, res, &respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004467 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004468 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004469 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004470 if (charmap_encoding_error(p, size, &inpos, mapping,
4471 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004472 &known_errorHandler, &errorHandler, errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004473 res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004474 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004475 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004476 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004477 else
4478 /* done with this character => adjust input position */
4479 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004480 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004481
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004482 /* Resize if we allocated to much */
Walter Dörwald827b0552007-05-12 13:23:53 +00004483 if (respos<PyBytes_GET_SIZE(res)) {
4484 if (PyBytes_Resize(res, respos))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004485 goto onError;
4486 }
4487 Py_XDECREF(exc);
4488 Py_XDECREF(errorHandler);
4489 return res;
4490
4491 onError:
4492 Py_XDECREF(res);
4493 Py_XDECREF(exc);
4494 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004495 return NULL;
4496}
4497
4498PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4499 PyObject *mapping)
4500{
4501 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4502 PyErr_BadArgument();
4503 return NULL;
4504 }
4505 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4506 PyUnicode_GET_SIZE(unicode),
4507 mapping,
4508 NULL);
4509}
4510
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004511/* create or adjust a UnicodeTranslateError */
4512static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004513 const Py_UNICODE *unicode, Py_ssize_t size,
4514 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004515 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004516{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004517 if (*exceptionObject == NULL) {
4518 *exceptionObject = PyUnicodeTranslateError_Create(
4519 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004520 }
4521 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004522 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4523 goto onError;
4524 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4525 goto onError;
4526 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4527 goto onError;
4528 return;
4529 onError:
4530 Py_DECREF(*exceptionObject);
4531 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004532 }
4533}
4534
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004535/* raises a UnicodeTranslateError */
4536static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004537 const Py_UNICODE *unicode, Py_ssize_t size,
4538 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004539 const char *reason)
4540{
4541 make_translate_exception(exceptionObject,
4542 unicode, size, startpos, endpos, reason);
4543 if (*exceptionObject != NULL)
4544 PyCodec_StrictErrors(*exceptionObject);
4545}
4546
4547/* error handling callback helper:
4548 build arguments, call the callback and check the arguments,
4549 put the result into newpos and return the replacement string, which
4550 has to be freed by the caller */
4551static PyObject *unicode_translate_call_errorhandler(const char *errors,
4552 PyObject **errorHandler,
4553 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004554 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4555 Py_ssize_t startpos, Py_ssize_t endpos,
4556 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004557{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004558 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004559
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004560 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004561 PyObject *restuple;
4562 PyObject *resunicode;
4563
4564 if (*errorHandler == NULL) {
4565 *errorHandler = PyCodec_LookupError(errors);
4566 if (*errorHandler == NULL)
4567 return NULL;
4568 }
4569
4570 make_translate_exception(exceptionObject,
4571 unicode, size, startpos, endpos, reason);
4572 if (*exceptionObject == NULL)
4573 return NULL;
4574
4575 restuple = PyObject_CallFunctionObjArgs(
4576 *errorHandler, *exceptionObject, NULL);
4577 if (restuple == NULL)
4578 return NULL;
4579 if (!PyTuple_Check(restuple)) {
4580 PyErr_Format(PyExc_TypeError, &argparse[4]);
4581 Py_DECREF(restuple);
4582 return NULL;
4583 }
4584 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004585 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004586 Py_DECREF(restuple);
4587 return NULL;
4588 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004589 if (i_newpos<0)
4590 *newpos = size+i_newpos;
4591 else
4592 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004593 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004594 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004595 Py_DECREF(restuple);
4596 return NULL;
4597 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004598 Py_INCREF(resunicode);
4599 Py_DECREF(restuple);
4600 return resunicode;
4601}
4602
4603/* Lookup the character ch in the mapping and put the result in result,
4604 which must be decrefed by the caller.
4605 Return 0 on success, -1 on error */
4606static
4607int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4608{
4609 PyObject *w = PyInt_FromLong((long)c);
4610 PyObject *x;
4611
4612 if (w == NULL)
4613 return -1;
4614 x = PyObject_GetItem(mapping, w);
4615 Py_DECREF(w);
4616 if (x == NULL) {
4617 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4618 /* No mapping found means: use 1:1 mapping. */
4619 PyErr_Clear();
4620 *result = NULL;
4621 return 0;
4622 } else
4623 return -1;
4624 }
4625 else if (x == Py_None) {
4626 *result = x;
4627 return 0;
4628 }
4629 else if (PyInt_Check(x)) {
4630 long value = PyInt_AS_LONG(x);
4631 long max = PyUnicode_GetMax();
4632 if (value < 0 || value > max) {
4633 PyErr_Format(PyExc_TypeError,
4634 "character mapping must be in range(0x%lx)", max+1);
4635 Py_DECREF(x);
4636 return -1;
4637 }
4638 *result = x;
4639 return 0;
4640 }
4641 else if (PyUnicode_Check(x)) {
4642 *result = x;
4643 return 0;
4644 }
4645 else {
4646 /* wrong return value */
4647 PyErr_SetString(PyExc_TypeError,
4648 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004649 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004650 return -1;
4651 }
4652}
4653/* ensure that *outobj is at least requiredsize characters long,
4654if not reallocate and adjust various state variables.
4655Return 0 on success, -1 on error */
4656static
Walter Dörwald4894c302003-10-24 14:25:28 +00004657int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004658 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004659{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004660 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004661 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004662 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004663 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004664 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004665 if (requiredsize < 2 * oldsize)
4666 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004667 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004668 return -1;
4669 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004670 }
4671 return 0;
4672}
4673/* lookup the character, put the result in the output string and adjust
4674 various state variables. Return a new reference to the object that
4675 was put in the output buffer in *result, or Py_None, if the mapping was
4676 undefined (in which case no character was written).
4677 The called must decref result.
4678 Return 0 on success, -1 on error. */
4679static
Walter Dörwald4894c302003-10-24 14:25:28 +00004680int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004681 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004682 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004683{
Walter Dörwald4894c302003-10-24 14:25:28 +00004684 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004685 return -1;
4686 if (*res==NULL) {
4687 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004688 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004689 }
4690 else if (*res==Py_None)
4691 ;
4692 else if (PyInt_Check(*res)) {
4693 /* no overflow check, because we know that the space is enough */
4694 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4695 }
4696 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004697 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004698 if (repsize==1) {
4699 /* no overflow check, because we know that the space is enough */
4700 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4701 }
4702 else if (repsize!=0) {
4703 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004704 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004705 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004706 repsize - 1;
4707 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004708 return -1;
4709 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4710 *outp += repsize;
4711 }
4712 }
4713 else
4714 return -1;
4715 return 0;
4716}
4717
4718PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004719 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004720 PyObject *mapping,
4721 const char *errors)
4722{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004723 /* output object */
4724 PyObject *res = NULL;
4725 /* pointers to the beginning and end+1 of input */
4726 const Py_UNICODE *startp = p;
4727 const Py_UNICODE *endp = p + size;
4728 /* pointer into the output */
4729 Py_UNICODE *str;
4730 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004731 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004732 char *reason = "character maps to <undefined>";
4733 PyObject *errorHandler = NULL;
4734 PyObject *exc = NULL;
4735 /* the following variable is used for caching string comparisons
4736 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4737 * 3=ignore, 4=xmlcharrefreplace */
4738 int known_errorHandler = -1;
4739
Guido van Rossumd57fd912000-03-10 22:53:23 +00004740 if (mapping == NULL) {
4741 PyErr_BadArgument();
4742 return NULL;
4743 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004744
4745 /* allocate enough for a simple 1:1 translation without
4746 replacements, if we need more, we'll resize */
4747 res = PyUnicode_FromUnicode(NULL, size);
4748 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004749 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004750 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004751 return res;
4752 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004753
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004754 while (p<endp) {
4755 /* try to encode it */
4756 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004757 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004758 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004759 goto onError;
4760 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004761 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004762 if (x!=Py_None) /* it worked => adjust input pointer */
4763 ++p;
4764 else { /* untranslatable character */
4765 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004766 Py_ssize_t repsize;
4767 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004768 Py_UNICODE *uni2;
4769 /* startpos for collecting untranslatable chars */
4770 const Py_UNICODE *collstart = p;
4771 const Py_UNICODE *collend = p+1;
4772 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004773
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004774 /* find all untranslatable characters */
4775 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004776 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004777 goto onError;
4778 Py_XDECREF(x);
4779 if (x!=Py_None)
4780 break;
4781 ++collend;
4782 }
4783 /* cache callback name lookup
4784 * (if not done yet, i.e. it's the first error) */
4785 if (known_errorHandler==-1) {
4786 if ((errors==NULL) || (!strcmp(errors, "strict")))
4787 known_errorHandler = 1;
4788 else if (!strcmp(errors, "replace"))
4789 known_errorHandler = 2;
4790 else if (!strcmp(errors, "ignore"))
4791 known_errorHandler = 3;
4792 else if (!strcmp(errors, "xmlcharrefreplace"))
4793 known_errorHandler = 4;
4794 else
4795 known_errorHandler = 0;
4796 }
4797 switch (known_errorHandler) {
4798 case 1: /* strict */
4799 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4800 goto onError;
4801 case 2: /* replace */
4802 /* No need to check for space, this is a 1:1 replacement */
4803 for (coll = collstart; coll<collend; ++coll)
4804 *str++ = '?';
4805 /* fall through */
4806 case 3: /* ignore */
4807 p = collend;
4808 break;
4809 case 4: /* xmlcharrefreplace */
4810 /* generate replacement (temporarily (mis)uses p) */
4811 for (p = collstart; p < collend; ++p) {
4812 char buffer[2+29+1+1];
4813 char *cp;
4814 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004815 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004816 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4817 goto onError;
4818 for (cp = buffer; *cp; ++cp)
4819 *str++ = *cp;
4820 }
4821 p = collend;
4822 break;
4823 default:
4824 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4825 reason, startp, size, &exc,
4826 collstart-startp, collend-startp, &newpos);
4827 if (repunicode == NULL)
4828 goto onError;
4829 /* generate replacement */
4830 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004831 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004832 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4833 Py_DECREF(repunicode);
4834 goto onError;
4835 }
4836 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4837 *str++ = *uni2;
4838 p = startp + newpos;
4839 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004840 }
4841 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004842 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004843 /* Resize if we allocated to much */
4844 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004845 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004846 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004847 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004848 }
4849 Py_XDECREF(exc);
4850 Py_XDECREF(errorHandler);
4851 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004852
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004853 onError:
4854 Py_XDECREF(res);
4855 Py_XDECREF(exc);
4856 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004857 return NULL;
4858}
4859
4860PyObject *PyUnicode_Translate(PyObject *str,
4861 PyObject *mapping,
4862 const char *errors)
4863{
4864 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004865
Guido van Rossumd57fd912000-03-10 22:53:23 +00004866 str = PyUnicode_FromObject(str);
4867 if (str == NULL)
4868 goto onError;
4869 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4870 PyUnicode_GET_SIZE(str),
4871 mapping,
4872 errors);
4873 Py_DECREF(str);
4874 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004875
Guido van Rossumd57fd912000-03-10 22:53:23 +00004876 onError:
4877 Py_XDECREF(str);
4878 return NULL;
4879}
Tim Petersced69f82003-09-16 20:30:58 +00004880
Guido van Rossum9e896b32000-04-05 20:11:21 +00004881/* --- Decimal Encoder ---------------------------------------------------- */
4882
4883int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004884 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004885 char *output,
4886 const char *errors)
4887{
4888 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004889 PyObject *errorHandler = NULL;
4890 PyObject *exc = NULL;
4891 const char *encoding = "decimal";
4892 const char *reason = "invalid decimal Unicode string";
4893 /* the following variable is used for caching string comparisons
4894 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4895 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004896
4897 if (output == NULL) {
4898 PyErr_BadArgument();
4899 return -1;
4900 }
4901
4902 p = s;
4903 end = s + length;
4904 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004905 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004906 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004907 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004908 Py_ssize_t repsize;
4909 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004910 Py_UNICODE *uni2;
4911 Py_UNICODE *collstart;
4912 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004913
Guido van Rossum9e896b32000-04-05 20:11:21 +00004914 if (Py_UNICODE_ISSPACE(ch)) {
4915 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004916 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004917 continue;
4918 }
4919 decimal = Py_UNICODE_TODECIMAL(ch);
4920 if (decimal >= 0) {
4921 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004922 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004923 continue;
4924 }
Guido van Rossumba477042000-04-06 18:18:10 +00004925 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004926 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004927 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004928 continue;
4929 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004930 /* All other characters are considered unencodable */
4931 collstart = p;
4932 collend = p+1;
4933 while (collend < end) {
4934 if ((0 < *collend && *collend < 256) ||
4935 !Py_UNICODE_ISSPACE(*collend) ||
4936 Py_UNICODE_TODECIMAL(*collend))
4937 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004938 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004939 /* cache callback name lookup
4940 * (if not done yet, i.e. it's the first error) */
4941 if (known_errorHandler==-1) {
4942 if ((errors==NULL) || (!strcmp(errors, "strict")))
4943 known_errorHandler = 1;
4944 else if (!strcmp(errors, "replace"))
4945 known_errorHandler = 2;
4946 else if (!strcmp(errors, "ignore"))
4947 known_errorHandler = 3;
4948 else if (!strcmp(errors, "xmlcharrefreplace"))
4949 known_errorHandler = 4;
4950 else
4951 known_errorHandler = 0;
4952 }
4953 switch (known_errorHandler) {
4954 case 1: /* strict */
4955 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4956 goto onError;
4957 case 2: /* replace */
4958 for (p = collstart; p < collend; ++p)
4959 *output++ = '?';
4960 /* fall through */
4961 case 3: /* ignore */
4962 p = collend;
4963 break;
4964 case 4: /* xmlcharrefreplace */
4965 /* generate replacement (temporarily (mis)uses p) */
4966 for (p = collstart; p < collend; ++p)
4967 output += sprintf(output, "&#%d;", (int)*p);
4968 p = collend;
4969 break;
4970 default:
4971 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4972 encoding, reason, s, length, &exc,
4973 collstart-s, collend-s, &newpos);
4974 if (repunicode == NULL)
4975 goto onError;
4976 /* generate replacement */
4977 repsize = PyUnicode_GET_SIZE(repunicode);
4978 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4979 Py_UNICODE ch = *uni2;
4980 if (Py_UNICODE_ISSPACE(ch))
4981 *output++ = ' ';
4982 else {
4983 decimal = Py_UNICODE_TODECIMAL(ch);
4984 if (decimal >= 0)
4985 *output++ = '0' + decimal;
4986 else if (0 < ch && ch < 256)
4987 *output++ = (char)ch;
4988 else {
4989 Py_DECREF(repunicode);
4990 raise_encode_exception(&exc, encoding,
4991 s, length, collstart-s, collend-s, reason);
4992 goto onError;
4993 }
4994 }
4995 }
4996 p = s + newpos;
4997 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004998 }
4999 }
5000 /* 0-terminate the output string */
5001 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005002 Py_XDECREF(exc);
5003 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005004 return 0;
5005
5006 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005007 Py_XDECREF(exc);
5008 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005009 return -1;
5010}
5011
Guido van Rossumd57fd912000-03-10 22:53:23 +00005012/* --- Helpers ------------------------------------------------------------ */
5013
Eric Smith8c663262007-08-25 02:26:07 +00005014#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005015
5016#include "stringlib/fastsearch.h"
5017
5018#include "stringlib/count.h"
5019#include "stringlib/find.h"
5020#include "stringlib/partition.h"
5021
/* Helper macro: normalize the slice bounds held in the caller's local
   variables `start` and `end` against (obj)->length.  A negative bound
   has the length added to it and is set to 0 if it is still negative;
   `end` is additionally capped at the length.  `start` is not capped
   at the length.  Expands to plain statements, so use it only where a
   statement sequence is valid. */
#define FIX_START_END(obj)                      \
    if (start < 0) {                            \
        start += (obj)->length;                 \
        if (start < 0)                          \
            start = 0;                          \
    }                                           \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    if (end < 0) {                              \
        end += (obj)->length;                   \
        if (end < 0)                            \
            end = 0;                            \
    }
5034
Martin v. Löwis18e16552006-02-15 17:27:45 +00005035Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005036 PyObject *substr,
5037 Py_ssize_t start,
5038 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005039{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005040 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005041 PyUnicodeObject* str_obj;
5042 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005043
Thomas Wouters477c8d52006-05-27 19:21:47 +00005044 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5045 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005046 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005047 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5048 if (!sub_obj) {
5049 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005050 return -1;
5051 }
Tim Petersced69f82003-09-16 20:30:58 +00005052
Thomas Wouters477c8d52006-05-27 19:21:47 +00005053 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005054
Thomas Wouters477c8d52006-05-27 19:21:47 +00005055 result = stringlib_count(
5056 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5057 );
5058
5059 Py_DECREF(sub_obj);
5060 Py_DECREF(str_obj);
5061
Guido van Rossumd57fd912000-03-10 22:53:23 +00005062 return result;
5063}
5064
Martin v. Löwis18e16552006-02-15 17:27:45 +00005065Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005066 PyObject *sub,
5067 Py_ssize_t start,
5068 Py_ssize_t end,
5069 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005070{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005071 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005072
Guido van Rossumd57fd912000-03-10 22:53:23 +00005073 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005074 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005075 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005076 sub = PyUnicode_FromObject(sub);
5077 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00005078 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005079 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005080 }
Tim Petersced69f82003-09-16 20:30:58 +00005081
Thomas Wouters477c8d52006-05-27 19:21:47 +00005082 if (direction > 0)
5083 result = stringlib_find_slice(
5084 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5085 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5086 start, end
5087 );
5088 else
5089 result = stringlib_rfind_slice(
5090 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5091 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5092 start, end
5093 );
5094
Guido van Rossumd57fd912000-03-10 22:53:23 +00005095 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005096 Py_DECREF(sub);
5097
Guido van Rossumd57fd912000-03-10 22:53:23 +00005098 return result;
5099}
5100
Tim Petersced69f82003-09-16 20:30:58 +00005101static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005102int tailmatch(PyUnicodeObject *self,
5103 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005104 Py_ssize_t start,
5105 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005106 int direction)
5107{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005108 if (substring->length == 0)
5109 return 1;
5110
Thomas Wouters477c8d52006-05-27 19:21:47 +00005111 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005112
5113 end -= substring->length;
5114 if (end < start)
5115 return 0;
5116
5117 if (direction > 0) {
5118 if (Py_UNICODE_MATCH(self, end, substring))
5119 return 1;
5120 } else {
5121 if (Py_UNICODE_MATCH(self, start, substring))
5122 return 1;
5123 }
5124
5125 return 0;
5126}
5127
Martin v. Löwis18e16552006-02-15 17:27:45 +00005128Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005129 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005130 Py_ssize_t start,
5131 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005132 int direction)
5133{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005134 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005135
Guido van Rossumd57fd912000-03-10 22:53:23 +00005136 str = PyUnicode_FromObject(str);
5137 if (str == NULL)
5138 return -1;
5139 substr = PyUnicode_FromObject(substr);
5140 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005141 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005142 return -1;
5143 }
Tim Petersced69f82003-09-16 20:30:58 +00005144
Guido van Rossumd57fd912000-03-10 22:53:23 +00005145 result = tailmatch((PyUnicodeObject *)str,
5146 (PyUnicodeObject *)substr,
5147 start, end, direction);
5148 Py_DECREF(str);
5149 Py_DECREF(substr);
5150 return result;
5151}
5152
Guido van Rossumd57fd912000-03-10 22:53:23 +00005153/* Apply fixfct filter to the Unicode object self and return a
5154 reference to the modified object */
5155
Tim Petersced69f82003-09-16 20:30:58 +00005156static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005157PyObject *fixup(PyUnicodeObject *self,
5158 int (*fixfct)(PyUnicodeObject *s))
5159{
5160
5161 PyUnicodeObject *u;
5162
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005163 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005164 if (u == NULL)
5165 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005166
5167 Py_UNICODE_COPY(u->str, self->str, self->length);
5168
Tim Peters7a29bd52001-09-12 03:03:31 +00005169 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005170 /* fixfct should return TRUE if it modified the buffer. If
5171 FALSE, return a reference to the original buffer instead
5172 (to save space, not time) */
5173 Py_INCREF(self);
5174 Py_DECREF(u);
5175 return (PyObject*) self;
5176 }
5177 return (PyObject*) u;
5178}
5179
Tim Petersced69f82003-09-16 20:30:58 +00005180static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005181int fixupper(PyUnicodeObject *self)
5182{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005183 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005184 Py_UNICODE *s = self->str;
5185 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005186
Guido van Rossumd57fd912000-03-10 22:53:23 +00005187 while (len-- > 0) {
5188 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005189
Guido van Rossumd57fd912000-03-10 22:53:23 +00005190 ch = Py_UNICODE_TOUPPER(*s);
5191 if (ch != *s) {
5192 status = 1;
5193 *s = ch;
5194 }
5195 s++;
5196 }
5197
5198 return status;
5199}
5200
Tim Petersced69f82003-09-16 20:30:58 +00005201static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005202int fixlower(PyUnicodeObject *self)
5203{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005204 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005205 Py_UNICODE *s = self->str;
5206 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005207
Guido van Rossumd57fd912000-03-10 22:53:23 +00005208 while (len-- > 0) {
5209 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005210
Guido van Rossumd57fd912000-03-10 22:53:23 +00005211 ch = Py_UNICODE_TOLOWER(*s);
5212 if (ch != *s) {
5213 status = 1;
5214 *s = ch;
5215 }
5216 s++;
5217 }
5218
5219 return status;
5220}
5221
Tim Petersced69f82003-09-16 20:30:58 +00005222static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005223int fixswapcase(PyUnicodeObject *self)
5224{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005225 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005226 Py_UNICODE *s = self->str;
5227 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005228
Guido van Rossumd57fd912000-03-10 22:53:23 +00005229 while (len-- > 0) {
5230 if (Py_UNICODE_ISUPPER(*s)) {
5231 *s = Py_UNICODE_TOLOWER(*s);
5232 status = 1;
5233 } else if (Py_UNICODE_ISLOWER(*s)) {
5234 *s = Py_UNICODE_TOUPPER(*s);
5235 status = 1;
5236 }
5237 s++;
5238 }
5239
5240 return status;
5241}
5242
Tim Petersced69f82003-09-16 20:30:58 +00005243static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005244int fixcapitalize(PyUnicodeObject *self)
5245{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005246 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005247 Py_UNICODE *s = self->str;
5248 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005249
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005250 if (len == 0)
5251 return 0;
5252 if (Py_UNICODE_ISLOWER(*s)) {
5253 *s = Py_UNICODE_TOUPPER(*s);
5254 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005255 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005256 s++;
5257 while (--len > 0) {
5258 if (Py_UNICODE_ISUPPER(*s)) {
5259 *s = Py_UNICODE_TOLOWER(*s);
5260 status = 1;
5261 }
5262 s++;
5263 }
5264 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005265}
5266
5267static
5268int fixtitle(PyUnicodeObject *self)
5269{
5270 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5271 register Py_UNICODE *e;
5272 int previous_is_cased;
5273
5274 /* Shortcut for single character strings */
5275 if (PyUnicode_GET_SIZE(self) == 1) {
5276 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5277 if (*p != ch) {
5278 *p = ch;
5279 return 1;
5280 }
5281 else
5282 return 0;
5283 }
Tim Petersced69f82003-09-16 20:30:58 +00005284
Guido van Rossumd57fd912000-03-10 22:53:23 +00005285 e = p + PyUnicode_GET_SIZE(self);
5286 previous_is_cased = 0;
5287 for (; p < e; p++) {
5288 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005289
Guido van Rossumd57fd912000-03-10 22:53:23 +00005290 if (previous_is_cased)
5291 *p = Py_UNICODE_TOLOWER(ch);
5292 else
5293 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005294
5295 if (Py_UNICODE_ISLOWER(ch) ||
5296 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005297 Py_UNICODE_ISTITLE(ch))
5298 previous_is_cased = 1;
5299 else
5300 previous_is_cased = 0;
5301 }
5302 return 1;
5303}
5304
Tim Peters8ce9f162004-08-27 01:49:32 +00005305PyObject *
5306PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005307{
Tim Peters8ce9f162004-08-27 01:49:32 +00005308 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005309 const Py_UNICODE blank = ' ';
5310 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005311 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005312 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005313 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5314 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005315 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5316 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005317 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005318 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005319 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005320
Tim Peters05eba1f2004-08-27 21:32:02 +00005321 fseq = PySequence_Fast(seq, "");
5322 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005323 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005324 }
5325
Tim Peters91879ab2004-08-27 22:35:44 +00005326 /* Grrrr. A codec may be invoked to convert str objects to
5327 * Unicode, and so it's possible to call back into Python code
5328 * during PyUnicode_FromObject(), and so it's possible for a sick
5329 * codec to change the size of fseq (if seq is a list). Therefore
5330 * we have to keep refetching the size -- can't assume seqlen
5331 * is invariant.
5332 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005333 seqlen = PySequence_Fast_GET_SIZE(fseq);
5334 /* If empty sequence, return u"". */
5335 if (seqlen == 0) {
5336 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5337 goto Done;
5338 }
5339 /* If singleton sequence with an exact Unicode, return that. */
5340 if (seqlen == 1) {
5341 item = PySequence_Fast_GET_ITEM(fseq, 0);
5342 if (PyUnicode_CheckExact(item)) {
5343 Py_INCREF(item);
5344 res = (PyUnicodeObject *)item;
5345 goto Done;
5346 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005347 }
5348
Tim Peters05eba1f2004-08-27 21:32:02 +00005349 /* At least two items to join, or one that isn't exact Unicode. */
5350 if (seqlen > 1) {
5351 /* Set up sep and seplen -- they're needed. */
5352 if (separator == NULL) {
5353 sep = &blank;
5354 seplen = 1;
5355 }
5356 else {
5357 internal_separator = PyUnicode_FromObject(separator);
5358 if (internal_separator == NULL)
5359 goto onError;
5360 sep = PyUnicode_AS_UNICODE(internal_separator);
5361 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00005362 /* In case PyUnicode_FromObject() mutated seq. */
5363 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005364 }
5365 }
5366
5367 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005368 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005369 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005370 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005371 res_p = PyUnicode_AS_UNICODE(res);
5372 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005373
Tim Peters05eba1f2004-08-27 21:32:02 +00005374 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005375 Py_ssize_t itemlen;
5376 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005377
5378 item = PySequence_Fast_GET_ITEM(fseq, i);
5379 /* Convert item to Unicode. */
5380 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5381 PyErr_Format(PyExc_TypeError,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005382 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00005383 " %.80s found",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00005384 i, Py_Type(item)->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00005385 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005386 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005387 item = PyUnicode_FromObject(item);
5388 if (item == NULL)
5389 goto onError;
5390 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005391
Tim Peters91879ab2004-08-27 22:35:44 +00005392 /* In case PyUnicode_FromObject() mutated seq. */
5393 seqlen = PySequence_Fast_GET_SIZE(fseq);
5394
Tim Peters8ce9f162004-08-27 01:49:32 +00005395 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005396 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005397 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005398 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005399 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005400 if (i < seqlen - 1) {
5401 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005402 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00005403 goto Overflow;
5404 }
5405 if (new_res_used > res_alloc) {
5406 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00005407 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00005408 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005409 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005410 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005411 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005412 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005413 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005414 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005415 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005416 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005417 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005418
5419 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005420 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005421 res_p += itemlen;
5422 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005423 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005424 res_p += seplen;
5425 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005426 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005427 res_used = new_res_used;
5428 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005429
Tim Peters05eba1f2004-08-27 21:32:02 +00005430 /* Shrink res to match the used area; this probably can't fail,
5431 * but it's cheap to check.
5432 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005433 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005434 goto onError;
5435
5436 Done:
5437 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005438 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005439 return (PyObject *)res;
5440
Tim Peters8ce9f162004-08-27 01:49:32 +00005441 Overflow:
5442 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005443 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005444 Py_DECREF(item);
5445 /* fall through */
5446
Guido van Rossumd57fd912000-03-10 22:53:23 +00005447 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005448 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005449 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005450 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005451 return NULL;
5452}
5453
Tim Petersced69f82003-09-16 20:30:58 +00005454static
5455PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005456 Py_ssize_t left,
5457 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005458 Py_UNICODE fill)
5459{
5460 PyUnicodeObject *u;
5461
5462 if (left < 0)
5463 left = 0;
5464 if (right < 0)
5465 right = 0;
5466
Tim Peters7a29bd52001-09-12 03:03:31 +00005467 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005468 Py_INCREF(self);
5469 return self;
5470 }
5471
5472 u = _PyUnicode_New(left + self->length + right);
5473 if (u) {
5474 if (left)
5475 Py_UNICODE_FILL(u->str, fill, left);
5476 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5477 if (right)
5478 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5479 }
5480
5481 return u;
5482}
5483
/* Append the slice data[left:right] to `list` as a new Unicode object.

   The expansion relies on names supplied by the enclosing function: a
   `PyObject *str` scratch variable, the `list` being filled, and an
   `onError` label, which is jumped to when creating the slice or appending
   it fails.  The body is wrapped in do { } while (0) so the macro expands
   to exactly one statement and is safe in unbraced if/else bodies. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
5494
5495static
5496PyObject *split_whitespace(PyUnicodeObject *self,
5497 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005498 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005499{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005500 register Py_ssize_t i;
5501 register Py_ssize_t j;
5502 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005503 PyObject *str;
5504
5505 for (i = j = 0; i < len; ) {
5506 /* find a token */
5507 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5508 i++;
5509 j = i;
5510 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
5511 i++;
5512 if (j < i) {
5513 if (maxcount-- <= 0)
5514 break;
5515 SPLIT_APPEND(self->str, j, i);
5516 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5517 i++;
5518 j = i;
5519 }
5520 }
5521 if (j < len) {
5522 SPLIT_APPEND(self->str, j, len);
5523 }
5524 return list;
5525
5526 onError:
5527 Py_DECREF(list);
5528 return NULL;
5529}
5530
5531PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005532 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005533{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005534 register Py_ssize_t i;
5535 register Py_ssize_t j;
5536 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005537 PyObject *list;
5538 PyObject *str;
5539 Py_UNICODE *data;
5540
5541 string = PyUnicode_FromObject(string);
5542 if (string == NULL)
5543 return NULL;
5544 data = PyUnicode_AS_UNICODE(string);
5545 len = PyUnicode_GET_SIZE(string);
5546
Guido van Rossumd57fd912000-03-10 22:53:23 +00005547 list = PyList_New(0);
5548 if (!list)
5549 goto onError;
5550
5551 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005552 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005553
Guido van Rossumd57fd912000-03-10 22:53:23 +00005554 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005555 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005556 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005557
5558 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005559 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005560 if (i < len) {
5561 if (data[i] == '\r' && i + 1 < len &&
5562 data[i+1] == '\n')
5563 i += 2;
5564 else
5565 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005566 if (keepends)
5567 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005568 }
Guido van Rossum86662912000-04-11 15:38:46 +00005569 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005570 j = i;
5571 }
5572 if (j < len) {
5573 SPLIT_APPEND(data, j, len);
5574 }
5575
5576 Py_DECREF(string);
5577 return list;
5578
5579 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005580 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005581 Py_DECREF(string);
5582 return NULL;
5583}
5584
Tim Petersced69f82003-09-16 20:30:58 +00005585static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005586PyObject *split_char(PyUnicodeObject *self,
5587 PyObject *list,
5588 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005589 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005590{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005591 register Py_ssize_t i;
5592 register Py_ssize_t j;
5593 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005594 PyObject *str;
5595
5596 for (i = j = 0; i < len; ) {
5597 if (self->str[i] == ch) {
5598 if (maxcount-- <= 0)
5599 break;
5600 SPLIT_APPEND(self->str, j, i);
5601 i = j = i + 1;
5602 } else
5603 i++;
5604 }
5605 if (j <= len) {
5606 SPLIT_APPEND(self->str, j, len);
5607 }
5608 return list;
5609
5610 onError:
5611 Py_DECREF(list);
5612 return NULL;
5613}
5614
Tim Petersced69f82003-09-16 20:30:58 +00005615static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005616PyObject *split_substring(PyUnicodeObject *self,
5617 PyObject *list,
5618 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005619 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005620{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005621 register Py_ssize_t i;
5622 register Py_ssize_t j;
5623 Py_ssize_t len = self->length;
5624 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005625 PyObject *str;
5626
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005627 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005628 if (Py_UNICODE_MATCH(self, i, substring)) {
5629 if (maxcount-- <= 0)
5630 break;
5631 SPLIT_APPEND(self->str, j, i);
5632 i = j = i + sublen;
5633 } else
5634 i++;
5635 }
5636 if (j <= len) {
5637 SPLIT_APPEND(self->str, j, len);
5638 }
5639 return list;
5640
5641 onError:
5642 Py_DECREF(list);
5643 return NULL;
5644}
5645
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005646static
5647PyObject *rsplit_whitespace(PyUnicodeObject *self,
5648 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005649 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005650{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005651 register Py_ssize_t i;
5652 register Py_ssize_t j;
5653 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005654 PyObject *str;
5655
5656 for (i = j = len - 1; i >= 0; ) {
5657 /* find a token */
5658 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5659 i--;
5660 j = i;
5661 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
5662 i--;
5663 if (j > i) {
5664 if (maxcount-- <= 0)
5665 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005666 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005667 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5668 i--;
5669 j = i;
5670 }
5671 }
5672 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005673 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005674 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005675 if (PyList_Reverse(list) < 0)
5676 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005677 return list;
5678
5679 onError:
5680 Py_DECREF(list);
5681 return NULL;
5682}
5683
5684static
5685PyObject *rsplit_char(PyUnicodeObject *self,
5686 PyObject *list,
5687 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005688 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005689{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005690 register Py_ssize_t i;
5691 register Py_ssize_t j;
5692 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005693 PyObject *str;
5694
5695 for (i = j = len - 1; i >= 0; ) {
5696 if (self->str[i] == ch) {
5697 if (maxcount-- <= 0)
5698 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005699 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005700 j = i = i - 1;
5701 } else
5702 i--;
5703 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005704 if (j >= -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005705 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005706 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005707 if (PyList_Reverse(list) < 0)
5708 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005709 return list;
5710
5711 onError:
5712 Py_DECREF(list);
5713 return NULL;
5714}
5715
5716static
5717PyObject *rsplit_substring(PyUnicodeObject *self,
5718 PyObject *list,
5719 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005720 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005721{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005722 register Py_ssize_t i;
5723 register Py_ssize_t j;
5724 Py_ssize_t len = self->length;
5725 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005726 PyObject *str;
5727
5728 for (i = len - sublen, j = len; i >= 0; ) {
5729 if (Py_UNICODE_MATCH(self, i, substring)) {
5730 if (maxcount-- <= 0)
5731 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005732 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005733 j = i;
5734 i -= sublen;
5735 } else
5736 i--;
5737 }
5738 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005739 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005740 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005741 if (PyList_Reverse(list) < 0)
5742 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005743 return list;
5744
5745 onError:
5746 Py_DECREF(list);
5747 return NULL;
5748}
5749
Guido van Rossumd57fd912000-03-10 22:53:23 +00005750#undef SPLIT_APPEND
5751
5752static
5753PyObject *split(PyUnicodeObject *self,
5754 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005755 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005756{
5757 PyObject *list;
5758
5759 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005760 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005761
5762 list = PyList_New(0);
5763 if (!list)
5764 return NULL;
5765
5766 if (substring == NULL)
5767 return split_whitespace(self,list,maxcount);
5768
5769 else if (substring->length == 1)
5770 return split_char(self,list,substring->str[0],maxcount);
5771
5772 else if (substring->length == 0) {
5773 Py_DECREF(list);
5774 PyErr_SetString(PyExc_ValueError, "empty separator");
5775 return NULL;
5776 }
5777 else
5778 return split_substring(self,list,substring,maxcount);
5779}
5780
Tim Petersced69f82003-09-16 20:30:58 +00005781static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005782PyObject *rsplit(PyUnicodeObject *self,
5783 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005784 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005785{
5786 PyObject *list;
5787
5788 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005789 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005790
5791 list = PyList_New(0);
5792 if (!list)
5793 return NULL;
5794
5795 if (substring == NULL)
5796 return rsplit_whitespace(self,list,maxcount);
5797
5798 else if (substring->length == 1)
5799 return rsplit_char(self,list,substring->str[0],maxcount);
5800
5801 else if (substring->length == 0) {
5802 Py_DECREF(list);
5803 PyErr_SetString(PyExc_ValueError, "empty separator");
5804 return NULL;
5805 }
5806 else
5807 return rsplit_substring(self,list,substring,maxcount);
5808}
5809
5810static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005811PyObject *replace(PyUnicodeObject *self,
5812 PyUnicodeObject *str1,
5813 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005814 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005815{
5816 PyUnicodeObject *u;
5817
5818 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005819 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005820
Thomas Wouters477c8d52006-05-27 19:21:47 +00005821 if (str1->length == str2->length) {
5822 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005823 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005824 if (str1->length == 1) {
5825 /* replace characters */
5826 Py_UNICODE u1, u2;
5827 if (!findchar(self->str, self->length, str1->str[0]))
5828 goto nothing;
5829 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5830 if (!u)
5831 return NULL;
5832 Py_UNICODE_COPY(u->str, self->str, self->length);
5833 u1 = str1->str[0];
5834 u2 = str2->str[0];
5835 for (i = 0; i < u->length; i++)
5836 if (u->str[i] == u1) {
5837 if (--maxcount < 0)
5838 break;
5839 u->str[i] = u2;
5840 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005841 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005842 i = fastsearch(
5843 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005844 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00005845 if (i < 0)
5846 goto nothing;
5847 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5848 if (!u)
5849 return NULL;
5850 Py_UNICODE_COPY(u->str, self->str, self->length);
5851 while (i <= self->length - str1->length)
5852 if (Py_UNICODE_MATCH(self, i, str1)) {
5853 if (--maxcount < 0)
5854 break;
5855 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5856 i += str1->length;
5857 } else
5858 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005859 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005860 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005861
5862 Py_ssize_t n, i, j, e;
5863 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005864 Py_UNICODE *p;
5865
5866 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005867 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005868 if (n > maxcount)
5869 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005870 if (n == 0)
5871 goto nothing;
5872 /* new_size = self->length + n * (str2->length - str1->length)); */
5873 delta = (str2->length - str1->length);
5874 if (delta == 0) {
5875 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005876 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005877 product = n * (str2->length - str1->length);
5878 if ((product / (str2->length - str1->length)) != n) {
5879 PyErr_SetString(PyExc_OverflowError,
5880 "replace string is too long");
5881 return NULL;
5882 }
5883 new_size = self->length + product;
5884 if (new_size < 0) {
5885 PyErr_SetString(PyExc_OverflowError,
5886 "replace string is too long");
5887 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005888 }
5889 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005890 u = _PyUnicode_New(new_size);
5891 if (!u)
5892 return NULL;
5893 i = 0;
5894 p = u->str;
5895 e = self->length - str1->length;
5896 if (str1->length > 0) {
5897 while (n-- > 0) {
5898 /* look for next match */
5899 j = i;
5900 while (j <= e) {
5901 if (Py_UNICODE_MATCH(self, j, str1))
5902 break;
5903 j++;
5904 }
5905 if (j > i) {
5906 if (j > e)
5907 break;
5908 /* copy unchanged part [i:j] */
5909 Py_UNICODE_COPY(p, self->str+i, j-i);
5910 p += j - i;
5911 }
5912 /* copy substitution string */
5913 if (str2->length > 0) {
5914 Py_UNICODE_COPY(p, str2->str, str2->length);
5915 p += str2->length;
5916 }
5917 i = j + str1->length;
5918 }
5919 if (i < self->length)
5920 /* copy tail [i:] */
5921 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5922 } else {
5923 /* interleave */
5924 while (n > 0) {
5925 Py_UNICODE_COPY(p, str2->str, str2->length);
5926 p += str2->length;
5927 if (--n <= 0)
5928 break;
5929 *p++ = self->str[i++];
5930 }
5931 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5932 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005933 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005934 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005935
5936nothing:
5937 /* nothing to replace; return original string (when possible) */
5938 if (PyUnicode_CheckExact(self)) {
5939 Py_INCREF(self);
5940 return (PyObject *) self;
5941 }
5942 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005943}
5944
5945/* --- Unicode Object Methods --------------------------------------------- */
5946
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005947PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005948"S.title() -> unicode\n\
5949\n\
5950Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005951characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005952
5953static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005954unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005955{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005956 return fixup(self, fixtitle);
5957}
5958
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005959PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005960"S.capitalize() -> unicode\n\
5961\n\
5962Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005963have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005964
5965static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005966unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005967{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005968 return fixup(self, fixcapitalize);
5969}
5970
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split on whitespace, capitalize
   every word, then join the words again with PyUnicode_Join(). */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entries one by one */
    idx = 0;
    while (idx < PyList_GET_SIZE(words)) {
        PyObject *old = PyList_GET_ITEM(words, idx);
        PyObject *capitalized = fixup((PyUnicodeObject *)old, fixcapitalize);

        if (capitalized == NULL)
            goto done;
        Py_DECREF(old);
        PyList_SET_ITEM(words, idx, capitalized);
        idx++;
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
6008
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006009/* Argument converter. Coerces to a single unicode character */
6010
6011static int
6012convert_uc(PyObject *obj, void *addr)
6013{
6014 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6015 PyObject *uniobj;
6016 Py_UNICODE *unistr;
6017
6018 uniobj = PyUnicode_FromObject(obj);
6019 if (uniobj == NULL) {
6020 PyErr_SetString(PyExc_TypeError,
6021 "The fill character cannot be converted to Unicode");
6022 return 0;
6023 }
6024 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6025 PyErr_SetString(PyExc_TypeError,
6026 "The fill character must be exactly one character long");
6027 Py_DECREF(uniobj);
6028 return 0;
6029 }
6030 unistr = PyUnicode_AS_UNICODE(uniobj);
6031 *fillcharloc = unistr[0];
6032 Py_DECREF(uniobj);
6033 return 1;
6034}
6035
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006036PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006037"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006038\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006039Return S centered in a Unicode string of length width. Padding is\n\
6040done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006041
6042static PyObject *
6043unicode_center(PyUnicodeObject *self, PyObject *args)
6044{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006045 Py_ssize_t marg, left;
6046 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006047 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006048
Thomas Woutersde017742006-02-16 19:34:37 +00006049 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006050 return NULL;
6051
Tim Peters7a29bd52001-09-12 03:03:31 +00006052 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006053 Py_INCREF(self);
6054 return (PyObject*) self;
6055 }
6056
6057 marg = width - self->length;
6058 left = marg / 2 + (marg & width & 1);
6059
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006060 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006061}
6062
Marc-André Lemburge5034372000-08-08 08:04:29 +00006063#if 0
6064
6065/* This code should go into some future Unicode collation support
6066 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006067 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006068
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006069/* speedy UTF-16 code point order comparison */
6070/* gleaned from: */
6071/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6072
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006073static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006074{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006075 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006076 0, 0, 0, 0, 0, 0, 0, 0,
6077 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006078 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006079};
6080
Guido van Rossumd57fd912000-03-10 22:53:23 +00006081static int
6082unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6083{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006084 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006085
Guido van Rossumd57fd912000-03-10 22:53:23 +00006086 Py_UNICODE *s1 = str1->str;
6087 Py_UNICODE *s2 = str2->str;
6088
6089 len1 = str1->length;
6090 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006091
Guido van Rossumd57fd912000-03-10 22:53:23 +00006092 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006093 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006094
6095 c1 = *s1++;
6096 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006097
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006098 if (c1 > (1<<11) * 26)
6099 c1 += utf16Fixup[c1>>11];
6100 if (c2 > (1<<11) * 26)
6101 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006102 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006103
6104 if (c1 != c2)
6105 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006106
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006107 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006108 }
6109
6110 return (len1 < len2) ? -1 : (len1 != len2);
6111}
6112
Marc-André Lemburge5034372000-08-08 08:04:29 +00006113#else
6114
6115static int
6116unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6117{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006118 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006119
6120 Py_UNICODE *s1 = str1->str;
6121 Py_UNICODE *s2 = str2->str;
6122
6123 len1 = str1->length;
6124 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006125
Marc-André Lemburge5034372000-08-08 08:04:29 +00006126 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006127 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006128
Fredrik Lundh45714e92001-06-26 16:39:36 +00006129 c1 = *s1++;
6130 c2 = *s2++;
6131
6132 if (c1 != c2)
6133 return (c1 < c2) ? -1 : 1;
6134
Marc-André Lemburge5034372000-08-08 08:04:29 +00006135 len1--; len2--;
6136 }
6137
6138 return (len1 < len2) ? -1 : (len1 != len2);
6139}
6140
6141#endif
6142
Guido van Rossumd57fd912000-03-10 22:53:23 +00006143int PyUnicode_Compare(PyObject *left,
6144 PyObject *right)
6145{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006146 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6147 return unicode_compare((PyUnicodeObject *)left,
6148 (PyUnicodeObject *)right);
6149 if ((PyString_Check(left) && PyUnicode_Check(right)) ||
6150 (PyUnicode_Check(left) && PyString_Check(right))) {
6151 if (PyUnicode_Check(left))
6152 left = _PyUnicode_AsDefaultEncodedString(left, NULL);
6153 if (PyUnicode_Check(right))
6154 right = _PyUnicode_AsDefaultEncodedString(right, NULL);
6155 assert(PyString_Check(left));
6156 assert(PyString_Check(right));
6157 return PyObject_Compare(left, right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006158 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006159 PyErr_Format(PyExc_TypeError,
6160 "Can't compare %.100s and %.100s",
6161 left->ob_type->tp_name,
6162 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006163 return -1;
6164}
6165
Martin v. Löwis5b222132007-06-10 09:51:05 +00006166int
6167PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6168{
6169 int i;
6170 Py_UNICODE *id;
6171 assert(PyUnicode_Check(uni));
6172 id = PyUnicode_AS_UNICODE(uni);
6173 /* Compare Unicode string and source character set string */
6174 for (i = 0; id[i] && str[i]; i++)
6175 if (id[i] != str[i])
6176 return ((int)id[i] < (int)str[i]) ? -1 : 1;
6177 if (id[i])
6178 return 1; /* uni is longer */
6179 if (str[i])
6180 return -1; /* str is longer */
6181 return 0;
6182}
6183
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006184PyObject *PyUnicode_RichCompare(PyObject *left,
6185 PyObject *right,
6186 int op)
6187{
6188 int result;
6189
6190 result = PyUnicode_Compare(left, right);
6191 if (result == -1 && PyErr_Occurred())
6192 goto onError;
6193
6194 /* Convert the return value to a Boolean */
6195 switch (op) {
6196 case Py_EQ:
6197 result = (result == 0);
6198 break;
6199 case Py_NE:
6200 result = (result != 0);
6201 break;
6202 case Py_LE:
6203 result = (result <= 0);
6204 break;
6205 case Py_GE:
6206 result = (result >= 0);
6207 break;
6208 case Py_LT:
6209 result = (result == -1);
6210 break;
6211 case Py_GT:
6212 result = (result == 1);
6213 break;
6214 }
6215 return PyBool_FromLong(result);
6216
6217 onError:
6218
6219 /* Standard case
6220
6221 Type errors mean that PyUnicode_FromObject() could not convert
6222 one of the arguments (usually the right hand side) to Unicode,
6223 ie. we can't handle the comparison request. However, it is
6224 possible that the other object knows a comparison method, which
6225 is why we return Py_NotImplemented to give the other object a
6226 chance.
6227
6228 */
6229 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6230 PyErr_Clear();
6231 Py_INCREF(Py_NotImplemented);
6232 return Py_NotImplemented;
6233 }
6234 if (op != Py_EQ && op != Py_NE)
6235 return NULL;
6236
6237 /* Equality comparison.
6238
6239 This is a special case: we silence any PyExc_UnicodeDecodeError
6240 and instead turn it into a PyErr_UnicodeWarning.
6241
6242 */
6243 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6244 return NULL;
6245 PyErr_Clear();
Skip Montanaro46fc3372007-08-12 11:44:53 +00006246 if (PyErr_WarnEx(PyExc_UnicodeWarning,
6247 (op == Py_EQ) ?
6248 "Unicode equal comparison "
6249 "failed to convert both arguments to Unicode - "
6250 "interpreting them as being unequal"
6251 :
6252 "Unicode unequal comparison "
6253 "failed to convert both arguments to Unicode - "
6254 "interpreting them as being unequal",
6255 1) < 0)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006256 return NULL;
6257 result = (op == Py_NE);
6258 return PyBool_FromLong(result);
6259}
6260
Guido van Rossum403d68b2000-03-13 15:55:09 +00006261int PyUnicode_Contains(PyObject *container,
6262 PyObject *element)
6263{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006264 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006265 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006266
6267 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006268 sub = PyUnicode_FromObject(element);
6269 if (!sub) {
Walter Dörwald26e0f512007-06-12 16:51:31 +00006270 PyErr_Format(PyExc_TypeError,
6271 "'in <string>' requires string as left operand, not %s",
6272 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006273 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006274 }
6275
Thomas Wouters477c8d52006-05-27 19:21:47 +00006276 str = PyUnicode_FromObject(container);
6277 if (!str) {
6278 Py_DECREF(sub);
6279 return -1;
6280 }
6281
6282 result = stringlib_contains_obj(str, sub);
6283
6284 Py_DECREF(str);
6285 Py_DECREF(sub);
6286
Guido van Rossum403d68b2000-03-13 15:55:09 +00006287 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006288}
6289
Guido van Rossumd57fd912000-03-10 22:53:23 +00006290/* Concat to string or Unicode object giving a new Unicode object. */
6291
6292PyObject *PyUnicode_Concat(PyObject *left,
6293 PyObject *right)
6294{
6295 PyUnicodeObject *u = NULL, *v = NULL, *w;
6296
Guido van Rossum84d79dd2007-04-13 02:23:57 +00006297 if (PyBytes_Check(left) || PyBytes_Check(right))
6298 return PyBytes_Concat(left, right);
6299
Guido van Rossumd57fd912000-03-10 22:53:23 +00006300 /* Coerce the two arguments */
6301 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6302 if (u == NULL)
6303 goto onError;
6304 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6305 if (v == NULL)
6306 goto onError;
6307
6308 /* Shortcuts */
6309 if (v == unicode_empty) {
6310 Py_DECREF(v);
6311 return (PyObject *)u;
6312 }
6313 if (u == unicode_empty) {
6314 Py_DECREF(u);
6315 return (PyObject *)v;
6316 }
6317
6318 /* Concat the two Unicode strings */
6319 w = _PyUnicode_New(u->length + v->length);
6320 if (w == NULL)
6321 goto onError;
6322 Py_UNICODE_COPY(w->str, u->str, u->length);
6323 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6324
6325 Py_DECREF(u);
6326 Py_DECREF(v);
6327 return (PyObject *)w;
6328
6329onError:
6330 Py_XDECREF(u);
6331 Py_XDECREF(v);
6332 return NULL;
6333}
6334
Walter Dörwald1ab83302007-05-18 17:15:44 +00006335void
6336PyUnicode_Append(PyObject **pleft, PyObject *right)
6337{
6338 PyObject *new;
6339 if (*pleft == NULL)
6340 return;
6341 if (right == NULL || !PyUnicode_Check(*pleft)) {
6342 Py_DECREF(*pleft);
6343 *pleft = NULL;
6344 return;
6345 }
6346 new = PyUnicode_Concat(*pleft, right);
6347 Py_DECREF(*pleft);
6348 *pleft = new;
6349}
6350
6351void
6352PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6353{
6354 PyUnicode_Append(pleft, right);
6355 Py_XDECREF(right);
6356}
6357
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006358PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006359"S.count(sub[, start[, end]]) -> int\n\
6360\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006361Return the number of non-overlapping occurrences of substring sub in\n\
6362Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006363interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006364
6365static PyObject *
6366unicode_count(PyUnicodeObject *self, PyObject *args)
6367{
6368 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006369 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006370 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006371 PyObject *result;
6372
Guido van Rossumb8872e62000-05-09 14:14:27 +00006373 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6374 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006375 return NULL;
6376
6377 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006378 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006379 if (substring == NULL)
6380 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006381
Thomas Wouters477c8d52006-05-27 19:21:47 +00006382 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006383
Thomas Wouters477c8d52006-05-27 19:21:47 +00006384 result = PyInt_FromSsize_t(
6385 stringlib_count(self->str + start, end - start,
6386 substring->str, substring->length)
6387 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006388
6389 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006390
Guido van Rossumd57fd912000-03-10 22:53:23 +00006391 return result;
6392}
6393
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006394PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006395"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006396\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006397Encodes S using the codec registered for encoding. encoding defaults\n\
6398to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006399handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006400a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6401'xmlcharrefreplace' as well as any other name registered with\n\
6402codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006403
6404static PyObject *
6405unicode_encode(PyUnicodeObject *self, PyObject *args)
6406{
6407 char *encoding = NULL;
6408 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006409 PyObject *v;
6410
Guido van Rossumd57fd912000-03-10 22:53:23 +00006411 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6412 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006413 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006414 if (v == NULL)
6415 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006416 if (!PyBytes_Check(v)) {
Guido van Rossum4355a472007-05-04 05:00:04 +00006417 if (PyString_Check(v)) {
6418 /* Old codec, turn it into bytes */
6419 PyObject *b = PyBytes_FromObject(v);
6420 Py_DECREF(v);
6421 return b;
6422 }
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006423 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006424 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006425 "(type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00006426 Py_Type(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006427 Py_DECREF(v);
6428 return NULL;
6429 }
6430 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006431
6432 onError:
6433 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006434}
6435
6436PyDoc_STRVAR(decode__doc__,
6437"S.decode([encoding[,errors]]) -> string or unicode\n\
6438\n\
6439Decodes S using the codec registered for encoding. encoding defaults\n\
6440to the default encoding. errors may be given to set a different error\n\
6441handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6442a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6443as well as any other name registerd with codecs.register_error that is\n\
6444able to handle UnicodeDecodeErrors.");
6445
6446static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00006447unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006448{
6449 char *encoding = NULL;
6450 char *errors = NULL;
6451 PyObject *v;
6452
6453 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
6454 return NULL;
6455 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006456 if (v == NULL)
6457 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006458 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6459 PyErr_Format(PyExc_TypeError,
6460 "decoder did not return a string/unicode object "
6461 "(type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00006462 Py_Type(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006463 Py_DECREF(v);
6464 return NULL;
6465 }
6466 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006467
6468 onError:
6469 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006470}
6471
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006472PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006473"S.expandtabs([tabsize]) -> unicode\n\
6474\n\
6475Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006476If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006477
6478static PyObject*
6479unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6480{
6481 Py_UNICODE *e;
6482 Py_UNICODE *p;
6483 Py_UNICODE *q;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006484 Py_ssize_t i, j, old_j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006485 PyUnicodeObject *u;
6486 int tabsize = 8;
6487
6488 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6489 return NULL;
6490
Thomas Wouters7e474022000-07-16 12:04:32 +00006491 /* First pass: determine size of output string */
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006492 i = j = old_j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006493 e = self->str + self->length;
6494 for (p = self->str; p < e; p++)
6495 if (*p == '\t') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006496 if (tabsize > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006497 j += tabsize - (j % tabsize);
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006498 if (old_j > j) {
6499 PyErr_SetString(PyExc_OverflowError,
6500 "new string is too long");
6501 return NULL;
6502 }
6503 old_j = j;
6504 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006505 }
6506 else {
6507 j++;
6508 if (*p == '\n' || *p == '\r') {
6509 i += j;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006510 old_j = j = 0;
6511 if (i < 0) {
6512 PyErr_SetString(PyExc_OverflowError,
6513 "new string is too long");
6514 return NULL;
6515 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006516 }
6517 }
6518
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006519 if ((i + j) < 0) {
6520 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6521 return NULL;
6522 }
6523
Guido van Rossumd57fd912000-03-10 22:53:23 +00006524 /* Second pass: create output string and fill it */
6525 u = _PyUnicode_New(i + j);
6526 if (!u)
6527 return NULL;
6528
6529 j = 0;
6530 q = u->str;
6531
6532 for (p = self->str; p < e; p++)
6533 if (*p == '\t') {
6534 if (tabsize > 0) {
6535 i = tabsize - (j % tabsize);
6536 j += i;
6537 while (i--)
6538 *q++ = ' ';
6539 }
6540 }
6541 else {
6542 j++;
6543 *q++ = *p;
6544 if (*p == '\n' || *p == '\r')
6545 j = 0;
6546 }
6547
6548 return (PyObject*) u;
6549}
6550
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006551PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006552"S.find(sub [,start [,end]]) -> int\n\
6553\n\
6554Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00006555such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006556arguments start and end are interpreted as in slice notation.\n\
6557\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006558Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006559
6560static PyObject *
6561unicode_find(PyUnicodeObject *self, PyObject *args)
6562{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006563 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006564 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006565 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006566 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006567
Guido van Rossumb8872e62000-05-09 14:14:27 +00006568 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
6569 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006570 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006571 substring = PyUnicode_FromObject(substring);
6572 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006573 return NULL;
6574
Thomas Wouters477c8d52006-05-27 19:21:47 +00006575 result = stringlib_find_slice(
6576 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6577 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6578 start, end
6579 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006580
6581 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006582
6583 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006584}
6585
6586static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006587unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006588{
6589 if (index < 0 || index >= self->length) {
6590 PyErr_SetString(PyExc_IndexError, "string index out of range");
6591 return NULL;
6592 }
6593
6594 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6595}
6596
6597static long
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006598unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006599{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006600 /* Since Unicode objects compare equal to their UTF-8 string
6601 counterparts, we hash the UTF-8 string. */
6602 PyObject *v = _PyUnicode_AsDefaultEncodedString(self, NULL);
6603 return PyObject_Hash(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006604}
6605
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006606PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006607"S.index(sub [,start [,end]]) -> int\n\
6608\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006609Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006610
6611static PyObject *
6612unicode_index(PyUnicodeObject *self, PyObject *args)
6613{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006614 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006615 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006616 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006617 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006618
Guido van Rossumb8872e62000-05-09 14:14:27 +00006619 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
6620 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006621 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006622 substring = PyUnicode_FromObject(substring);
6623 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006624 return NULL;
6625
Thomas Wouters477c8d52006-05-27 19:21:47 +00006626 result = stringlib_find_slice(
6627 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6628 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6629 start, end
6630 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006631
6632 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006633
Guido van Rossumd57fd912000-03-10 22:53:23 +00006634 if (result < 0) {
6635 PyErr_SetString(PyExc_ValueError, "substring not found");
6636 return NULL;
6637 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006638
Martin v. Löwis18e16552006-02-15 17:27:45 +00006639 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006640}
6641
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006642PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006643"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006644\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006645Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006646at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006647
6648static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006649unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006650{
6651 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6652 register const Py_UNICODE *e;
6653 int cased;
6654
Guido van Rossumd57fd912000-03-10 22:53:23 +00006655 /* Shortcut for single character strings */
6656 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006657 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006658
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006659 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006660 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006661 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006662
Guido van Rossumd57fd912000-03-10 22:53:23 +00006663 e = p + PyUnicode_GET_SIZE(self);
6664 cased = 0;
6665 for (; p < e; p++) {
6666 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006667
Guido van Rossumd57fd912000-03-10 22:53:23 +00006668 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006669 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006670 else if (!cased && Py_UNICODE_ISLOWER(ch))
6671 cased = 1;
6672 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006673 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006674}
6675
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006676PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006677"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006678\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006679Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006680at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006681
6682static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006683unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006684{
6685 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6686 register const Py_UNICODE *e;
6687 int cased;
6688
Guido van Rossumd57fd912000-03-10 22:53:23 +00006689 /* Shortcut for single character strings */
6690 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006691 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006692
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006693 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006694 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006695 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006696
Guido van Rossumd57fd912000-03-10 22:53:23 +00006697 e = p + PyUnicode_GET_SIZE(self);
6698 cased = 0;
6699 for (; p < e; p++) {
6700 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006701
Guido van Rossumd57fd912000-03-10 22:53:23 +00006702 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006703 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006704 else if (!cased && Py_UNICODE_ISUPPER(ch))
6705 cased = 1;
6706 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006707 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006708}
6709
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006710PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006711"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006712\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006713Return True if S is a titlecased string and there is at least one\n\
6714character in S, i.e. upper- and titlecase characters may only\n\
6715follow uncased characters and lowercase characters only cased ones.\n\
6716Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006717
6718static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006719unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006720{
6721 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6722 register const Py_UNICODE *e;
6723 int cased, previous_is_cased;
6724
Guido van Rossumd57fd912000-03-10 22:53:23 +00006725 /* Shortcut for single character strings */
6726 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006727 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6728 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006729
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006730 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006731 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006732 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006733
Guido van Rossumd57fd912000-03-10 22:53:23 +00006734 e = p + PyUnicode_GET_SIZE(self);
6735 cased = 0;
6736 previous_is_cased = 0;
6737 for (; p < e; p++) {
6738 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006739
Guido van Rossumd57fd912000-03-10 22:53:23 +00006740 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6741 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006742 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006743 previous_is_cased = 1;
6744 cased = 1;
6745 }
6746 else if (Py_UNICODE_ISLOWER(ch)) {
6747 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006748 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006749 previous_is_cased = 1;
6750 cased = 1;
6751 }
6752 else
6753 previous_is_cased = 0;
6754 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006755 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006756}
6757
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006758PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006759"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006760\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006761Return True if all characters in S are whitespace\n\
6762and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006763
6764static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006765unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006766{
6767 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6768 register const Py_UNICODE *e;
6769
Guido van Rossumd57fd912000-03-10 22:53:23 +00006770 /* Shortcut for single character strings */
6771 if (PyUnicode_GET_SIZE(self) == 1 &&
6772 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006773 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006774
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006775 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006776 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006777 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006778
Guido van Rossumd57fd912000-03-10 22:53:23 +00006779 e = p + PyUnicode_GET_SIZE(self);
6780 for (; p < e; p++) {
6781 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006782 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006783 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006784 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006785}
6786
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006787PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006788"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006789\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006790Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006791and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006792
6793static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006794unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006795{
6796 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6797 register const Py_UNICODE *e;
6798
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006799 /* Shortcut for single character strings */
6800 if (PyUnicode_GET_SIZE(self) == 1 &&
6801 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006802 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006803
6804 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006805 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006806 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006807
6808 e = p + PyUnicode_GET_SIZE(self);
6809 for (; p < e; p++) {
6810 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006811 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006812 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006813 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006814}
6815
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006816PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006817"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006818\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006819Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006820and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006821
6822static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006823unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006824{
6825 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6826 register const Py_UNICODE *e;
6827
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006828 /* Shortcut for single character strings */
6829 if (PyUnicode_GET_SIZE(self) == 1 &&
6830 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006831 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006832
6833 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006834 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006835 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006836
6837 e = p + PyUnicode_GET_SIZE(self);
6838 for (; p < e; p++) {
6839 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006840 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006841 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006842 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006843}
6844
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006845PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006846"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006847\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006848Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006849False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006850
6851static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006852unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006853{
6854 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6855 register const Py_UNICODE *e;
6856
Guido van Rossumd57fd912000-03-10 22:53:23 +00006857 /* Shortcut for single character strings */
6858 if (PyUnicode_GET_SIZE(self) == 1 &&
6859 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006860 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006861
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006862 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006863 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006864 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006865
Guido van Rossumd57fd912000-03-10 22:53:23 +00006866 e = p + PyUnicode_GET_SIZE(self);
6867 for (; p < e; p++) {
6868 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006869 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006870 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006871 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006872}
6873
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006874PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006875"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006876\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006877Return True if all characters in S are digits\n\
6878and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006879
6880static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006881unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006882{
6883 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6884 register const Py_UNICODE *e;
6885
Guido van Rossumd57fd912000-03-10 22:53:23 +00006886 /* Shortcut for single character strings */
6887 if (PyUnicode_GET_SIZE(self) == 1 &&
6888 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006889 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006890
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006891 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006892 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006893 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006894
Guido van Rossumd57fd912000-03-10 22:53:23 +00006895 e = p + PyUnicode_GET_SIZE(self);
6896 for (; p < e; p++) {
6897 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006898 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006899 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006900 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006901}
6902
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006903PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006904"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006905\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006906Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006907False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006908
6909static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006910unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006911{
6912 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6913 register const Py_UNICODE *e;
6914
Guido van Rossumd57fd912000-03-10 22:53:23 +00006915 /* Shortcut for single character strings */
6916 if (PyUnicode_GET_SIZE(self) == 1 &&
6917 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006918 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006919
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006920 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006921 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006922 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006923
Guido van Rossumd57fd912000-03-10 22:53:23 +00006924 e = p + PyUnicode_GET_SIZE(self);
6925 for (; p < e; p++) {
6926 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006927 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006928 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006929 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006930}
6931
Martin v. Löwis47383402007-08-15 07:32:56 +00006932int
6933PyUnicode_IsIdentifier(PyObject *self)
6934{
6935 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
6936 register const Py_UNICODE *e;
6937
6938 /* Special case for empty strings */
6939 if (PyUnicode_GET_SIZE(self) == 0)
6940 return 0;
6941
6942 /* PEP 3131 says that the first character must be in
6943 XID_Start and subsequent characters in XID_Continue,
6944 and for the ASCII range, the 2.x rules apply (i.e
6945 start with letters and underscore, continue with
6946 letters, digits, underscore). However, given the current
6947 definition of XID_Start and XID_Continue, it is sufficient
6948 to check just for these, except that _ must be allowed
6949 as starting an identifier. */
6950 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
6951 return 0;
6952
6953 e = p + PyUnicode_GET_SIZE(self);
6954 for (p++; p < e; p++) {
6955 if (!_PyUnicode_IsXidContinue(*p))
6956 return 0;
6957 }
6958 return 1;
6959}
6960
6961PyDoc_STRVAR(isidentifier__doc__,
6962"S.isidentifier() -> bool\n\
6963\n\
6964Return True if S is a valid identifier according\n\
6965to the language definition.");
6966
6967static PyObject*
6968unicode_isidentifier(PyObject *self)
6969{
6970 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
6971}
6972
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006973PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006974"S.join(sequence) -> unicode\n\
6975\n\
6976Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006977sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006978
6979static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006980unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006981{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006982 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006983}
6984
Martin v. Löwis18e16552006-02-15 17:27:45 +00006985static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006986unicode_length(PyUnicodeObject *self)
6987{
6988 return self->length;
6989}
6990
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006991PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006992"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006993\n\
6994Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006995done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006996
6997static PyObject *
6998unicode_ljust(PyUnicodeObject *self, PyObject *args)
6999{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007000 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007001 Py_UNICODE fillchar = ' ';
7002
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007003 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007004 return NULL;
7005
Tim Peters7a29bd52001-09-12 03:03:31 +00007006 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007007 Py_INCREF(self);
7008 return (PyObject*) self;
7009 }
7010
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007011 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007012}
7013
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007014PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007015"S.lower() -> unicode\n\
7016\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007017Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007018
7019static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007020unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007021{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007022 return fixup(self, fixlower);
7023}
7024
/* Strip-type selectors.  Their values double as indexes into
   stripformat[] below, so keep the two in step. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format string for each strip type, indexed by the
   selectors above. */
static const char *stripformat[] = {
    "|O:lstrip",        /* LEFTSTRIP */
    "|O:rstrip",        /* RIGHTSTRIP */
    "|O:strip"          /* BOTHSTRIP */
};

/* Method name for strip type i: the format string with its three-character
   "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
7033
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007034/* externally visible for str.strip(unicode) */
7035PyObject *
7036_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7037{
7038 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007039 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007040 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007041 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7042 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007043
Thomas Wouters477c8d52006-05-27 19:21:47 +00007044 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7045
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007046 i = 0;
7047 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007048 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7049 i++;
7050 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007051 }
7052
7053 j = len;
7054 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007055 do {
7056 j--;
7057 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7058 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007059 }
7060
7061 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007062 Py_INCREF(self);
7063 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007064 }
7065 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00007066 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007067}
7068
Guido van Rossumd57fd912000-03-10 22:53:23 +00007069
7070static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007071do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007072{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007073 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007074 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007075
7076 i = 0;
7077 if (striptype != RIGHTSTRIP) {
7078 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7079 i++;
7080 }
7081 }
7082
7083 j = len;
7084 if (striptype != LEFTSTRIP) {
7085 do {
7086 j--;
7087 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7088 j++;
7089 }
7090
7091 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7092 Py_INCREF(self);
7093 return (PyObject*)self;
7094 }
7095 else
7096 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007097}
7098
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007099
7100static PyObject *
7101do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7102{
7103 PyObject *sep = NULL;
7104
7105 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7106 return NULL;
7107
7108 if (sep != NULL && sep != Py_None) {
7109 if (PyUnicode_Check(sep))
7110 return _PyUnicode_XStrip(self, striptype, sep);
7111 else if (PyString_Check(sep)) {
7112 PyObject *res;
7113 sep = PyUnicode_FromObject(sep);
7114 if (sep==NULL)
7115 return NULL;
7116 res = _PyUnicode_XStrip(self, striptype, sep);
7117 Py_DECREF(sep);
7118 return res;
7119 }
7120 else {
7121 PyErr_Format(PyExc_TypeError,
7122 "%s arg must be None, unicode or str",
7123 STRIPNAME(striptype));
7124 return NULL;
7125 }
7126 }
7127
7128 return do_strip(self, striptype);
7129}
7130
7131
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007132PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007133"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007134\n\
7135Return a copy of the string S with leading and trailing\n\
7136whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007137If chars is given and not None, remove characters in chars instead.\n\
7138If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007139
7140static PyObject *
7141unicode_strip(PyUnicodeObject *self, PyObject *args)
7142{
7143 if (PyTuple_GET_SIZE(args) == 0)
7144 return do_strip(self, BOTHSTRIP); /* Common case */
7145 else
7146 return do_argstrip(self, BOTHSTRIP, args);
7147}
7148
7149
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007150PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007151"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007152\n\
7153Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007154If chars is given and not None, remove characters in chars instead.\n\
7155If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007156
7157static PyObject *
7158unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7159{
7160 if (PyTuple_GET_SIZE(args) == 0)
7161 return do_strip(self, LEFTSTRIP); /* Common case */
7162 else
7163 return do_argstrip(self, LEFTSTRIP, args);
7164}
7165
7166
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007167PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007168"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007169\n\
7170Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007171If chars is given and not None, remove characters in chars instead.\n\
7172If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007173
7174static PyObject *
7175unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7176{
7177 if (PyTuple_GET_SIZE(args) == 0)
7178 return do_strip(self, RIGHTSTRIP); /* Common case */
7179 else
7180 return do_argstrip(self, RIGHTSTRIP, args);
7181}
7182
7183
Guido van Rossumd57fd912000-03-10 22:53:23 +00007184static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007185unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007186{
7187 PyUnicodeObject *u;
7188 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007189 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007190 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007191
7192 if (len < 0)
7193 len = 0;
7194
Tim Peters7a29bd52001-09-12 03:03:31 +00007195 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007196 /* no repeat, return original string */
7197 Py_INCREF(str);
7198 return (PyObject*) str;
7199 }
Tim Peters8f422462000-09-09 06:13:41 +00007200
7201 /* ensure # of chars needed doesn't overflow int and # of bytes
7202 * needed doesn't overflow size_t
7203 */
7204 nchars = len * str->length;
7205 if (len && nchars / len != str->length) {
7206 PyErr_SetString(PyExc_OverflowError,
7207 "repeated string is too long");
7208 return NULL;
7209 }
7210 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7211 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7212 PyErr_SetString(PyExc_OverflowError,
7213 "repeated string is too long");
7214 return NULL;
7215 }
7216 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007217 if (!u)
7218 return NULL;
7219
7220 p = u->str;
7221
Thomas Wouters477c8d52006-05-27 19:21:47 +00007222 if (str->length == 1 && len > 0) {
7223 Py_UNICODE_FILL(p, str->str[0], len);
7224 } else {
7225 Py_ssize_t done = 0; /* number of characters copied this far */
7226 if (done < nchars) {
7227 Py_UNICODE_COPY(p, str->str, str->length);
7228 done = str->length;
7229 }
7230 while (done < nchars) {
7231 int n = (done <= nchars-done) ? done : nchars-done;
7232 Py_UNICODE_COPY(p+done, p, n);
7233 done += n;
7234 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007235 }
7236
7237 return (PyObject*) u;
7238}
7239
7240PyObject *PyUnicode_Replace(PyObject *obj,
7241 PyObject *subobj,
7242 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007243 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007244{
7245 PyObject *self;
7246 PyObject *str1;
7247 PyObject *str2;
7248 PyObject *result;
7249
7250 self = PyUnicode_FromObject(obj);
7251 if (self == NULL)
7252 return NULL;
7253 str1 = PyUnicode_FromObject(subobj);
7254 if (str1 == NULL) {
7255 Py_DECREF(self);
7256 return NULL;
7257 }
7258 str2 = PyUnicode_FromObject(replobj);
7259 if (str2 == NULL) {
7260 Py_DECREF(self);
7261 Py_DECREF(str1);
7262 return NULL;
7263 }
Tim Petersced69f82003-09-16 20:30:58 +00007264 result = replace((PyUnicodeObject *)self,
7265 (PyUnicodeObject *)str1,
7266 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007267 maxcount);
7268 Py_DECREF(self);
7269 Py_DECREF(str1);
7270 Py_DECREF(str2);
7271 return result;
7272}
7273
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007274PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007275"S.replace (old, new[, maxsplit]) -> unicode\n\
7276\n\
7277Return a copy of S with all occurrences of substring\n\
7278old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007279given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007280
7281static PyObject*
7282unicode_replace(PyUnicodeObject *self, PyObject *args)
7283{
7284 PyUnicodeObject *str1;
7285 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007286 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007287 PyObject *result;
7288
Martin v. Löwis18e16552006-02-15 17:27:45 +00007289 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007290 return NULL;
7291 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7292 if (str1 == NULL)
7293 return NULL;
7294 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007295 if (str2 == NULL) {
7296 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007297 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007298 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007299
7300 result = replace(self, str1, str2, maxcount);
7301
7302 Py_DECREF(str1);
7303 Py_DECREF(str2);
7304 return result;
7305}
7306
7307static
7308PyObject *unicode_repr(PyObject *unicode)
7309{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007310 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007311 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007312 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7313 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7314
7315 /* XXX(nnorwitz): rather than over-allocating, it would be
7316 better to choose a different scheme. Perhaps scan the
7317 first N-chars of the string and allocate based on that size.
7318 */
7319 /* Initial allocation is based on the longest-possible unichr
7320 escape.
7321
7322 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7323 unichr, so in this case it's the longest unichr escape. In
7324 narrow (UTF-16) builds this is five chars per source unichr
7325 since there are two unichrs in the surrogate pair, so in narrow
7326 (UTF-16) builds it's not the longest unichr escape.
7327
7328 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7329 so in the narrow (UTF-16) build case it's the longest unichr
7330 escape.
7331 */
7332
Walter Dörwald1ab83302007-05-18 17:15:44 +00007333 repr = PyUnicode_FromUnicode(NULL,
Walter Dörwald79e913e2007-05-12 11:08:06 +00007334 2 /* quotes */
7335#ifdef Py_UNICODE_WIDE
7336 + 10*size
7337#else
7338 + 6*size
7339#endif
7340 + 1);
7341 if (repr == NULL)
7342 return NULL;
7343
Walter Dörwald1ab83302007-05-18 17:15:44 +00007344 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007345
7346 /* Add quote */
7347 *p++ = (findchar(s, size, '\'') &&
7348 !findchar(s, size, '"')) ? '"' : '\'';
7349 while (size-- > 0) {
7350 Py_UNICODE ch = *s++;
7351
7352 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007353 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007354 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007355 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007356 continue;
7357 }
7358
7359#ifdef Py_UNICODE_WIDE
7360 /* Map 21-bit characters to '\U00xxxxxx' */
7361 else if (ch >= 0x10000) {
7362 *p++ = '\\';
7363 *p++ = 'U';
7364 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
7365 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
7366 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
7367 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
7368 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
7369 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
7370 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
7371 *p++ = hexdigits[ch & 0x0000000F];
7372 continue;
7373 }
7374#else
7375 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
7376 else if (ch >= 0xD800 && ch < 0xDC00) {
7377 Py_UNICODE ch2;
7378 Py_UCS4 ucs;
7379
7380 ch2 = *s++;
7381 size--;
7382 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
7383 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
7384 *p++ = '\\';
7385 *p++ = 'U';
7386 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7387 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7388 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7389 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7390 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7391 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7392 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7393 *p++ = hexdigits[ucs & 0x0000000F];
7394 continue;
7395 }
7396 /* Fall through: isolated surrogates are copied as-is */
7397 s--;
7398 size++;
7399 }
7400#endif
7401
7402 /* Map 16-bit characters to '\uxxxx' */
7403 if (ch >= 256) {
7404 *p++ = '\\';
7405 *p++ = 'u';
7406 *p++ = hexdigits[(ch >> 12) & 0x000F];
7407 *p++ = hexdigits[(ch >> 8) & 0x000F];
7408 *p++ = hexdigits[(ch >> 4) & 0x000F];
7409 *p++ = hexdigits[ch & 0x000F];
7410 }
7411
7412 /* Map special whitespace to '\t', \n', '\r' */
7413 else if (ch == '\t') {
7414 *p++ = '\\';
7415 *p++ = 't';
7416 }
7417 else if (ch == '\n') {
7418 *p++ = '\\';
7419 *p++ = 'n';
7420 }
7421 else if (ch == '\r') {
7422 *p++ = '\\';
7423 *p++ = 'r';
7424 }
7425
7426 /* Map non-printable US ASCII to '\xhh' */
7427 else if (ch < ' ' || ch >= 0x7F) {
7428 *p++ = '\\';
7429 *p++ = 'x';
7430 *p++ = hexdigits[(ch >> 4) & 0x000F];
7431 *p++ = hexdigits[ch & 0x000F];
7432 }
7433
7434 /* Copy everything else as-is */
7435 else
7436 *p++ = (char) ch;
7437 }
7438 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007439 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00007440
7441 *p = '\0';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007442 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00007443 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007444}
7445
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007446PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007447"S.rfind(sub [,start [,end]]) -> int\n\
7448\n\
7449Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007450such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007451arguments start and end are interpreted as in slice notation.\n\
7452\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007453Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007454
7455static PyObject *
7456unicode_rfind(PyUnicodeObject *self, PyObject *args)
7457{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007458 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007459 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007460 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007461 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007462
Guido van Rossumb8872e62000-05-09 14:14:27 +00007463 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
7464 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007465 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007466 substring = PyUnicode_FromObject(substring);
7467 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007468 return NULL;
7469
Thomas Wouters477c8d52006-05-27 19:21:47 +00007470 result = stringlib_rfind_slice(
7471 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7472 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7473 start, end
7474 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007475
7476 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007477
7478 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007479}
7480
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007481PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007482"S.rindex(sub [,start [,end]]) -> int\n\
7483\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007484Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007485
7486static PyObject *
7487unicode_rindex(PyUnicodeObject *self, PyObject *args)
7488{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007489 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007490 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007491 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007492 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007493
Guido van Rossumb8872e62000-05-09 14:14:27 +00007494 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
7495 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007496 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007497 substring = PyUnicode_FromObject(substring);
7498 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007499 return NULL;
7500
Thomas Wouters477c8d52006-05-27 19:21:47 +00007501 result = stringlib_rfind_slice(
7502 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7503 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7504 start, end
7505 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007506
7507 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007508
Guido van Rossumd57fd912000-03-10 22:53:23 +00007509 if (result < 0) {
7510 PyErr_SetString(PyExc_ValueError, "substring not found");
7511 return NULL;
7512 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007513 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007514}
7515
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007516PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007517"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007518\n\
7519Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007520done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007521
7522static PyObject *
7523unicode_rjust(PyUnicodeObject *self, PyObject *args)
7524{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007525 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007526 Py_UNICODE fillchar = ' ';
7527
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007528 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007529 return NULL;
7530
Tim Peters7a29bd52001-09-12 03:03:31 +00007531 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007532 Py_INCREF(self);
7533 return (PyObject*) self;
7534 }
7535
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007536 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007537}
7538
Guido van Rossumd57fd912000-03-10 22:53:23 +00007539static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007540unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007541{
7542 /* standard clamping */
7543 if (start < 0)
7544 start = 0;
7545 if (end < 0)
7546 end = 0;
7547 if (end > self->length)
7548 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007549 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007550 /* full slice, return original string */
7551 Py_INCREF(self);
7552 return (PyObject*) self;
7553 }
7554 if (start > end)
7555 start = end;
7556 /* copy slice */
7557 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7558 end - start);
7559}
7560
7561PyObject *PyUnicode_Split(PyObject *s,
7562 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007563 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007564{
7565 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007566
Guido van Rossumd57fd912000-03-10 22:53:23 +00007567 s = PyUnicode_FromObject(s);
7568 if (s == NULL)
7569 return NULL;
7570 if (sep != NULL) {
7571 sep = PyUnicode_FromObject(sep);
7572 if (sep == NULL) {
7573 Py_DECREF(s);
7574 return NULL;
7575 }
7576 }
7577
7578 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7579
7580 Py_DECREF(s);
7581 Py_XDECREF(sep);
7582 return result;
7583}
7584
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007585PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007586"S.split([sep [,maxsplit]]) -> list of strings\n\
7587\n\
7588Return a list of the words in S, using sep as the\n\
7589delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00007590splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00007591any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007592
7593static PyObject*
7594unicode_split(PyUnicodeObject *self, PyObject *args)
7595{
7596 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007597 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007598
Martin v. Löwis18e16552006-02-15 17:27:45 +00007599 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007600 return NULL;
7601
7602 if (substring == Py_None)
7603 return split(self, NULL, maxcount);
7604 else if (PyUnicode_Check(substring))
7605 return split(self, (PyUnicodeObject *)substring, maxcount);
7606 else
7607 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7608}
7609
Thomas Wouters477c8d52006-05-27 19:21:47 +00007610PyObject *
7611PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7612{
7613 PyObject* str_obj;
7614 PyObject* sep_obj;
7615 PyObject* out;
7616
7617 str_obj = PyUnicode_FromObject(str_in);
7618 if (!str_obj)
7619 return NULL;
7620 sep_obj = PyUnicode_FromObject(sep_in);
7621 if (!sep_obj) {
7622 Py_DECREF(str_obj);
7623 return NULL;
7624 }
7625
7626 out = stringlib_partition(
7627 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7628 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7629 );
7630
7631 Py_DECREF(sep_obj);
7632 Py_DECREF(str_obj);
7633
7634 return out;
7635}
7636
7637
7638PyObject *
7639PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7640{
7641 PyObject* str_obj;
7642 PyObject* sep_obj;
7643 PyObject* out;
7644
7645 str_obj = PyUnicode_FromObject(str_in);
7646 if (!str_obj)
7647 return NULL;
7648 sep_obj = PyUnicode_FromObject(sep_in);
7649 if (!sep_obj) {
7650 Py_DECREF(str_obj);
7651 return NULL;
7652 }
7653
7654 out = stringlib_rpartition(
7655 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7656 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7657 );
7658
7659 Py_DECREF(sep_obj);
7660 Py_DECREF(str_obj);
7661
7662 return out;
7663}
7664
7665PyDoc_STRVAR(partition__doc__,
7666"S.partition(sep) -> (head, sep, tail)\n\
7667\n\
7668Searches for the separator sep in S, and returns the part before it,\n\
7669the separator itself, and the part after it. If the separator is not\n\
7670found, returns S and two empty strings.");
7671
7672static PyObject*
7673unicode_partition(PyUnicodeObject *self, PyObject *separator)
7674{
7675 return PyUnicode_Partition((PyObject *)self, separator);
7676}
7677
7678PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007679"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007680\n\
7681Searches for the separator sep in S, starting at the end of S, and returns\n\
7682the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00007683separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007684
7685static PyObject*
7686unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7687{
7688 return PyUnicode_RPartition((PyObject *)self, separator);
7689}
7690
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007691PyObject *PyUnicode_RSplit(PyObject *s,
7692 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007693 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007694{
7695 PyObject *result;
7696
7697 s = PyUnicode_FromObject(s);
7698 if (s == NULL)
7699 return NULL;
7700 if (sep != NULL) {
7701 sep = PyUnicode_FromObject(sep);
7702 if (sep == NULL) {
7703 Py_DECREF(s);
7704 return NULL;
7705 }
7706 }
7707
7708 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7709
7710 Py_DECREF(s);
7711 Py_XDECREF(sep);
7712 return result;
7713}
7714
7715PyDoc_STRVAR(rsplit__doc__,
7716"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7717\n\
7718Return a list of the words in S, using sep as the\n\
7719delimiter string, starting at the end of the string and\n\
7720working to the front. If maxsplit is given, at most maxsplit\n\
7721splits are done. If sep is not specified, any whitespace string\n\
7722is a separator.");
7723
7724static PyObject*
7725unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7726{
7727 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007728 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007729
Martin v. Löwis18e16552006-02-15 17:27:45 +00007730 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007731 return NULL;
7732
7733 if (substring == Py_None)
7734 return rsplit(self, NULL, maxcount);
7735 else if (PyUnicode_Check(substring))
7736 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7737 else
7738 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7739}
7740
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007741PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007742"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007743\n\
7744Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007745Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007746is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007747
7748static PyObject*
7749unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7750{
Guido van Rossum86662912000-04-11 15:38:46 +00007751 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007752
Guido van Rossum86662912000-04-11 15:38:46 +00007753 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007754 return NULL;
7755
Guido van Rossum86662912000-04-11 15:38:46 +00007756 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007757}
7758
7759static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007760PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007761{
Walter Dörwald346737f2007-05-31 10:44:43 +00007762 if (PyUnicode_CheckExact(self)) {
7763 Py_INCREF(self);
7764 return self;
7765 } else
7766 /* Subtype -- return genuine unicode string with the same value. */
7767 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
7768 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007769}
7770
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007771PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007772"S.swapcase() -> unicode\n\
7773\n\
7774Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007775and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007776
7777static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007778unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007779{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007780 return fixup(self, fixswapcase);
7781}
7782
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007783PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007784"S.translate(table) -> unicode\n\
7785\n\
7786Return a copy of the string S, where all characters have been mapped\n\
7787through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007788Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7789Unmapped characters are left untouched. Characters mapped to None\n\
7790are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007791
7792static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007793unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007794{
Tim Petersced69f82003-09-16 20:30:58 +00007795 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007796 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00007797 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007798 "ignore");
7799}
7800
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007801PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007802"S.upper() -> unicode\n\
7803\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007804Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007805
7806static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007807unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007808{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007809 return fixup(self, fixupper);
7810}
7811
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007812PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007813"S.zfill(width) -> unicode\n\
7814\n\
7815Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007816of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007817
7818static PyObject *
7819unicode_zfill(PyUnicodeObject *self, PyObject *args)
7820{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007821 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007822 PyUnicodeObject *u;
7823
Martin v. Löwis18e16552006-02-15 17:27:45 +00007824 Py_ssize_t width;
7825 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007826 return NULL;
7827
7828 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007829 if (PyUnicode_CheckExact(self)) {
7830 Py_INCREF(self);
7831 return (PyObject*) self;
7832 }
7833 else
7834 return PyUnicode_FromUnicode(
7835 PyUnicode_AS_UNICODE(self),
7836 PyUnicode_GET_SIZE(self)
7837 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007838 }
7839
7840 fill = width - self->length;
7841
7842 u = pad(self, fill, 0, '0');
7843
Walter Dörwald068325e2002-04-15 13:36:47 +00007844 if (u == NULL)
7845 return NULL;
7846
Guido van Rossumd57fd912000-03-10 22:53:23 +00007847 if (u->str[fill] == '+' || u->str[fill] == '-') {
7848 /* move sign to beginning of string */
7849 u->str[0] = u->str[fill];
7850 u->str[fill] = '0';
7851 }
7852
7853 return (PyObject*) u;
7854}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007855
#if 0
/* Debugging helper, compiled out: reports the current value of
   unicode_freelist_size as a Python integer. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    return PyInt_FromLong(unicode_freelist_size);
}
#endif
7863
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007864PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007865"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007866\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007867Return True if S starts with the specified prefix, False otherwise.\n\
7868With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007869With optional end, stop comparing S at that position.\n\
7870prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007871
7872static PyObject *
7873unicode_startswith(PyUnicodeObject *self,
7874 PyObject *args)
7875{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007876 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007877 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007878 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007879 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007880 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007881
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007882 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007883 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007884 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007885 if (PyTuple_Check(subobj)) {
7886 Py_ssize_t i;
7887 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7888 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7889 PyTuple_GET_ITEM(subobj, i));
7890 if (substring == NULL)
7891 return NULL;
7892 result = tailmatch(self, substring, start, end, -1);
7893 Py_DECREF(substring);
7894 if (result) {
7895 Py_RETURN_TRUE;
7896 }
7897 }
7898 /* nothing matched */
7899 Py_RETURN_FALSE;
7900 }
7901 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007902 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007903 return NULL;
7904 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007905 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007906 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007907}
7908
7909
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007910PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007911"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007912\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007913Return True if S ends with the specified suffix, False otherwise.\n\
7914With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007915With optional end, stop comparing S at that position.\n\
7916suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007917
7918static PyObject *
7919unicode_endswith(PyUnicodeObject *self,
7920 PyObject *args)
7921{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007922 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007923 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007924 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007925 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007926 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007927
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007928 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7929 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007930 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007931 if (PyTuple_Check(subobj)) {
7932 Py_ssize_t i;
7933 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7934 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7935 PyTuple_GET_ITEM(subobj, i));
7936 if (substring == NULL)
7937 return NULL;
7938 result = tailmatch(self, substring, start, end, +1);
7939 Py_DECREF(substring);
7940 if (result) {
7941 Py_RETURN_TRUE;
7942 }
7943 }
7944 Py_RETURN_FALSE;
7945 }
7946 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007947 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007948 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007949
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007950 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007951 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007952 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007953}
7954
Eric Smith8c663262007-08-25 02:26:07 +00007955#include "stringlib/string_format.h"
7956
7957PyDoc_STRVAR(format__doc__,
7958"S.format(*args, **kwargs) -> unicode\n\
7959\n\
7960");
7961
7962static PyObject *
7963unicode_format(PyObject *self, PyObject *args, PyObject *kwds)
7964{
7965 /* this calls into stringlib/string_format.h because it can be
7966 included for either string or unicode. this is needed for
7967 python 2.6. */
7968 return do_string_format(self, args, kwds);
7969}
7970
7971
7972PyDoc_STRVAR(p_format__doc__,
7973"S.__format__(format_spec) -> unicode\n\
7974\n\
7975");
7976
7977static PyObject *
7978unicode__format__(PyObject *self, PyObject *args)
7979{
7980 return unicode_unicode__format__(self, args);
7981}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007982
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007983
7984static PyObject *
7985unicode_getnewargs(PyUnicodeObject *v)
7986{
7987 return Py_BuildValue("(u#)", v->str, v->length);
7988}
7989
7990
Guido van Rossumd57fd912000-03-10 22:53:23 +00007991static PyMethodDef unicode_methods[] = {
7992
7993 /* Order is according to common usage: often used methods should
7994 appear first, since lookup is done sequentially. */
7995
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007996 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
7997 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7998 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007999 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008000 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8001 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8002 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8003 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8004 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8005 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8006 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008007 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008008 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8009 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8010 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008011 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00008012 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008013/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
8014 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8015 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8016 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008017 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008018 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008019 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008020 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008021 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8022 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8023 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8024 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8025 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8026 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8027 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8028 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8029 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8030 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8031 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8032 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8033 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8034 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008035 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008036 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith8c663262007-08-25 02:26:07 +00008037 {"format", (PyCFunction) unicode_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
8038 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008039#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008040 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008041#endif
8042
8043#if 0
8044 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008045 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008046#endif
8047
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008048 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008049 {NULL, NULL}
8050};
8051
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008052static PyObject *
8053unicode_mod(PyObject *v, PyObject *w)
8054{
8055 if (!PyUnicode_Check(v)) {
8056 Py_INCREF(Py_NotImplemented);
8057 return Py_NotImplemented;
8058 }
8059 return PyUnicode_Format(v, w);
8060}
8061
8062static PyNumberMethods unicode_as_number = {
8063 0, /*nb_add*/
8064 0, /*nb_subtract*/
8065 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008066 unicode_mod, /*nb_remainder*/
8067};
8068
Guido van Rossumd57fd912000-03-10 22:53:23 +00008069static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008070 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008071 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008072 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8073 (ssizeargfunc) unicode_getitem, /* sq_item */
8074 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008075 0, /* sq_ass_item */
8076 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008077 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008078};
8079
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008080static PyObject*
8081unicode_subscript(PyUnicodeObject* self, PyObject* item)
8082{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008083 if (PyIndex_Check(item)) {
8084 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008085 if (i == -1 && PyErr_Occurred())
8086 return NULL;
8087 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008088 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008089 return unicode_getitem(self, i);
8090 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008091 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008092 Py_UNICODE* source_buf;
8093 Py_UNICODE* result_buf;
8094 PyObject* result;
8095
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008096 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008097 &start, &stop, &step, &slicelength) < 0) {
8098 return NULL;
8099 }
8100
8101 if (slicelength <= 0) {
8102 return PyUnicode_FromUnicode(NULL, 0);
8103 } else {
8104 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008105 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
8106 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008107
8108 if (result_buf == NULL)
8109 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008110
8111 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8112 result_buf[i] = source_buf[cur];
8113 }
Tim Petersced69f82003-09-16 20:30:58 +00008114
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008115 result = PyUnicode_FromUnicode(result_buf, slicelength);
8116 PyMem_FREE(result_buf);
8117 return result;
8118 }
8119 } else {
8120 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8121 return NULL;
8122 }
8123}
8124
8125static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008126 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008127 (binaryfunc)unicode_subscript, /* mp_subscript */
8128 (objobjargproc)0, /* mp_ass_subscript */
8129};
8130
Guido van Rossumd57fd912000-03-10 22:53:23 +00008131
8132static int
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00008133unicode_buffer_getbuffer(PyUnicodeObject *self, PyBuffer *view, int flags)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008134{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008135
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00008136 if (flags & PyBUF_CHARACTER) {
8137 PyObject *str;
8138
8139 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
8140 if (str == NULL) return -1;
8141 return PyBuffer_FillInfo(view, (void *)PyString_AS_STRING(str),
8142 PyString_GET_SIZE(str), 1, flags);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008143 }
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00008144 else {
8145 return PyBuffer_FillInfo(view, (void *)self->str,
8146 PyUnicode_GET_DATA_SIZE(self), 1, flags);
8147 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008148}
8149
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00008150
Guido van Rossumd57fd912000-03-10 22:53:23 +00008151/* Helpers for PyUnicode_Format() */
8152
8153static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008154getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008155{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008156 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008157 if (argidx < arglen) {
8158 (*p_argidx)++;
8159 if (arglen < 0)
8160 return args;
8161 else
8162 return PyTuple_GetItem(args, argidx);
8163 }
8164 PyErr_SetString(PyExc_TypeError,
8165 "not enough arguments for format string");
8166 return NULL;
8167}
8168
/* Bit flags describing the modifiers of a single format specifier.
   F_ALT is the one whose use is visible nearby: formatfloat() and
   formatint() emit '#' in the C format when it is set.  The other names
   presumably correspond to the '-', '+', ' ' and '0' modifiers -- confirm
   against the parser in PyUnicode_Format(). */
#define F_LJUST (1<<0)
#define F_SIGN  (1<<1)
#define F_BLANK (1<<2)
#define F_ALT   (1<<3)
#define F_ZERO  (1<<4)
8174
Martin v. Löwis18e16552006-02-15 17:27:45 +00008175static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008176strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008177{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008178 register Py_ssize_t i;
8179 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008180 for (i = len - 1; i >= 0; i--)
8181 buffer[i] = (Py_UNICODE) charbuffer[i];
8182
Guido van Rossumd57fd912000-03-10 22:53:23 +00008183 return len;
8184}
8185
Neal Norwitzfc76d632006-01-10 06:03:13 +00008186static int
8187doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8188{
Tim Peters15231542006-02-16 01:08:01 +00008189 Py_ssize_t result;
8190
Neal Norwitzfc76d632006-01-10 06:03:13 +00008191 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008192 result = strtounicode(buffer, (char *)buffer);
8193 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008194}
8195
8196static int
8197longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8198{
Tim Peters15231542006-02-16 01:08:01 +00008199 Py_ssize_t result;
8200
Neal Norwitzfc76d632006-01-10 06:03:13 +00008201 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008202 result = strtounicode(buffer, (char *)buffer);
8203 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008204}
8205
Guido van Rossum078151d2002-08-11 04:24:12 +00008206/* XXX To save some code duplication, formatfloat/long/int could have been
8207 shared with stringobject.c, converting from 8-bit to Unicode after the
8208 formatting is done. */
8209
Guido van Rossumd57fd912000-03-10 22:53:23 +00008210static int
8211formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008212 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008213 int flags,
8214 int prec,
8215 int type,
8216 PyObject *v)
8217{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008218 /* fmt = '%#.' + `prec` + `type`
8219 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00008220 char fmt[20];
8221 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008222
Guido van Rossumd57fd912000-03-10 22:53:23 +00008223 x = PyFloat_AsDouble(v);
8224 if (x == -1.0 && PyErr_Occurred())
8225 return -1;
8226 if (prec < 0)
8227 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008228 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8229 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008230 /* Worst case length calc to ensure no buffer overrun:
8231
8232 'g' formats:
8233 fmt = %#.<prec>g
8234 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8235 for any double rep.)
8236 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8237
8238 'f' formats:
8239 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8240 len = 1 + 50 + 1 + prec = 52 + prec
8241
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008242 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008243 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008244
8245 */
Guido van Rossumb5a755e2007-07-18 18:15:48 +00008246 if (((type == 'g' || type == 'G') &&
8247 buflen <= (size_t)10 + (size_t)prec) ||
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008248 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008249 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008250 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008251 return -1;
8252 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008253 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8254 (flags&F_ALT) ? "#" : "",
8255 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008256 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008257}
8258
Tim Peters38fd5b62000-09-21 05:43:11 +00008259static PyObject*
8260formatlong(PyObject *val, int flags, int prec, int type)
8261{
8262 char *buf;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008263 int len;
Tim Peters38fd5b62000-09-21 05:43:11 +00008264 PyObject *str; /* temporary string object. */
Walter Dörwald63a28be2007-06-20 15:11:12 +00008265 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008266
8267 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8268 if (!str)
8269 return NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008270 result = PyUnicode_FromStringAndSize(buf, len);
Tim Peters38fd5b62000-09-21 05:43:11 +00008271 Py_DECREF(str);
Walter Dörwald63a28be2007-06-20 15:11:12 +00008272 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008273}
8274
Guido van Rossumd57fd912000-03-10 22:53:23 +00008275static int
8276formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008277 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008278 int flags,
8279 int prec,
8280 int type,
8281 PyObject *v)
8282{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008283 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008284 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8285 * + 1 + 1
8286 * = 24
8287 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008288 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008289 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008290 long x;
8291
8292 x = PyInt_AsLong(v);
8293 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008294 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008295 if (x < 0 && type == 'u') {
8296 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008297 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008298 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8299 sign = "-";
8300 else
8301 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008302 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008303 prec = 1;
8304
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008305 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8306 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008307 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008308 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008309 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008310 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008311 return -1;
8312 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008313
8314 if ((flags & F_ALT) &&
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008315 (type == 'x' || type == 'X' || type == 'o')) {
8316 /* When converting under %#o, %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008317 * of issues that cause pain:
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008318 * - for %#o, we want a different base marker than C
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008319 * - when 0 is being converted, the C standard leaves off
8320 * the '0x' or '0X', which is inconsistent with other
8321 * %#x/%#X conversions and inconsistent with Python's
8322 * hex() function
8323 * - there are platforms that violate the standard and
8324 * convert 0 with the '0x' or '0X'
8325 * (Metrowerks, Compaq Tru64)
8326 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008327 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008328 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008329 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008330 * We can achieve the desired consistency by inserting our
8331 * own '0x' or '0X' prefix, and substituting %x/%X in place
8332 * of %#x/%#X.
8333 *
8334 * Note that this is the same approach as used in
8335 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008336 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008337 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8338 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008339 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008340 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008341 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8342 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008343 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008344 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008345 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008346 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008347 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008348 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008349}
8350
8351static int
8352formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008353 size_t buflen,
8354 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008355{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008356 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008357 if (PyUnicode_Check(v)) {
8358 if (PyUnicode_GET_SIZE(v) != 1)
8359 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008360 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008361 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008362
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008363 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00008364 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008365 goto onError;
8366 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
8367 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008368
8369 else {
8370 /* Integer input truncated to a character */
8371 long x;
8372 x = PyInt_AsLong(v);
8373 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008374 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008375#ifdef Py_UNICODE_WIDE
8376 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008377 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008378 "%c arg not in range(0x110000) "
8379 "(wide Python build)");
8380 return -1;
8381 }
8382#else
8383 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008384 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008385 "%c arg not in range(0x10000) "
8386 "(narrow Python build)");
8387 return -1;
8388 }
8389#endif
8390 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008391 }
8392 buf[1] = '\0';
8393 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008394
8395 onError:
8396 PyErr_SetString(PyExc_TypeError,
8397 "%c requires int or char");
8398 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008399}
8400
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the size of the scratch buffer into which floats, ints
   and chars are formatted.  XXX The value is a magic number.  Every
   formatting routine performs its own bounds check so the buffer is not
   overrun; allocating a buffer of the appropriate size for each format
   might be a better solution.  For now, the current solution is
   sufficient.
*/
#define FORMATBUFLEN (size_t)120
8410
Guido van Rossumd57fd912000-03-10 22:53:23 +00008411PyObject *PyUnicode_Format(PyObject *format,
8412 PyObject *args)
8413{
8414 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008415 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008416 int args_owned = 0;
8417 PyUnicodeObject *result = NULL;
8418 PyObject *dict = NULL;
8419 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008420
Guido van Rossumd57fd912000-03-10 22:53:23 +00008421 if (format == NULL || args == NULL) {
8422 PyErr_BadInternalCall();
8423 return NULL;
8424 }
8425 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008426 if (uformat == NULL)
8427 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008428 fmt = PyUnicode_AS_UNICODE(uformat);
8429 fmtcnt = PyUnicode_GET_SIZE(uformat);
8430
8431 reslen = rescnt = fmtcnt + 100;
8432 result = _PyUnicode_New(reslen);
8433 if (result == NULL)
8434 goto onError;
8435 res = PyUnicode_AS_UNICODE(result);
8436
8437 if (PyTuple_Check(args)) {
8438 arglen = PyTuple_Size(args);
8439 argidx = 0;
8440 }
8441 else {
8442 arglen = -1;
8443 argidx = -2;
8444 }
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00008445 if (Py_Type(args)->tp_as_mapping && !PyTuple_Check(args) &&
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008446 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008447 dict = args;
8448
8449 while (--fmtcnt >= 0) {
8450 if (*fmt != '%') {
8451 if (--rescnt < 0) {
8452 rescnt = fmtcnt + 100;
8453 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008454 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008455 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008456 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8457 --rescnt;
8458 }
8459 *res++ = *fmt++;
8460 }
8461 else {
8462 /* Got a format specifier */
8463 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008464 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008465 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008466 Py_UNICODE c = '\0';
8467 Py_UNICODE fill;
8468 PyObject *v = NULL;
8469 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008470 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008471 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008472 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008473 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008474
8475 fmt++;
8476 if (*fmt == '(') {
8477 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008478 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008479 PyObject *key;
8480 int pcount = 1;
8481
8482 if (dict == NULL) {
8483 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008484 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008485 goto onError;
8486 }
8487 ++fmt;
8488 --fmtcnt;
8489 keystart = fmt;
8490 /* Skip over balanced parentheses */
8491 while (pcount > 0 && --fmtcnt >= 0) {
8492 if (*fmt == ')')
8493 --pcount;
8494 else if (*fmt == '(')
8495 ++pcount;
8496 fmt++;
8497 }
8498 keylen = fmt - keystart - 1;
8499 if (fmtcnt < 0 || pcount > 0) {
8500 PyErr_SetString(PyExc_ValueError,
8501 "incomplete format key");
8502 goto onError;
8503 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008504#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008505 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008506 then looked up since Python uses strings to hold
8507 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008508 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008509 key = PyUnicode_EncodeUTF8(keystart,
8510 keylen,
8511 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008512#else
8513 key = PyUnicode_FromUnicode(keystart, keylen);
8514#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008515 if (key == NULL)
8516 goto onError;
8517 if (args_owned) {
8518 Py_DECREF(args);
8519 args_owned = 0;
8520 }
8521 args = PyObject_GetItem(dict, key);
8522 Py_DECREF(key);
8523 if (args == NULL) {
8524 goto onError;
8525 }
8526 args_owned = 1;
8527 arglen = -1;
8528 argidx = -2;
8529 }
8530 while (--fmtcnt >= 0) {
8531 switch (c = *fmt++) {
8532 case '-': flags |= F_LJUST; continue;
8533 case '+': flags |= F_SIGN; continue;
8534 case ' ': flags |= F_BLANK; continue;
8535 case '#': flags |= F_ALT; continue;
8536 case '0': flags |= F_ZERO; continue;
8537 }
8538 break;
8539 }
8540 if (c == '*') {
8541 v = getnextarg(args, arglen, &argidx);
8542 if (v == NULL)
8543 goto onError;
8544 if (!PyInt_Check(v)) {
8545 PyErr_SetString(PyExc_TypeError,
8546 "* wants int");
8547 goto onError;
8548 }
8549 width = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008550 if (width == -1 && PyErr_Occurred())
8551 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008552 if (width < 0) {
8553 flags |= F_LJUST;
8554 width = -width;
8555 }
8556 if (--fmtcnt >= 0)
8557 c = *fmt++;
8558 }
8559 else if (c >= '0' && c <= '9') {
8560 width = c - '0';
8561 while (--fmtcnt >= 0) {
8562 c = *fmt++;
8563 if (c < '0' || c > '9')
8564 break;
8565 if ((width*10) / 10 != width) {
8566 PyErr_SetString(PyExc_ValueError,
8567 "width too big");
8568 goto onError;
8569 }
8570 width = width*10 + (c - '0');
8571 }
8572 }
8573 if (c == '.') {
8574 prec = 0;
8575 if (--fmtcnt >= 0)
8576 c = *fmt++;
8577 if (c == '*') {
8578 v = getnextarg(args, arglen, &argidx);
8579 if (v == NULL)
8580 goto onError;
8581 if (!PyInt_Check(v)) {
8582 PyErr_SetString(PyExc_TypeError,
8583 "* wants int");
8584 goto onError;
8585 }
8586 prec = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008587 if (prec == -1 && PyErr_Occurred())
8588 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008589 if (prec < 0)
8590 prec = 0;
8591 if (--fmtcnt >= 0)
8592 c = *fmt++;
8593 }
8594 else if (c >= '0' && c <= '9') {
8595 prec = c - '0';
8596 while (--fmtcnt >= 0) {
8597 c = Py_CHARMASK(*fmt++);
8598 if (c < '0' || c > '9')
8599 break;
8600 if ((prec*10) / 10 != prec) {
8601 PyErr_SetString(PyExc_ValueError,
8602 "prec too big");
8603 goto onError;
8604 }
8605 prec = prec*10 + (c - '0');
8606 }
8607 }
8608 } /* prec */
8609 if (fmtcnt >= 0) {
8610 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008611 if (--fmtcnt >= 0)
8612 c = *fmt++;
8613 }
8614 }
8615 if (fmtcnt < 0) {
8616 PyErr_SetString(PyExc_ValueError,
8617 "incomplete format");
8618 goto onError;
8619 }
8620 if (c != '%') {
8621 v = getnextarg(args, arglen, &argidx);
8622 if (v == NULL)
8623 goto onError;
8624 }
8625 sign = 0;
8626 fill = ' ';
8627 switch (c) {
8628
8629 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008630 pbuf = formatbuf;
8631 /* presume that buffer length is at least 1 */
8632 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008633 len = 1;
8634 break;
8635
8636 case 's':
8637 case 'r':
8638 if (PyUnicode_Check(v) && c == 's') {
8639 temp = v;
8640 Py_INCREF(temp);
8641 }
8642 else {
8643 PyObject *unicode;
8644 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008645 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008646 else
8647 temp = PyObject_Repr(v);
8648 if (temp == NULL)
8649 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008650 if (PyUnicode_Check(temp))
8651 /* nothing to do */;
8652 else if (PyString_Check(temp)) {
8653 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00008654 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00008655 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00008656 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008657 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008658 Py_DECREF(temp);
8659 temp = unicode;
8660 if (temp == NULL)
8661 goto onError;
8662 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008663 else {
8664 Py_DECREF(temp);
8665 PyErr_SetString(PyExc_TypeError,
8666 "%s argument has non-string str()");
8667 goto onError;
8668 }
8669 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008670 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008671 len = PyUnicode_GET_SIZE(temp);
8672 if (prec >= 0 && len > prec)
8673 len = prec;
8674 break;
8675
8676 case 'i':
8677 case 'd':
8678 case 'u':
8679 case 'o':
8680 case 'x':
8681 case 'X':
8682 if (c == 'i')
8683 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00008684 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008685 temp = formatlong(v, flags, prec, c);
8686 if (!temp)
8687 goto onError;
8688 pbuf = PyUnicode_AS_UNICODE(temp);
8689 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00008690 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008691 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008692 else {
8693 pbuf = formatbuf;
8694 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8695 flags, prec, c, v);
8696 if (len < 0)
8697 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008698 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008699 }
8700 if (flags & F_ZERO)
8701 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008702 break;
8703
8704 case 'e':
8705 case 'E':
8706 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008707 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008708 case 'g':
8709 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008710 if (c == 'F')
8711 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008712 pbuf = formatbuf;
8713 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8714 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008715 if (len < 0)
8716 goto onError;
8717 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008718 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008719 fill = '0';
8720 break;
8721
8722 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008723 pbuf = formatbuf;
8724 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008725 if (len < 0)
8726 goto onError;
8727 break;
8728
8729 default:
8730 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008731 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00008732 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008733 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008734 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00008735 (Py_ssize_t)(fmt - 1 -
8736 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008737 goto onError;
8738 }
8739 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008740 if (*pbuf == '-' || *pbuf == '+') {
8741 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008742 len--;
8743 }
8744 else if (flags & F_SIGN)
8745 sign = '+';
8746 else if (flags & F_BLANK)
8747 sign = ' ';
8748 else
8749 sign = 0;
8750 }
8751 if (width < len)
8752 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008753 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008754 reslen -= rescnt;
8755 rescnt = width + fmtcnt + 100;
8756 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008757 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008758 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008759 PyErr_NoMemory();
8760 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008761 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008762 if (_PyUnicode_Resize(&result, reslen) < 0) {
8763 Py_XDECREF(temp);
8764 goto onError;
8765 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008766 res = PyUnicode_AS_UNICODE(result)
8767 + reslen - rescnt;
8768 }
8769 if (sign) {
8770 if (fill != ' ')
8771 *res++ = sign;
8772 rescnt--;
8773 if (width > len)
8774 width--;
8775 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008776 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008777 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008778 assert(pbuf[1] == c);
8779 if (fill != ' ') {
8780 *res++ = *pbuf++;
8781 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008782 }
Tim Petersfff53252001-04-12 18:38:48 +00008783 rescnt -= 2;
8784 width -= 2;
8785 if (width < 0)
8786 width = 0;
8787 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008788 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008789 if (width > len && !(flags & F_LJUST)) {
8790 do {
8791 --rescnt;
8792 *res++ = fill;
8793 } while (--width > len);
8794 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008795 if (fill == ' ') {
8796 if (sign)
8797 *res++ = sign;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008798 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008799 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008800 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008801 *res++ = *pbuf++;
8802 *res++ = *pbuf++;
8803 }
8804 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008805 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008806 res += len;
8807 rescnt -= len;
8808 while (--width >= len) {
8809 --rescnt;
8810 *res++ = ' ';
8811 }
8812 if (dict && (argidx < arglen) && c != '%') {
8813 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008814 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008815 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008816 goto onError;
8817 }
8818 Py_XDECREF(temp);
8819 } /* '%' */
8820 } /* until end */
8821 if (argidx < arglen && !dict) {
8822 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008823 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008824 goto onError;
8825 }
8826
Thomas Woutersa96affe2006-03-12 00:29:36 +00008827 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8828 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008829 if (args_owned) {
8830 Py_DECREF(args);
8831 }
8832 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008833 return (PyObject *)result;
8834
8835 onError:
8836 Py_XDECREF(result);
8837 Py_DECREF(uformat);
8838 if (args_owned) {
8839 Py_DECREF(args);
8840 }
8841 return NULL;
8842}
8843
8844static PyBufferProcs unicode_as_buffer = {
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00008845 (getbufferproc) unicode_buffer_getbuffer,
8846 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008847};
8848
Jeremy Hylton938ace62002-07-17 16:30:39 +00008849static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008850unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8851
Tim Peters6d6c1a32001-08-02 04:15:00 +00008852static PyObject *
8853unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8854{
8855 PyObject *x = NULL;
Guido van Rossum55b4a7b2007-07-11 09:28:11 +00008856 static char *kwlist[] = {"object", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008857 char *encoding = NULL;
8858 char *errors = NULL;
8859
Guido van Rossume023fe02001-08-30 03:12:59 +00008860 if (type != &PyUnicode_Type)
8861 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008862 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8863 kwlist, &x, &encoding, &errors))
8864 return NULL;
8865 if (x == NULL)
8866 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008867 if (encoding == NULL && errors == NULL)
8868 return PyObject_Unicode(x);
8869 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008870 return PyUnicode_FromEncodedObject(x, encoding, errors);
8871}
8872
Guido van Rossume023fe02001-08-30 03:12:59 +00008873static PyObject *
8874unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8875{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008876 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008877 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008878
8879 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8880 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8881 if (tmp == NULL)
8882 return NULL;
8883 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008884 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008885 if (pnew == NULL) {
8886 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008887 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008888 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008889 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
8890 if (pnew->str == NULL) {
8891 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008892 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008893 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008894 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008895 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008896 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8897 pnew->length = n;
8898 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008899 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008900 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008901}
8902
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008903PyDoc_STRVAR(unicode_doc,
Collin Winterd474ce82007-08-07 19:42:11 +00008904"str(string [, encoding[, errors]]) -> object\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00008905\n\
Collin Winterd474ce82007-08-07 19:42:11 +00008906Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008907encoding defaults to the current default string encoding.\n\
8908errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008909
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008910static PyObject *unicode_iter(PyObject *seq);
8911
Guido van Rossumd57fd912000-03-10 22:53:23 +00008912PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00008913 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossum84fc66d2007-05-03 17:18:26 +00008914 "str", /* tp_name */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008915 sizeof(PyUnicodeObject), /* tp_size */
8916 0, /* tp_itemsize */
8917 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00008918 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008919 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008920 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008921 0, /* tp_setattr */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008922 0, /* tp_compare */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008923 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008924 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008925 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008926 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008927 (hashfunc) unicode_hash, /* tp_hash*/
8928 0, /* tp_call*/
8929 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008930 PyObject_GenericGetAttr, /* tp_getattro */
8931 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008932 &unicode_as_buffer, /* tp_as_buffer */
Thomas Wouters27d517b2007-02-25 20:39:11 +00008933 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
8934 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008935 unicode_doc, /* tp_doc */
8936 0, /* tp_traverse */
8937 0, /* tp_clear */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008938 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008939 0, /* tp_weaklistoffset */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008940 unicode_iter, /* tp_iter */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008941 0, /* tp_iternext */
8942 unicode_methods, /* tp_methods */
8943 0, /* tp_members */
8944 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00008945 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008946 0, /* tp_dict */
8947 0, /* tp_descr_get */
8948 0, /* tp_descr_set */
8949 0, /* tp_dictoffset */
8950 0, /* tp_init */
8951 0, /* tp_alloc */
8952 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008953 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008954};
8955
8956/* Initialize the Unicode implementation */
8957
Thomas Wouters78890102000-07-22 19:25:51 +00008958void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008959{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008960 int i;
8961
Thomas Wouters477c8d52006-05-27 19:21:47 +00008962 /* XXX - move this array to unicodectype.c ? */
8963 Py_UNICODE linebreak[] = {
8964 0x000A, /* LINE FEED */
8965 0x000D, /* CARRIAGE RETURN */
8966 0x001C, /* FILE SEPARATOR */
8967 0x001D, /* GROUP SEPARATOR */
8968 0x001E, /* RECORD SEPARATOR */
8969 0x0085, /* NEXT LINE */
8970 0x2028, /* LINE SEPARATOR */
8971 0x2029, /* PARAGRAPH SEPARATOR */
8972 };
8973
Fred Drakee4315f52000-05-09 19:53:39 +00008974 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008975 unicode_freelist = NULL;
8976 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008977 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008978 if (!unicode_empty)
8979 return;
8980
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008981 for (i = 0; i < 256; i++)
8982 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008983 if (PyType_Ready(&PyUnicode_Type) < 0)
8984 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008985
8986 /* initialize the linebreak bloom filter */
8987 bloom_linebreak = make_bloom_mask(
8988 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8989 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008990
8991 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008992}
8993
8994/* Finalize the Unicode implementation */
8995
8996void
Thomas Wouters78890102000-07-22 19:25:51 +00008997_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008998{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008999 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009000 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009001
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009002 Py_XDECREF(unicode_empty);
9003 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009004
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009005 for (i = 0; i < 256; i++) {
9006 if (unicode_latin1[i]) {
9007 Py_DECREF(unicode_latin1[i]);
9008 unicode_latin1[i] = NULL;
9009 }
9010 }
9011
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009012 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009013 PyUnicodeObject *v = u;
9014 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00009015 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00009016 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00009017 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00009018 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009019 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009020 unicode_freelist = NULL;
9021 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009022}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009023
Walter Dörwald16807132007-05-25 13:52:07 +00009024void
9025PyUnicode_InternInPlace(PyObject **p)
9026{
9027 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9028 PyObject *t;
9029 if (s == NULL || !PyUnicode_Check(s))
9030 Py_FatalError(
9031 "PyUnicode_InternInPlace: unicode strings only please!");
9032 /* If it's a subclass, we don't really know what putting
9033 it in the interned dict might do. */
9034 if (!PyUnicode_CheckExact(s))
9035 return;
9036 if (PyUnicode_CHECK_INTERNED(s))
9037 return;
9038 if (interned == NULL) {
9039 interned = PyDict_New();
9040 if (interned == NULL) {
9041 PyErr_Clear(); /* Don't leave an exception */
9042 return;
9043 }
9044 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009045 /* It might be that the GetItem call fails even
9046 though the key is present in the dictionary,
9047 namely when this happens during a stack overflow. */
9048 Py_ALLOW_RECURSION
Walter Dörwald16807132007-05-25 13:52:07 +00009049 t = PyDict_GetItem(interned, (PyObject *)s);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009050 Py_END_ALLOW_RECURSION
9051
Walter Dörwald16807132007-05-25 13:52:07 +00009052 if (t) {
9053 Py_INCREF(t);
9054 Py_DECREF(*p);
9055 *p = t;
9056 return;
9057 }
9058
Martin v. Löwis5b222132007-06-10 09:51:05 +00009059 PyThreadState_GET()->recursion_critical = 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009060 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9061 PyErr_Clear();
Martin v. Löwis5b222132007-06-10 09:51:05 +00009062 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009063 return;
9064 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009065 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009066 /* The two references in interned are not counted by refcnt.
9067 The deallocator will take care of this */
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00009068 Py_Refcnt(s) -= 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009069 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
9070}
9071
9072void
9073PyUnicode_InternImmortal(PyObject **p)
9074{
9075 PyUnicode_InternInPlace(p);
9076 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9077 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9078 Py_INCREF(*p);
9079 }
9080}
9081
9082PyObject *
9083PyUnicode_InternFromString(const char *cp)
9084{
9085 PyObject *s = PyUnicode_FromString(cp);
9086 if (s == NULL)
9087 return NULL;
9088 PyUnicode_InternInPlace(&s);
9089 return s;
9090}
9091
9092void _Py_ReleaseInternedUnicodeStrings(void)
9093{
9094 PyObject *keys;
9095 PyUnicodeObject *s;
9096 Py_ssize_t i, n;
9097 Py_ssize_t immortal_size = 0, mortal_size = 0;
9098
9099 if (interned == NULL || !PyDict_Check(interned))
9100 return;
9101 keys = PyDict_Keys(interned);
9102 if (keys == NULL || !PyList_Check(keys)) {
9103 PyErr_Clear();
9104 return;
9105 }
9106
9107 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9108 detector, interned unicode strings are not forcibly deallocated;
9109 rather, we give them their stolen references back, and then clear
9110 and DECREF the interned dict. */
9111
9112 n = PyList_GET_SIZE(keys);
9113 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
9114 n);
9115 for (i = 0; i < n; i++) {
9116 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9117 switch (s->state) {
9118 case SSTATE_NOT_INTERNED:
9119 /* XXX Shouldn't happen */
9120 break;
9121 case SSTATE_INTERNED_IMMORTAL:
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00009122 Py_Refcnt(s) += 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009123 immortal_size += s->length;
9124 break;
9125 case SSTATE_INTERNED_MORTAL:
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00009126 Py_Refcnt(s) += 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009127 mortal_size += s->length;
9128 break;
9129 default:
9130 Py_FatalError("Inconsistent interned string state.");
9131 }
9132 s->state = SSTATE_NOT_INTERNED;
9133 }
9134 fprintf(stderr, "total size of all interned strings: "
9135 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9136 "mortal/immortal\n", mortal_size, immortal_size);
9137 Py_DECREF(keys);
9138 PyDict_Clear(interned);
9139 Py_DECREF(interned);
9140 interned = NULL;
9141}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009142
9143
Eric Smith8c663262007-08-25 02:26:07 +00009144/********************* Formatter Iterator ************************/
9145
9146/* this is used to implement string.Formatter.vparse(). it exists so
9147 Formatter can share code with the built in unicode.format()
9148 method */
9149
9150typedef struct {
9151 PyObject_HEAD
9152
9153 /* we know this to be a unicode object, but since we just keep
9154 it around to keep the object alive, having it as PyObject
9155 is okay */
9156 PyObject *str;
9157
9158 MarkupIterator it_markup;
9159} formatteriterobject;
9160
9161static void
9162formatteriter_dealloc(formatteriterobject *it)
9163{
9164 _PyObject_GC_UNTRACK(it);
9165 Py_XDECREF(it->str);
9166 PyObject_GC_Del(it);
9167}
9168
9169/* returns a tuple:
9170 (is_markup, literal, field_name, format_spec, conversion)
9171 if is_markup == True:
9172 literal is None
9173 field_name is the string before the ':'
9174 format_spec is the string after the ':'
9175 conversion is either None, or the string after the '!'
9176 if is_markup == False:
9177 literal is the literal string
9178 field_name is None
9179 format_spec is None
9180 conversion is None
9181*/
9182static PyObject *
9183formatteriter_next(formatteriterobject *it)
9184{
9185 SubString literal;
9186 SubString field_name;
9187 SubString format_spec;
9188 Py_UNICODE conversion;
9189 int is_markup;
9190 int format_spec_needs_expanding;
9191 int result = MarkupIterator_next(&it->it_markup, &is_markup, &literal,
9192 &field_name, &format_spec, &conversion,
9193 &format_spec_needs_expanding);
9194
9195 /* all of the SubString objects point into it->str, so no
9196 memory management needs to be done on them */
9197
9198 if (result == 0) {
9199 /* error has already been set */
9200 return NULL;
9201 } else if (result == 1) {
9202 /* end of iterator */
9203 return NULL;
9204 } else {
9205 PyObject *is_markup_bool = NULL;
9206 PyObject *literal_str = NULL;
9207 PyObject *field_name_str = NULL;
9208 PyObject *format_spec_str = NULL;
9209 PyObject *conversion_str = NULL;
9210 PyObject *result = NULL;
9211
9212 assert(result == 2);
9213
9214 is_markup_bool = PyBool_FromLong(is_markup);
9215 if (!is_markup_bool)
9216 goto error;
9217
9218 if (is_markup) {
9219 /* field_name, format_spec, and conversion are
9220 returned */
9221 literal_str = Py_None;
9222 Py_INCREF(literal_str);
9223
9224 field_name_str = SubString_new_object(&field_name);
9225 if (field_name_str == NULL)
9226 goto error;
9227
9228 format_spec_str = SubString_new_object(&format_spec);
9229 if (format_spec_str == NULL)
9230 goto error;
9231
9232 /* if the conversion is not specified, return
9233 a None, otherwise create a one length
9234 string with the conversion characater */
9235 if (conversion == '\0') {
9236 conversion_str = Py_None;
9237 Py_INCREF(conversion_str);
9238 } else
9239 conversion_str = PyUnicode_FromUnicode(&conversion,
9240 1);
9241 if (conversion_str == NULL)
9242 goto error;
9243 } else {
9244 /* only literal is returned */
9245 literal_str = SubString_new_object(&literal);
9246 if (literal_str == NULL)
9247 goto error;
9248
9249 field_name_str = Py_None;
9250 format_spec_str = Py_None;
9251 conversion_str = Py_None;
9252
9253 Py_INCREF(field_name_str);
9254 Py_INCREF(format_spec_str);
9255 Py_INCREF(conversion_str);
9256 }
9257 /* return a tuple of values */
9258 result = PyTuple_Pack(5, is_markup_bool, literal_str,
9259 field_name_str, format_spec_str,
9260 conversion_str);
9261 if (result == NULL)
9262 goto error;
9263
9264 return result;
9265 error:
9266 Py_XDECREF(is_markup_bool);
9267 Py_XDECREF(literal_str);
9268 Py_XDECREF(field_name_str);
9269 Py_XDECREF(format_spec_str);
9270 Py_XDECREF(conversion_str);
9271 Py_XDECREF(result);
9272 return NULL;
9273 }
9274}
9275
9276static PyMethodDef formatteriter_methods[] = {
9277 {NULL, NULL} /* sentinel */
9278};
9279
9280PyTypeObject PyFormatterIter_Type = {
9281 PyVarObject_HEAD_INIT(&PyType_Type, 0)
9282 "formatteriterator", /* tp_name */
9283 sizeof(formatteriterobject), /* tp_basicsize */
9284 0, /* tp_itemsize */
9285 /* methods */
9286 (destructor)formatteriter_dealloc, /* tp_dealloc */
9287 0, /* tp_print */
9288 0, /* tp_getattr */
9289 0, /* tp_setattr */
9290 0, /* tp_compare */
9291 0, /* tp_repr */
9292 0, /* tp_as_number */
9293 0, /* tp_as_sequence */
9294 0, /* tp_as_mapping */
9295 0, /* tp_hash */
9296 0, /* tp_call */
9297 0, /* tp_str */
9298 PyObject_GenericGetAttr, /* tp_getattro */
9299 0, /* tp_setattro */
9300 0, /* tp_as_buffer */
9301 Py_TPFLAGS_DEFAULT, /* tp_flags */
9302 0, /* tp_doc */
9303 0, /* tp_traverse */
9304 0, /* tp_clear */
9305 0, /* tp_richcompare */
9306 0, /* tp_weaklistoffset */
9307 PyObject_SelfIter, /* tp_iter */
9308 (iternextfunc)formatteriter_next, /* tp_iternext */
9309 formatteriter_methods, /* tp_methods */
9310 0,
9311};
9312
9313PyObject *
9314_unicodeformatter_iterator(PyObject *str)
9315{
9316 formatteriterobject *it;
9317
9318 it = PyObject_GC_New(formatteriterobject, &PyFormatterIter_Type);
9319 if (it == NULL)
9320 return NULL;
9321
9322 /* take ownership, give the object to the iterator */
9323 Py_INCREF(str);
9324 it->str = str;
9325
9326 /* initialize the contained MarkupIterator */
9327 MarkupIterator_init(&it->it_markup,
9328 PyUnicode_AS_UNICODE(str),
9329 PyUnicode_GET_SIZE(str));
9330
9331 _PyObject_GC_TRACK(it);
9332 return (PyObject *)it;
9333}
9334
9335PyObject *
9336_unicodeformatter_lookup(PyObject *field_name, PyObject *args,
9337 PyObject *kwargs)
9338{
9339 return NULL;
9340}
9341
9342
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009343/********************* Unicode Iterator **************************/
9344
9345typedef struct {
9346 PyObject_HEAD
Guido van Rossum49d6b072006-08-17 21:11:47 +00009347 Py_ssize_t it_index;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009348 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
9349} unicodeiterobject;
9350
9351static void
9352unicodeiter_dealloc(unicodeiterobject *it)
9353{
9354 _PyObject_GC_UNTRACK(it);
9355 Py_XDECREF(it->it_seq);
9356 PyObject_GC_Del(it);
9357}
9358
9359static int
9360unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9361{
9362 Py_VISIT(it->it_seq);
9363 return 0;
9364}
9365
9366static PyObject *
9367unicodeiter_next(unicodeiterobject *it)
9368{
9369 PyUnicodeObject *seq;
9370 PyObject *item;
9371
9372 assert(it != NULL);
9373 seq = it->it_seq;
9374 if (seq == NULL)
9375 return NULL;
9376 assert(PyUnicode_Check(seq));
9377
9378 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009379 item = PyUnicode_FromUnicode(
9380 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009381 if (item != NULL)
9382 ++it->it_index;
9383 return item;
9384 }
9385
9386 Py_DECREF(seq);
9387 it->it_seq = NULL;
9388 return NULL;
9389}
9390
9391static PyObject *
9392unicodeiter_len(unicodeiterobject *it)
9393{
9394 Py_ssize_t len = 0;
9395 if (it->it_seq)
9396 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9397 return PyInt_FromSsize_t(len);
9398}
9399
9400PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9401
9402static PyMethodDef unicodeiter_methods[] = {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009403 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
9404 length_hint_doc},
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009405 {NULL, NULL} /* sentinel */
9406};
9407
9408PyTypeObject PyUnicodeIter_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009409 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009410 "unicodeiterator", /* tp_name */
9411 sizeof(unicodeiterobject), /* tp_basicsize */
9412 0, /* tp_itemsize */
9413 /* methods */
Guido van Rossum49d6b072006-08-17 21:11:47 +00009414 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009415 0, /* tp_print */
9416 0, /* tp_getattr */
9417 0, /* tp_setattr */
9418 0, /* tp_compare */
9419 0, /* tp_repr */
9420 0, /* tp_as_number */
9421 0, /* tp_as_sequence */
9422 0, /* tp_as_mapping */
9423 0, /* tp_hash */
9424 0, /* tp_call */
9425 0, /* tp_str */
9426 PyObject_GenericGetAttr, /* tp_getattro */
9427 0, /* tp_setattro */
9428 0, /* tp_as_buffer */
9429 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9430 0, /* tp_doc */
9431 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9432 0, /* tp_clear */
9433 0, /* tp_richcompare */
9434 0, /* tp_weaklistoffset */
9435 PyObject_SelfIter, /* tp_iter */
9436 (iternextfunc)unicodeiter_next, /* tp_iternext */
9437 unicodeiter_methods, /* tp_methods */
9438 0,
9439};
9440
9441static PyObject *
9442unicode_iter(PyObject *seq)
9443{
9444 unicodeiterobject *it;
9445
9446 if (!PyUnicode_Check(seq)) {
9447 PyErr_BadInternalCall();
9448 return NULL;
9449 }
9450 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9451 if (it == NULL)
9452 return NULL;
9453 it->it_index = 0;
9454 Py_INCREF(seq);
9455 it->it_seq = (PyUnicodeObject *)seq;
9456 _PyObject_GC_TRACK(it);
9457 return (PyObject *)it;
9458}
9459
Martin v. Löwis5b222132007-06-10 09:51:05 +00009460size_t
9461Py_UNICODE_strlen(const Py_UNICODE *u)
9462{
9463 int res = 0;
9464 while(*u++)
9465 res++;
9466 return res;
9467}
9468
9469Py_UNICODE*
9470Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9471{
9472 Py_UNICODE *u = s1;
9473 while ((*u++ = *s2++));
9474 return s1;
9475}
9476
9477Py_UNICODE*
9478Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9479{
9480 Py_UNICODE *u = s1;
9481 while ((*u++ = *s2++))
9482 if (n-- == 0)
9483 break;
9484 return s1;
9485}
9486
9487int
9488Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9489{
9490 while (*s1 && *s2 && *s1 == *s2)
9491 s1++, s2++;
9492 if (*s1 && *s2)
9493 return (*s1 < *s2) ? -1 : +1;
9494 if (*s1)
9495 return 1;
9496 if (*s2)
9497 return -1;
9498 return 0;
9499}
9500
9501Py_UNICODE*
9502Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9503{
9504 const Py_UNICODE *p;
9505 for (p = s; *p; p++)
9506 if (*p == c)
9507 return (Py_UNICODE*)p;
9508 return NULL;
9509}
9510
9511
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009512#ifdef __cplusplus
9513}
9514#endif
9515
9516
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009517/*
9518Local variables:
9519c-basic-offset: 4
9520indent-tabs-mode: nil
9521End:
9522*/