blob: 24b2bf384eb959cd088f057651fe9db89e707e3d [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Eric Smith8c663262007-08-25 02:26:07 +000048#include "formatter_unicode.h"
49
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000050#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000051#include <windows.h>
52#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000053
/* Limit for the Unicode object free list: at most this many released
   PyUnicodeObject structures are kept around for reuse. */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a length (in Py_UNICODE units)
   less than this limit. This reduces malloc() overhead for small
   Unicode objects.

   At worst this will result in MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000076
/* Endianness switches; defaults to little endian.
   Exactly one of BYTEORDER_IS_BIG_ENDIAN / BYTEORDER_IS_LITTLE_ENDIAN
   is defined, selected by WORDS_BIGENDIAN. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
84
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000085/* --- Globals ------------------------------------------------------------
86
87 The globals are initialized by the _PyUnicode_Init() API and should
88 not be used before calling that API.
89
90*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000091
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000092
93#ifdef __cplusplus
94extern "C" {
95#endif
96
Walter Dörwald16807132007-05-25 13:52:07 +000097/* This dictionary holds all interned unicode strings. Note that references
98 to strings in this dictionary are *not* counted in the string's ob_refcnt.
99 When the interned string reaches a refcnt of 0 the string deallocation
100 function will delete the reference from this dictionary.
101
102 Another way to look at this is that to say that the actual reference
103 count of a string is: s->ob_refcnt + (s->ob_sstate?2:0)
104*/
105static PyObject *interned;
106
Guido van Rossumd57fd912000-03-10 22:53:23 +0000107/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000108static PyUnicodeObject *unicode_freelist;
109static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000110
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000111/* The empty Unicode object is shared to improve performance. */
112static PyUnicodeObject *unicode_empty;
113
114/* Single character Unicode strings in the Latin-1 range are being
115 shared as well. */
116static PyUnicodeObject *unicode_latin1[256];
117
Fred Drakee4315f52000-05-09 19:53:39 +0000118/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000119 parameter; it is fixed to "utf-8". Always use the
120 PyUnicode_GetDefaultEncoding() API to access this global. */
121static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000122
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000123Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000124PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000125{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000126#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000127 return 0x10FFFF;
128#else
129 /* This is actually an illegal character, so it should
130 not be passed to unichr. */
131 return 0xFFFF;
132#endif
133}
134
Thomas Wouters477c8d52006-05-27 19:21:47 +0000135/* --- Bloom Filters ----------------------------------------------------- */
136
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the least 5
   bits of each unicode character as the bit index. */

/* the linebreak mask is set up by Unicode_Init below */

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero when the bit selected by the low 5 bits of `ch` is set in
   `mask`.  The shifted constant is 1UL rather than 1 so that selecting
   bit 31 does not shift into the sign bit of an int (undefined
   behaviour); `mask` is parenthesized so that an expression argument
   is tested as a whole. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Cheap bloom pre-check followed by the exact line-break test. */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
151
152Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
153{
154 /* calculate simple bloom-style bitmask for a given unicode string */
155
156 long mask;
157 Py_ssize_t i;
158
159 mask = 0;
160 for (i = 0; i < len; i++)
161 mask |= (1 << (ptr[i] & 0x1F));
162
163 return mask;
164}
165
166Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
167{
168 Py_ssize_t i;
169
170 for (i = 0; i < setlen; i++)
171 if (set[i] == chr)
172 return 1;
173
174 return 0;
175}
176
/* True when `chr` is in `set`: the bloom mask is consulted first and the
   exact (linear) membership test only runs when the mask says "maybe".
   The whole expansion is parenthesized so the macro can be negated or
   combined with other operators safely. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM((mask), (chr)) && unicode_member((chr), (set), (setlen)))
179
Guido van Rossumd57fd912000-03-10 22:53:23 +0000180/* --- Unicode Object ----------------------------------------------------- */
181
182static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000183int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000184 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000185{
186 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000187
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000188 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000190 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000191
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000192 /* Resizing shared object (unicode_empty or single character
193 objects) in-place is not allowed. Use PyUnicode_Resize()
194 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000195
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000196 if (unicode == unicode_empty ||
197 (unicode->length == 1 &&
198 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000200 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000201 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 return -1;
203 }
204
Thomas Wouters477c8d52006-05-27 19:21:47 +0000205 /* We allocate one more byte to make sure the string is Ux0000 terminated.
206 The overallocation is also used by fastsearch, which assumes that it's
207 safe to look at str[length] (without making any assumptions about what
208 it contains). */
209
Guido van Rossumd57fd912000-03-10 22:53:23 +0000210 oldstr = unicode->str;
211 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
212 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000213 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000214 PyErr_NoMemory();
215 return -1;
216 }
217 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000218 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000219
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000220 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000221 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000222 if (unicode->defenc) {
223 Py_DECREF(unicode->defenc);
224 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000225 }
226 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000227
Guido van Rossumd57fd912000-03-10 22:53:23 +0000228 return 0;
229}
230
231/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000232 Ux0000 terminated; some code (e.g. new_identifier)
233 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000234
235 XXX This allocator could further be enhanced by assuring that the
236 free list never reduces its size below 1.
237
238*/
239
240static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000241PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242{
243 register PyUnicodeObject *unicode;
244
Thomas Wouters477c8d52006-05-27 19:21:47 +0000245 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000246 if (length == 0 && unicode_empty != NULL) {
247 Py_INCREF(unicode_empty);
248 return unicode_empty;
249 }
250
251 /* Unicode freelist & memory allocation */
252 if (unicode_freelist) {
253 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000254 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000255 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000256 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000257 /* Keep-Alive optimization: we only upsize the buffer,
258 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000259 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000260 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000261 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000262 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 }
264 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000265 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000267 }
268 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000269 }
270 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000271 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000272 if (unicode == NULL)
273 return NULL;
274 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
275 }
276
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000277 if (!unicode->str) {
278 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000279 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000280 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000281 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000282 * the caller fails before initializing str -- unicode_resize()
283 * reads str[0], and the Keep-Alive optimization can keep memory
284 * allocated for str alive across a call to unicode_dealloc(unicode).
285 * We don't want unicode_resize to read uninitialized memory in
286 * that case.
287 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000288 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000289 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000290 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000292 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000293 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000294 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000295
296 onError:
297 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000298 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000299 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000300}
301
302static
Guido van Rossum9475a232001-10-05 20:51:39 +0000303void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000304{
Walter Dörwald16807132007-05-25 13:52:07 +0000305 switch (PyUnicode_CHECK_INTERNED(unicode)) {
306 case SSTATE_NOT_INTERNED:
307 break;
308
309 case SSTATE_INTERNED_MORTAL:
310 /* revive dead object temporarily for DelItem */
Martin v. Löwis5d7428b2007-07-21 18:47:48 +0000311 Py_Refcnt(unicode) = 3;
Walter Dörwald16807132007-05-25 13:52:07 +0000312 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
313 Py_FatalError(
314 "deletion of interned unicode string failed");
315 break;
316
317 case SSTATE_INTERNED_IMMORTAL:
318 Py_FatalError("Immortal interned unicode string died.");
319
320 default:
321 Py_FatalError("Inconsistent interned unicode string state.");
322 }
323
Guido van Rossum604ddf82001-12-06 20:03:56 +0000324 if (PyUnicode_CheckExact(unicode) &&
325 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000326 /* Keep-Alive optimization */
327 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000328 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000329 unicode->str = NULL;
330 unicode->length = 0;
331 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000332 if (unicode->defenc) {
333 Py_DECREF(unicode->defenc);
334 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000335 }
336 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000337 *(PyUnicodeObject **)unicode = unicode_freelist;
338 unicode_freelist = unicode;
339 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000340 }
341 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000342 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000343 Py_XDECREF(unicode->defenc);
Martin v. Löwis9f2e3462007-07-21 17:22:18 +0000344 Py_Type(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000345 }
346}
347
Martin v. Löwis18e16552006-02-15 17:27:45 +0000348int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000349{
350 register PyUnicodeObject *v;
351
352 /* Argument checks */
353 if (unicode == NULL) {
354 PyErr_BadInternalCall();
355 return -1;
356 }
357 v = (PyUnicodeObject *)*unicode;
Martin v. Löwis9f2e3462007-07-21 17:22:18 +0000358 if (v == NULL || !PyUnicode_Check(v) || Py_Refcnt(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000359 PyErr_BadInternalCall();
360 return -1;
361 }
362
363 /* Resizing unicode_empty and single character objects is not
364 possible since these are being shared. We simply return a fresh
365 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000366 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000367 (v == unicode_empty || v->length == 1)) {
368 PyUnicodeObject *w = _PyUnicode_New(length);
369 if (w == NULL)
370 return -1;
371 Py_UNICODE_COPY(w->str, v->str,
372 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000373 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000374 *unicode = (PyObject *)w;
375 return 0;
376 }
377
378 /* Note that we don't have to modify *unicode for unshared Unicode
379 objects, since we can modify them in-place. */
380 return unicode_resize(v, length);
381}
382
/* Internal API for use in unicodeobject.c only !
   Convenience wrapper that casts the address of a PyUnicodeObject*
   variable to the PyObject** expected by PyUnicode_Resize(). */
#define _PyUnicode_Resize(unicodevar, length) \
        PyUnicode_Resize(((PyObject **)(unicodevar)), (length))
386
Guido van Rossumd57fd912000-03-10 22:53:23 +0000387PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000388 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389{
390 PyUnicodeObject *unicode;
391
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000392 /* If the Unicode data is known at construction time, we can apply
393 some optimizations which share commonly used objects. */
394 if (u != NULL) {
395
396 /* Optimization for empty strings */
397 if (size == 0 && unicode_empty != NULL) {
398 Py_INCREF(unicode_empty);
399 return (PyObject *)unicode_empty;
400 }
401
402 /* Single character Unicode objects in the Latin-1 range are
403 shared when using this constructor */
404 if (size == 1 && *u < 256) {
405 unicode = unicode_latin1[*u];
406 if (!unicode) {
407 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000408 if (!unicode)
409 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000410 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000411 unicode_latin1[*u] = unicode;
412 }
413 Py_INCREF(unicode);
414 return (PyObject *)unicode;
415 }
416 }
Tim Petersced69f82003-09-16 20:30:58 +0000417
Guido van Rossumd57fd912000-03-10 22:53:23 +0000418 unicode = _PyUnicode_New(size);
419 if (!unicode)
420 return NULL;
421
422 /* Copy the Unicode data into the new object */
423 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000424 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000425
426 return (PyObject *)unicode;
427}
428
Walter Dörwaldd2034312007-05-18 16:29:38 +0000429PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000430{
431 PyUnicodeObject *unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000432 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000433 some optimizations which share commonly used objects.
434 Also, this means the input must be UTF-8, so fall back to the
435 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000436 if (u != NULL) {
437
438 /* Optimization for empty strings */
439 if (size == 0 && unicode_empty != NULL) {
440 Py_INCREF(unicode_empty);
441 return (PyObject *)unicode_empty;
442 }
443
Martin v. Löwis9c121062007-08-05 20:26:11 +0000444 /* Single characters are shared when using this constructor.
445 Restrict to ASCII, since the input must be UTF-8. */
446 if (size == 1 && Py_CHARMASK(*u) < 128) {
Guido van Rossum00058aa2007-07-19 18:21:28 +0000447 unicode = unicode_latin1[Py_CHARMASK(*u)];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000448 if (!unicode) {
449 unicode = _PyUnicode_New(1);
450 if (!unicode)
451 return NULL;
Guido van Rossum00058aa2007-07-19 18:21:28 +0000452 unicode->str[0] = Py_CHARMASK(*u);
453 unicode_latin1[Py_CHARMASK(*u)] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000454 }
455 Py_INCREF(unicode);
456 return (PyObject *)unicode;
457 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000458
459 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000460 }
461
Walter Dörwald55507312007-05-18 13:12:10 +0000462 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000463 if (!unicode)
464 return NULL;
465
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000466 return (PyObject *)unicode;
467}
468
Walter Dörwaldd2034312007-05-18 16:29:38 +0000469PyObject *PyUnicode_FromString(const char *u)
470{
471 size_t size = strlen(u);
472 if (size > PY_SSIZE_T_MAX) {
473 PyErr_SetString(PyExc_OverflowError, "input too long");
474 return NULL;
475 }
476
477 return PyUnicode_FromStringAndSize(u, size);
478}
479
Guido van Rossumd57fd912000-03-10 22:53:23 +0000480#ifdef HAVE_WCHAR_H
481
482PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000483 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000484{
485 PyUnicodeObject *unicode;
486
487 if (w == NULL) {
488 PyErr_BadInternalCall();
489 return NULL;
490 }
491
492 unicode = _PyUnicode_New(size);
493 if (!unicode)
494 return NULL;
495
496 /* Copy the wchar_t data into the new object */
497#ifdef HAVE_USABLE_WCHAR_T
498 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000499#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000500 {
501 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000502 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000503 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000504 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000505 *u++ = *w++;
506 }
507#endif
508
509 return (PyObject *)unicode;
510}
511
Walter Dörwald346737f2007-05-31 10:44:43 +0000512static void
513makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
514{
515 *fmt++ = '%';
516 if (width) {
517 if (zeropad)
518 *fmt++ = '0';
519 fmt += sprintf(fmt, "%d", width);
520 }
521 if (precision)
522 fmt += sprintf(fmt, ".%d", precision);
523 if (longflag)
524 *fmt++ = 'l';
525 else if (size_tflag) {
526 char *f = PY_FORMAT_SIZE_T;
527 while (*f)
528 *fmt++ = *f++;
529 }
530 *fmt++ = c;
531 *fmt = '\0';
532}
533
/* Append the NUL-terminated char string `string` to the output, one
   character per element, advancing the caller's `s` write pointer.
   Relies on the variables `copy` and `s` being in scope at the point
   of use. */
#define appendstring(string) {for (copy = (string);*copy;) *s++ = *copy++;}
535
536PyObject *
537PyUnicode_FromFormatV(const char *format, va_list vargs)
538{
539 va_list count;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000540 Py_ssize_t callcount = 0;
541 PyObject **callresults = NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +0000542 PyObject **callresult = NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000543 Py_ssize_t n = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000544 int width = 0;
545 int precision = 0;
546 int zeropad;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000547 const char* f;
548 Py_UNICODE *s;
549 PyObject *string;
550 /* used by sprintf */
551 char buffer[21];
Walter Dörwald346737f2007-05-31 10:44:43 +0000552 /* use abuffer instead of buffer, if we need more space
553 * (which can happen if there's a format specifier with width). */
554 char *abuffer = NULL;
555 char *realbuffer;
556 Py_ssize_t abuffersize = 0;
557 char fmt[60]; /* should be enough for %0width.precisionld */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000558 const char *copy;
559
560#ifdef VA_LIST_IS_ARRAY
561 Py_MEMCPY(count, vargs, sizeof(va_list));
562#else
563#ifdef __va_copy
564 __va_copy(count, vargs);
565#else
566 count = vargs;
567#endif
568#endif
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000569 /* step 1: count the number of %S/%R format specifications
570 * (we call PyObject_Unicode()/PyObject_Repr() for these objects
571 * once during step 3 and put the result in an array) */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000572 for (f = format; *f; f++) {
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000573 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000574 ++callcount;
575 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000576 /* step 2: allocate memory for the results of
577 * PyObject_Unicode()/PyObject_Repr() calls */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000578 if (callcount) {
579 callresults = PyMem_Malloc(sizeof(PyObject *)*callcount);
580 if (!callresults) {
581 PyErr_NoMemory();
582 return NULL;
583 }
584 callresult = callresults;
585 }
586 /* step 3: figure out how large a buffer we need */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000587 for (f = format; *f; f++) {
588 if (*f == '%') {
589 const char* p = f;
Walter Dörwald346737f2007-05-31 10:44:43 +0000590 width = 0;
591 while (isdigit(Py_CHARMASK(*f)))
592 width = (width*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000593 while (*++f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
594 ;
595
596 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
597 * they don't affect the amount of space we reserve.
598 */
599 if ((*f == 'l' || *f == 'z') &&
600 (f[1] == 'd' || f[1] == 'u'))
Eric Smithddd25822007-08-27 11:33:42 +0000601 ++f;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000602
603 switch (*f) {
604 case 'c':
605 (void)va_arg(count, int);
606 /* fall through... */
607 case '%':
608 n++;
609 break;
610 case 'd': case 'u': case 'i': case 'x':
611 (void) va_arg(count, int);
612 /* 20 bytes is enough to hold a 64-bit
613 integer. Decimal takes the most space.
Walter Dörwald346737f2007-05-31 10:44:43 +0000614 This isn't enough for octal.
615 If a width is specified we need more
616 (which we allocate later). */
617 if (width < 20)
618 width = 20;
619 n += width;
620 if (abuffersize < width)
621 abuffersize = width;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000622 break;
623 case 's':
624 n += strlen(va_arg(count, char*));
625 break;
626 case 'U':
627 {
628 PyObject *obj = va_arg(count, PyObject *);
629 assert(obj && PyUnicode_Check(obj));
630 n += PyUnicode_GET_SIZE(obj);
631 break;
632 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000633 case 'V':
634 {
635 PyObject *obj = va_arg(count, PyObject *);
636 const char *str = va_arg(count, const char *);
637 assert(obj || str);
638 assert(!obj || PyUnicode_Check(obj));
639 if (obj)
640 n += PyUnicode_GET_SIZE(obj);
641 else
642 n += strlen(str);
643 break;
644 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000645 case 'S':
646 {
647 PyObject *obj = va_arg(count, PyObject *);
648 PyObject *str;
649 assert(obj);
650 str = PyObject_Unicode(obj);
651 if (!str)
652 goto fail;
653 n += PyUnicode_GET_SIZE(str);
654 /* Remember the str and switch to the next slot */
655 *callresult++ = str;
656 break;
657 }
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000658 case 'R':
659 {
660 PyObject *obj = va_arg(count, PyObject *);
661 PyObject *repr;
662 assert(obj);
663 repr = PyObject_Repr(obj);
664 if (!repr)
665 goto fail;
666 n += PyUnicode_GET_SIZE(repr);
667 /* Remember the repr and switch to the next slot */
668 *callresult++ = repr;
669 break;
670 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000671 case 'p':
672 (void) va_arg(count, int);
673 /* maximum 64-bit pointer representation:
674 * 0xffffffffffffffff
675 * so 19 characters is enough.
676 * XXX I count 18 -- what's the extra for?
677 */
678 n += 19;
679 break;
680 default:
681 /* if we stumble upon an unknown
682 formatting code, copy the rest of
683 the format string to the output
684 string. (we cannot just skip the
685 code, since there's no way to know
686 what's in the argument list) */
687 n += strlen(p);
688 goto expand;
689 }
690 } else
691 n++;
692 }
693 expand:
Walter Dörwald346737f2007-05-31 10:44:43 +0000694 if (abuffersize > 20) {
695 abuffer = PyMem_Malloc(abuffersize);
696 if (!abuffer) {
697 PyErr_NoMemory();
698 goto fail;
699 }
700 realbuffer = abuffer;
701 }
702 else
703 realbuffer = buffer;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000704 /* step 4: fill the buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000705 /* Since we've analyzed how much space we need for the worst case,
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000706 we don't have to resize the string.
707 There can be no errors beyond this point. */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000708 string = PyUnicode_FromUnicode(NULL, n);
709 if (!string)
Walter Dörwald346737f2007-05-31 10:44:43 +0000710 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000711
712 s = PyUnicode_AS_UNICODE(string);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000713 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000714
715 for (f = format; *f; f++) {
716 if (*f == '%') {
717 const char* p = f++;
718 int longflag = 0;
719 int size_tflag = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000720 zeropad = (*f == '0');
721 /* parse the width.precision part */
722 width = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000723 while (isdigit(Py_CHARMASK(*f)))
Walter Dörwald346737f2007-05-31 10:44:43 +0000724 width = (width*10) + *f++ - '0';
725 precision = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000726 if (*f == '.') {
727 f++;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000728 while (isdigit(Py_CHARMASK(*f)))
Walter Dörwald346737f2007-05-31 10:44:43 +0000729 precision = (precision*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000730 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000731 /* handle the long flag, but only for %ld and %lu.
732 others can be added when necessary. */
733 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
734 longflag = 1;
735 ++f;
736 }
737 /* handle the size_t flag. */
738 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
739 size_tflag = 1;
740 ++f;
741 }
742
743 switch (*f) {
744 case 'c':
745 *s++ = va_arg(vargs, int);
746 break;
747 case 'd':
Walter Dörwald346737f2007-05-31 10:44:43 +0000748 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000749 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000750 sprintf(realbuffer, fmt, va_arg(vargs, long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000751 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000752 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000753 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000754 sprintf(realbuffer, fmt, va_arg(vargs, int));
755 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000756 break;
757 case 'u':
Walter Dörwald346737f2007-05-31 10:44:43 +0000758 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000759 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000760 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000761 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000762 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000763 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000764 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
765 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000766 break;
767 case 'i':
Walter Dörwald346737f2007-05-31 10:44:43 +0000768 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
769 sprintf(realbuffer, fmt, va_arg(vargs, int));
770 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000771 break;
772 case 'x':
Walter Dörwald346737f2007-05-31 10:44:43 +0000773 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
774 sprintf(realbuffer, fmt, va_arg(vargs, int));
775 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000776 break;
777 case 's':
778 p = va_arg(vargs, char*);
779 appendstring(p);
780 break;
781 case 'U':
782 {
783 PyObject *obj = va_arg(vargs, PyObject *);
Walter Dörwald5c2fab62007-05-24 19:51:02 +0000784 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
785 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
786 s += size;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000787 break;
788 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000789 case 'V':
790 {
791 PyObject *obj = va_arg(vargs, PyObject *);
792 const char *str = va_arg(vargs, const char *);
793 if (obj) {
794 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
795 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
796 s += size;
797 } else {
798 appendstring(str);
799 }
800 break;
801 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000802 case 'S':
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000803 case 'R':
804 {
Guido van Rossum755114a2007-06-13 01:04:27 +0000805 Py_UNICODE *ucopy;
806 Py_ssize_t usize;
807 Py_ssize_t upos;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000808 /* unused, since we already have the result */
809 (void) va_arg(vargs, PyObject *);
Guido van Rossum755114a2007-06-13 01:04:27 +0000810 ucopy = PyUnicode_AS_UNICODE(*callresult);
811 usize = PyUnicode_GET_SIZE(*callresult);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000812 for (upos = 0; upos<usize;)
813 *s++ = ucopy[upos++];
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000814 /* We're done with the unicode()/repr() => forget it */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000815 Py_DECREF(*callresult);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000816 /* switch to next unicode()/repr() result */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000817 ++callresult;
818 break;
819 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000820 case 'p':
821 sprintf(buffer, "%p", va_arg(vargs, void*));
822 /* %p is ill-defined: ensure leading 0x. */
823 if (buffer[1] == 'X')
824 buffer[1] = 'x';
825 else if (buffer[1] != 'x') {
826 memmove(buffer+2, buffer, strlen(buffer)+1);
827 buffer[0] = '0';
828 buffer[1] = 'x';
829 }
830 appendstring(buffer);
831 break;
832 case '%':
833 *s++ = '%';
834 break;
835 default:
836 appendstring(p);
837 goto end;
838 }
839 } else
840 *s++ = *f;
841 }
842
843 end:
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000844 if (callresults)
845 PyMem_Free(callresults);
Walter Dörwald346737f2007-05-31 10:44:43 +0000846 if (abuffer)
847 PyMem_Free(abuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000848 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
849 return string;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000850 fail:
851 if (callresults) {
852 PyObject **callresult2 = callresults;
Guido van Rossum307fa8c2007-07-16 20:46:27 +0000853 while (callresult2 < callresult) {
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000854 Py_DECREF(*callresult2);
855 ++callresult2;
856 }
857 PyMem_Free(callresults);
858 }
Walter Dörwald346737f2007-05-31 10:44:43 +0000859 if (abuffer)
860 PyMem_Free(abuffer);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000861 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000862}
863
864#undef appendstring
865
866PyObject *
867PyUnicode_FromFormat(const char *format, ...)
868{
869 PyObject* ret;
870 va_list vargs;
871
872#ifdef HAVE_STDARG_PROTOTYPES
873 va_start(vargs, format);
874#else
875 va_start(vargs);
876#endif
877 ret = PyUnicode_FromFormatV(format, vargs);
878 va_end(vargs);
879 return ret;
880}
881
Martin v. Löwis18e16552006-02-15 17:27:45 +0000882Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
883 wchar_t *w,
884 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000885{
886 if (unicode == NULL) {
887 PyErr_BadInternalCall();
888 return -1;
889 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000890
891 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000892 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000893 size = PyUnicode_GET_SIZE(unicode) + 1;
894
Guido van Rossumd57fd912000-03-10 22:53:23 +0000895#ifdef HAVE_USABLE_WCHAR_T
896 memcpy(w, unicode->str, size * sizeof(wchar_t));
897#else
898 {
899 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000900 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000901 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000902 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000903 *w++ = *u++;
904 }
905#endif
906
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000907 if (size > PyUnicode_GET_SIZE(unicode))
908 return PyUnicode_GET_SIZE(unicode);
909 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000910 return size;
911}
912
913#endif
914
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000915PyObject *PyUnicode_FromOrdinal(int ordinal)
916{
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000917 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000918
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000919 if (ordinal < 0 || ordinal > 0x10ffff) {
920 PyErr_SetString(PyExc_ValueError,
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000921 "chr() arg not in range(0x110000)");
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000922 return NULL;
923 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000924
925#ifndef Py_UNICODE_WIDE
926 if (ordinal > 0xffff) {
927 ordinal -= 0x10000;
928 s[0] = 0xD800 | (ordinal >> 10);
929 s[1] = 0xDC00 | (ordinal & 0x3FF);
930 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000931 }
932#endif
933
Hye-Shik Chang40574832004-04-06 07:24:51 +0000934 s[0] = (Py_UNICODE)ordinal;
935 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000936}
937
Guido van Rossumd57fd912000-03-10 22:53:23 +0000938PyObject *PyUnicode_FromObject(register PyObject *obj)
939{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000940 /* XXX Perhaps we should make this API an alias of
941 PyObject_Unicode() instead ?! */
942 if (PyUnicode_CheckExact(obj)) {
943 Py_INCREF(obj);
944 return obj;
945 }
946 if (PyUnicode_Check(obj)) {
947 /* For a Unicode subtype that's not a Unicode object,
948 return a true Unicode object with the same data. */
949 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
950 PyUnicode_GET_SIZE(obj));
951 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000952 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
953}
954
955PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
956 const char *encoding,
957 const char *errors)
958{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000959 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000960 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000961 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000962
Guido van Rossumd57fd912000-03-10 22:53:23 +0000963 if (obj == NULL) {
964 PyErr_BadInternalCall();
965 return NULL;
966 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000967
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000968 if (PyUnicode_Check(obj)) {
969 PyErr_SetString(PyExc_TypeError,
970 "decoding Unicode is not supported");
971 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000972 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000973
974 /* Coerce object */
975 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000976 s = PyString_AS_STRING(obj);
977 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000978 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000979 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
980 /* Overwrite the error message with something more useful in
981 case of a TypeError. */
982 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000983 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000984 "coercing to Unicode: need string or buffer, "
985 "%.80s found",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +0000986 Py_Type(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000987 goto onError;
988 }
Tim Petersced69f82003-09-16 20:30:58 +0000989
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000990 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000991 if (len == 0) {
992 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000993 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000994 }
Tim Petersced69f82003-09-16 20:30:58 +0000995 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000996 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000997
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000998 return v;
999
1000 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001001 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001002}
1003
1004PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001005 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001006 const char *encoding,
1007 const char *errors)
1008{
1009 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001010
1011 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001012 encoding = PyUnicode_GetDefaultEncoding();
1013
1014 /* Shortcuts for common default encodings */
1015 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001016 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001017 else if (strcmp(encoding, "latin-1") == 0)
1018 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001019#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1020 else if (strcmp(encoding, "mbcs") == 0)
1021 return PyUnicode_DecodeMBCS(s, size, errors);
1022#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001023 else if (strcmp(encoding, "ascii") == 0)
1024 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001025
1026 /* Decode via the codec registry */
1027 buffer = PyBuffer_FromMemory((void *)s, size);
1028 if (buffer == NULL)
1029 goto onError;
1030 unicode = PyCodec_Decode(buffer, encoding, errors);
1031 if (unicode == NULL)
1032 goto onError;
1033 if (!PyUnicode_Check(unicode)) {
1034 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001035 "decoder did not return an unicode object (type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00001036 Py_Type(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001037 Py_DECREF(unicode);
1038 goto onError;
1039 }
1040 Py_DECREF(buffer);
1041 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001042
Guido van Rossumd57fd912000-03-10 22:53:23 +00001043 onError:
1044 Py_XDECREF(buffer);
1045 return NULL;
1046}
1047
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001048PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1049 const char *encoding,
1050 const char *errors)
1051{
1052 PyObject *v;
1053
1054 if (!PyUnicode_Check(unicode)) {
1055 PyErr_BadArgument();
1056 goto onError;
1057 }
1058
1059 if (encoding == NULL)
1060 encoding = PyUnicode_GetDefaultEncoding();
1061
1062 /* Decode via the codec registry */
1063 v = PyCodec_Decode(unicode, encoding, errors);
1064 if (v == NULL)
1065 goto onError;
1066 return v;
1067
1068 onError:
1069 return NULL;
1070}
1071
Guido van Rossumd57fd912000-03-10 22:53:23 +00001072PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001073 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001074 const char *encoding,
1075 const char *errors)
1076{
1077 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001078
Guido van Rossumd57fd912000-03-10 22:53:23 +00001079 unicode = PyUnicode_FromUnicode(s, size);
1080 if (unicode == NULL)
1081 return NULL;
1082 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1083 Py_DECREF(unicode);
1084 return v;
1085}
1086
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001087PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1088 const char *encoding,
1089 const char *errors)
1090{
1091 PyObject *v;
1092
1093 if (!PyUnicode_Check(unicode)) {
1094 PyErr_BadArgument();
1095 goto onError;
1096 }
1097
1098 if (encoding == NULL)
1099 encoding = PyUnicode_GetDefaultEncoding();
1100
1101 /* Encode via the codec registry */
1102 v = PyCodec_Encode(unicode, encoding, errors);
1103 if (v == NULL)
1104 goto onError;
1105 return v;
1106
1107 onError:
1108 return NULL;
1109}
1110
Guido van Rossumd57fd912000-03-10 22:53:23 +00001111PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1112 const char *encoding,
1113 const char *errors)
1114{
1115 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001116
Guido van Rossumd57fd912000-03-10 22:53:23 +00001117 if (!PyUnicode_Check(unicode)) {
1118 PyErr_BadArgument();
1119 goto onError;
1120 }
Fred Drakee4315f52000-05-09 19:53:39 +00001121
Tim Petersced69f82003-09-16 20:30:58 +00001122 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001123 encoding = PyUnicode_GetDefaultEncoding();
1124
1125 /* Shortcuts for common default encodings */
1126 if (errors == NULL) {
1127 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001128 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001129 else if (strcmp(encoding, "latin-1") == 0)
1130 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001131#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1132 else if (strcmp(encoding, "mbcs") == 0)
1133 return PyUnicode_AsMBCSString(unicode);
1134#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001135 else if (strcmp(encoding, "ascii") == 0)
1136 return PyUnicode_AsASCIIString(unicode);
1137 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001138
1139 /* Encode via the codec registry */
1140 v = PyCodec_Encode(unicode, encoding, errors);
1141 if (v == NULL)
1142 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001143 if (!PyBytes_Check(v)) {
1144 if (PyString_Check(v)) {
1145 /* Old codec, turn it into bytes */
1146 PyObject *b = PyBytes_FromObject(v);
1147 Py_DECREF(v);
1148 return b;
1149 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001150 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001151 "encoder did not return a bytes object "
1152 "(type=%.400s, encoding=%.20s, errors=%.20s)",
1153 v->ob_type->tp_name,
1154 encoding ? encoding : "NULL",
1155 errors ? errors : "NULL");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001156 Py_DECREF(v);
1157 goto onError;
1158 }
1159 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001160
Guido van Rossumd57fd912000-03-10 22:53:23 +00001161 onError:
1162 return NULL;
1163}
1164
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001165PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1166 const char *errors)
1167{
1168 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001169 PyObject *b;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001170 if (v)
1171 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001172 if (errors != NULL)
1173 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum06610092007-08-16 21:02:22 +00001174 b = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1175 PyUnicode_GET_SIZE(unicode),
1176 NULL);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001177 if (!b)
1178 return NULL;
1179 v = PyString_FromStringAndSize(PyBytes_AsString(b),
1180 PyBytes_Size(b));
1181 Py_DECREF(b);
Guido van Rossume7a0d392007-07-12 07:53:00 +00001182 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001183 return v;
1184}
1185
Martin v. Löwis5b222132007-06-10 09:51:05 +00001186char*
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001187PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001188{
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001189 PyObject *str8;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001190 if (!PyUnicode_Check(unicode)) {
1191 PyErr_BadArgument();
1192 return NULL;
1193 }
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001194 str8 = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1195 if (str8 == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001196 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001197 if (psize != NULL)
1198 *psize = PyString_GET_SIZE(str8);
1199 return PyString_AS_STRING(str8);
1200}
1201
1202char*
1203PyUnicode_AsString(PyObject *unicode)
1204{
1205 return PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001206}
1207
Guido van Rossumd57fd912000-03-10 22:53:23 +00001208Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1209{
1210 if (!PyUnicode_Check(unicode)) {
1211 PyErr_BadArgument();
1212 goto onError;
1213 }
1214 return PyUnicode_AS_UNICODE(unicode);
1215
1216 onError:
1217 return NULL;
1218}
1219
Martin v. Löwis18e16552006-02-15 17:27:45 +00001220Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001221{
1222 if (!PyUnicode_Check(unicode)) {
1223 PyErr_BadArgument();
1224 goto onError;
1225 }
1226 return PyUnicode_GET_SIZE(unicode);
1227
1228 onError:
1229 return -1;
1230}
1231
Thomas Wouters78890102000-07-22 19:25:51 +00001232const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001233{
1234 return unicode_default_encoding;
1235}
1236
1237int PyUnicode_SetDefaultEncoding(const char *encoding)
1238{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001239 if (strcmp(encoding, unicode_default_encoding) != 0) {
1240 PyErr_Format(PyExc_ValueError,
1241 "Can only set default encoding to %s",
1242 unicode_default_encoding);
1243 return -1;
1244 }
Fred Drakee4315f52000-05-09 19:53:39 +00001245 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001246}
1247
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001248/* error handling callback helper:
1249 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001250 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001251 and adjust various state variables.
1252 return 0 on success, -1 on error
1253*/
1254
1255static
1256int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1257 const char *encoding, const char *reason,
Walter Dörwalda651d3d2007-08-30 15:29:21 +00001258 const char **input, const char **inend, Py_ssize_t *startinpos,
1259 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001260 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001261{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001262 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001263
1264 PyObject *restuple = NULL;
1265 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001266 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001267 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001268 Py_ssize_t requiredsize;
1269 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001270 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001271 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001272 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001273 int res = -1;
1274
1275 if (*errorHandler == NULL) {
1276 *errorHandler = PyCodec_LookupError(errors);
1277 if (*errorHandler == NULL)
1278 goto onError;
1279 }
1280
1281 if (*exceptionObject == NULL) {
1282 *exceptionObject = PyUnicodeDecodeError_Create(
Walter Dörwalde78178e2007-07-30 13:31:40 +00001283 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001284 if (*exceptionObject == NULL)
1285 goto onError;
1286 }
1287 else {
1288 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1289 goto onError;
1290 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1291 goto onError;
1292 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1293 goto onError;
1294 }
1295
1296 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1297 if (restuple == NULL)
1298 goto onError;
1299 if (!PyTuple_Check(restuple)) {
1300 PyErr_Format(PyExc_TypeError, &argparse[4]);
1301 goto onError;
1302 }
1303 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1304 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001305
1306 /* Copy back the bytes variables, which might have been modified by the
1307 callback */
1308 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1309 if (!inputobj)
1310 goto onError;
1311 if (!PyBytes_Check(inputobj)) {
1312 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
1313 }
1314 *input = PyBytes_AS_STRING(inputobj);
1315 insize = PyBytes_GET_SIZE(inputobj);
1316 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001317 /* we can DECREF safely, as the exception has another reference,
1318 so the object won't go away. */
1319 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001320
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001321 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001322 newpos = insize+newpos;
1323 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001324 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001325 goto onError;
1326 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001327
1328 /* need more space? (at least enough for what we
1329 have+the replacement+the rest of the string (starting
1330 at the new input position), so we won't have to check space
1331 when there are no errors in the rest of the string) */
1332 repptr = PyUnicode_AS_UNICODE(repunicode);
1333 repsize = PyUnicode_GET_SIZE(repunicode);
1334 requiredsize = *outpos + repsize + insize-newpos;
1335 if (requiredsize > outsize) {
1336 if (requiredsize<2*outsize)
1337 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001338 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001339 goto onError;
1340 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1341 }
1342 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001343 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001344 Py_UNICODE_COPY(*outptr, repptr, repsize);
1345 *outptr += repsize;
1346 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001347
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001348 /* we made it! */
1349 res = 0;
1350
1351 onError:
1352 Py_XDECREF(restuple);
1353 return res;
1354}
1355
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001356/* --- UTF-7 Codec -------------------------------------------------------- */
1357
1358/* see RFC2152 for details */
1359
/* Read-only classification of the 128 ASCII characters for UTF-7;
   indexed by character code. */
static
const char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
           0 - not special
           1 - special
           2 - whitespace (optional)
           3 - RFC2152 Set O (optional) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
1378
/* SPECIAL(c, encodeO, encodeWS): true when character c cannot be
   written directly in UTF-7.  encodeWS / encodeO additionally treat
   the optional whitespace (class 2) / RFC2152 Set O (class 3)
   characters of utf7_special as special.

   Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

#define SPECIAL(c, encodeO, encodeWS)                       \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 ||     \
     (encodeWS && (utf7_special[(c)] == 2)) ||              \
     (encodeO && (utf7_special[(c)] == 3)))

/* B64(n): base64 character for the low six bits of n. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true when c belongs to the base64 alphabet. */
#define B64CHAR(c)  \
    (isalnum(c) || (c) == '+' || (c) == '/')

/* UB64(c): six-bit value of base64 character c (inverse of B64);
   the result is only meaningful when B64CHAR(c) holds. */
#define UB64(c)                                             \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?       \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE(out, ch, bits): while at least six bits are pending in ch,
   emit the topmost six as one base64 character through out and
   decrease bits accordingly. */
#define ENCODE(out, ch, bits)                               \
    while (bits >= 6) {                                     \
        *out++ = B64(ch >> (bits-6));                       \
        bits -= 6;                                          \
    }

/* DECODE(out, ch, bits, surrogate): while at least sixteen bits are
   pending in ch, extract the topmost sixteen as one code unit.
   Relies on an `errmsg` variable and a `utf7Error` label in the
   expanding function. */
#define DECODE(out, ch, bits, surrogate)                                    \
    while (bits >= 16) {                                                    \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);       \
        bits -= 16;                                                         \
        if (surrogate) {                                                    \
            /* We have already generated an error for the high surrogate */ \
            /* so let's not bother seeing if the low surrogate is        */ \
            /* correct or not                                            */ \
            surrogate = 0;                                                  \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                    \
            /* This is a surrogate pair. Unfortunately we can't          */ \
            /* represent it in a 16-bit character                        */ \
            surrogate = 1;                                                  \
            errmsg = "code pairs are not supported";                        \
            goto utf7Error;                                                 \
        } else {                                                            \
            *out++ = outCh;                                                 \
        }                                                                   \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001421
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001422PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001423 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001424 const char *errors)
1425{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001426 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001427 Py_ssize_t startinpos;
1428 Py_ssize_t endinpos;
1429 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001430 const char *e;
1431 PyUnicodeObject *unicode;
1432 Py_UNICODE *p;
1433 const char *errmsg = "";
1434 int inShift = 0;
1435 unsigned int bitsleft = 0;
1436 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001437 int surrogate = 0;
1438 PyObject *errorHandler = NULL;
1439 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001440
1441 unicode = _PyUnicode_New(size);
1442 if (!unicode)
1443 return NULL;
1444 if (size == 0)
1445 return (PyObject *)unicode;
1446
1447 p = unicode->str;
1448 e = s + size;
1449
1450 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001451 Py_UNICODE ch;
1452 restart:
1453 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001454
1455 if (inShift) {
1456 if ((ch == '-') || !B64CHAR(ch)) {
1457 inShift = 0;
1458 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001459
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001460 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1461 if (bitsleft >= 6) {
1462 /* The shift sequence has a partial character in it. If
1463 bitsleft < 6 then we could just classify it as padding
1464 but that is not the case here */
1465
1466 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001467 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001468 }
1469 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001470 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001471 here so indicate the potential of a misencoded character. */
1472
1473 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1474 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1475 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001476 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001477 }
1478
1479 if (ch == '-') {
1480 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001481 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001482 inShift = 1;
1483 }
1484 } else if (SPECIAL(ch,0,0)) {
1485 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001486 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001487 } else {
1488 *p++ = ch;
1489 }
1490 } else {
1491 charsleft = (charsleft << 6) | UB64(ch);
1492 bitsleft += 6;
1493 s++;
1494 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1495 }
1496 }
1497 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001498 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001499 s++;
1500 if (s < e && *s == '-') {
1501 s++;
1502 *p++ = '+';
1503 } else
1504 {
1505 inShift = 1;
1506 bitsleft = 0;
1507 }
1508 }
1509 else if (SPECIAL(ch,0,0)) {
1510 errmsg = "unexpected special character";
1511 s++;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001512 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001513 }
1514 else {
1515 *p++ = ch;
1516 s++;
1517 }
1518 continue;
1519 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001520 outpos = p-PyUnicode_AS_UNICODE(unicode);
1521 endinpos = s-starts;
1522 if (unicode_decode_call_errorhandler(
1523 errors, &errorHandler,
1524 "utf7", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001525 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001526 (PyObject **)&unicode, &outpos, &p))
1527 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001528 }
1529
1530 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001531 outpos = p-PyUnicode_AS_UNICODE(unicode);
1532 endinpos = size;
1533 if (unicode_decode_call_errorhandler(
1534 errors, &errorHandler,
1535 "utf7", "unterminated shift sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00001536 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001537 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001538 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001539 if (s < e)
1540 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001541 }
1542
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001543 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001544 goto onError;
1545
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001546 Py_XDECREF(errorHandler);
1547 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001548 return (PyObject *)unicode;
1549
1550onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001551 Py_XDECREF(errorHandler);
1552 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001553 Py_DECREF(unicode);
1554 return NULL;
1555}
1556
1557
1558PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001559 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001560 int encodeSetO,
1561 int encodeWhiteSpace,
1562 const char *errors)
1563{
1564 PyObject *v;
1565 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001566 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001567 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001568 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001569 unsigned int bitsleft = 0;
1570 unsigned long charsleft = 0;
1571 char * out;
1572 char * start;
1573
1574 if (size == 0)
Walter Dörwald51ab4142007-05-05 14:43:36 +00001575 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001576
Walter Dörwald51ab4142007-05-05 14:43:36 +00001577 v = PyBytes_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001578 if (v == NULL)
1579 return NULL;
1580
Walter Dörwald51ab4142007-05-05 14:43:36 +00001581 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001582 for (;i < size; ++i) {
1583 Py_UNICODE ch = s[i];
1584
1585 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001586 if (ch == '+') {
1587 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001588 *out++ = '-';
1589 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1590 charsleft = ch;
1591 bitsleft = 16;
1592 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001593 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001594 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001595 } else {
1596 *out++ = (char) ch;
1597 }
1598 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001599 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1600 *out++ = B64(charsleft << (6-bitsleft));
1601 charsleft = 0;
1602 bitsleft = 0;
1603 /* Characters not in the BASE64 set implicitly unshift the sequence
1604 so no '-' is required, except if the character is itself a '-' */
1605 if (B64CHAR(ch) || ch == '-') {
1606 *out++ = '-';
1607 }
1608 inShift = 0;
1609 *out++ = (char) ch;
1610 } else {
1611 bitsleft += 16;
1612 charsleft = (charsleft << 16) | ch;
1613 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1614
1615 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001616 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001617 or '-' then the shift sequence will be terminated implicitly and we
1618 don't have to insert a '-'. */
1619
1620 if (bitsleft == 0) {
1621 if (i + 1 < size) {
1622 Py_UNICODE ch2 = s[i+1];
1623
1624 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001625
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001626 } else if (B64CHAR(ch2) || ch2 == '-') {
1627 *out++ = '-';
1628 inShift = 0;
1629 } else {
1630 inShift = 0;
1631 }
1632
1633 }
1634 else {
1635 *out++ = '-';
1636 inShift = 0;
1637 }
1638 }
Tim Petersced69f82003-09-16 20:30:58 +00001639 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001640 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001641 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001642 if (bitsleft) {
1643 *out++= B64(charsleft << (6-bitsleft) );
1644 *out++ = '-';
1645 }
1646
Walter Dörwald51ab4142007-05-05 14:43:36 +00001647 if (PyBytes_Resize(v, out - start)) {
1648 Py_DECREF(v);
1649 return NULL;
1650 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001651 return v;
1652}
1653
1654#undef SPECIAL
1655#undef B64
1656#undef B64CHAR
1657#undef UB64
1658#undef ENCODE
1659#undef DECODE
1660
Guido van Rossumd57fd912000-03-10 22:53:23 +00001661/* --- UTF-8 Codec -------------------------------------------------------- */
1662
/* Map a UTF-8 lead byte to the total length of the sequence it introduces.
   Zero means the byte is not a legal lead byte (0x80..0xBF, 0xFE and 0xFF).
   See RFC 2279 for details; that RFC's 5- and 6-byte forms are listed here,
   and the decoder in this file rejects them when it meets them.

   The table is only ever read, so it is const-qualified. */
static const
char utf8_code_length[256] = {
    /* 0x00..0x7F: single-byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80..0xBF: continuation bytes, never valid as a lead byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0..0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0..0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0..0xFD: four-, five- and six-byte sequences; 0xFE/0xFF illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1684
Guido van Rossumd57fd912000-03-10 22:53:23 +00001685PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001686 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001687 const char *errors)
1688{
Walter Dörwald69652032004-09-07 20:24:22 +00001689 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1690}
1691
1692PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001693 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001694 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001695 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001696{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001697 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001698 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001699 Py_ssize_t startinpos;
1700 Py_ssize_t endinpos;
1701 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001702 const char *e;
1703 PyUnicodeObject *unicode;
1704 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001705 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001706 PyObject *errorHandler = NULL;
1707 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001708
1709 /* Note: size will always be longer than the resulting Unicode
1710 character count */
1711 unicode = _PyUnicode_New(size);
1712 if (!unicode)
1713 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001714 if (size == 0) {
1715 if (consumed)
1716 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001717 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001718 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001719
1720 /* Unpack UTF-8 encoded data */
1721 p = unicode->str;
1722 e = s + size;
1723
1724 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001725 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001726
1727 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001728 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001729 s++;
1730 continue;
1731 }
1732
1733 n = utf8_code_length[ch];
1734
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001735 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001736 if (consumed)
1737 break;
1738 else {
1739 errmsg = "unexpected end of data";
1740 startinpos = s-starts;
1741 endinpos = size;
1742 goto utf8Error;
1743 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001744 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001745
1746 switch (n) {
1747
1748 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001749 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001750 startinpos = s-starts;
1751 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001752 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001753
1754 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001755 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001756 startinpos = s-starts;
1757 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001758 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001759
1760 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001761 if ((s[1] & 0xc0) != 0x80) {
1762 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001763 startinpos = s-starts;
1764 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001765 goto utf8Error;
1766 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001767 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001768 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001769 startinpos = s-starts;
1770 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001771 errmsg = "illegal encoding";
1772 goto utf8Error;
1773 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001774 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001775 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001776 break;
1777
1778 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001779 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001780 (s[2] & 0xc0) != 0x80) {
1781 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001782 startinpos = s-starts;
1783 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001784 goto utf8Error;
1785 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001786 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001787 if (ch < 0x0800) {
1788 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001789 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001790
1791 XXX For wide builds (UCS-4) we should probably try
1792 to recombine the surrogates into a single code
1793 unit.
1794 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001795 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001796 startinpos = s-starts;
1797 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001798 goto utf8Error;
1799 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001800 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001801 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001802 break;
1803
1804 case 4:
1805 if ((s[1] & 0xc0) != 0x80 ||
1806 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001807 (s[3] & 0xc0) != 0x80) {
1808 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001809 startinpos = s-starts;
1810 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001811 goto utf8Error;
1812 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001813 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1814 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1815 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001816 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001817 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001818 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001819 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001820 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001821 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001822 startinpos = s-starts;
1823 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001824 goto utf8Error;
1825 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001826#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001827 *p++ = (Py_UNICODE)ch;
1828#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001829 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001830
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001831 /* translate from 10000..10FFFF to 0..FFFF */
1832 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001833
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001834 /* high surrogate = top 10 bits added to D800 */
1835 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001836
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001837 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001838 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001839#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001840 break;
1841
1842 default:
1843 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001844 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001845 startinpos = s-starts;
1846 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001847 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001848 }
1849 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001850 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001851
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001852 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001853 outpos = p-PyUnicode_AS_UNICODE(unicode);
1854 if (unicode_decode_call_errorhandler(
1855 errors, &errorHandler,
1856 "utf8", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001857 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001858 (PyObject **)&unicode, &outpos, &p))
1859 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001860 }
Walter Dörwald69652032004-09-07 20:24:22 +00001861 if (consumed)
1862 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001863
1864 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001865 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001866 goto onError;
1867
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001868 Py_XDECREF(errorHandler);
1869 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001870 return (PyObject *)unicode;
1871
1872onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001873 Py_XDECREF(errorHandler);
1874 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001875 Py_DECREF(unicode);
1876 return NULL;
1877}
1878
Tim Peters602f7402002-04-27 18:03:26 +00001879/* Allocation strategy: if the string is short, convert into a stack buffer
1880 and allocate exactly as much space needed at the end. Else allocate the
1881 maximum possible needed (4 result bytes per Unicode character), and return
1882 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001883*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001884PyObject *
1885PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001886 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001887 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001888{
Tim Peters602f7402002-04-27 18:03:26 +00001889#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001890
Martin v. Löwis18e16552006-02-15 17:27:45 +00001891 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001892 PyObject *v; /* result string object */
1893 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001894 Py_ssize_t nallocated; /* number of result bytes allocated */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001895 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001896 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001897
Tim Peters602f7402002-04-27 18:03:26 +00001898 assert(s != NULL);
1899 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001900
Tim Peters602f7402002-04-27 18:03:26 +00001901 if (size <= MAX_SHORT_UNICHARS) {
1902 /* Write into the stack buffer; nallocated can't overflow.
1903 * At the end, we'll allocate exactly as much heap space as it
1904 * turns out we need.
1905 */
1906 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1907 v = NULL; /* will allocate after we're done */
1908 p = stackbuf;
1909 }
1910 else {
1911 /* Overallocate on the heap, and give the excess back at the end. */
1912 nallocated = size * 4;
1913 if (nallocated / 4 != size) /* overflow! */
1914 return PyErr_NoMemory();
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001915 v = PyBytes_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00001916 if (v == NULL)
1917 return NULL;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001918 p = PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001919 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001920
Tim Peters602f7402002-04-27 18:03:26 +00001921 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001922 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001923
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001924 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001925 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001926 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001927
Guido van Rossumd57fd912000-03-10 22:53:23 +00001928 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001929 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001930 *p++ = (char)(0xc0 | (ch >> 6));
1931 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001932 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001933 else {
Tim Peters602f7402002-04-27 18:03:26 +00001934 /* Encode UCS2 Unicode ordinals */
1935 if (ch < 0x10000) {
1936 /* Special case: check for high surrogate */
1937 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1938 Py_UCS4 ch2 = s[i];
1939 /* Check for low surrogate and combine the two to
1940 form a UCS4 value */
1941 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001942 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001943 i++;
1944 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001945 }
Tim Peters602f7402002-04-27 18:03:26 +00001946 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001947 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001948 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001949 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1950 *p++ = (char)(0x80 | (ch & 0x3f));
1951 continue;
1952 }
1953encodeUCS4:
1954 /* Encode UCS4 Unicode ordinals */
1955 *p++ = (char)(0xf0 | (ch >> 18));
1956 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1957 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1958 *p++ = (char)(0x80 | (ch & 0x3f));
1959 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001960 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001961
Tim Peters602f7402002-04-27 18:03:26 +00001962 if (v == NULL) {
1963 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001964 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001965 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001966 v = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001967 }
1968 else {
1969 /* Cut back to size actually needed. */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001970 nneeded = p - PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001971 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001972 PyBytes_Resize(v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001973 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001974 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001975
Tim Peters602f7402002-04-27 18:03:26 +00001976#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001977}
1978
Guido van Rossumd57fd912000-03-10 22:53:23 +00001979PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1980{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001981 if (!PyUnicode_Check(unicode)) {
1982 PyErr_BadArgument();
1983 return NULL;
1984 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001985 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1986 PyUnicode_GET_SIZE(unicode),
1987 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001988}
1989
Walter Dörwald41980ca2007-08-16 21:55:45 +00001990/* --- UTF-32 Codec ------------------------------------------------------- */
1991
1992PyObject *
1993PyUnicode_DecodeUTF32(const char *s,
1994 Py_ssize_t size,
1995 const char *errors,
1996 int *byteorder)
1997{
1998 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
1999}
2000
2001PyObject *
2002PyUnicode_DecodeUTF32Stateful(const char *s,
2003 Py_ssize_t size,
2004 const char *errors,
2005 int *byteorder,
2006 Py_ssize_t *consumed)
2007{
2008 const char *starts = s;
2009 Py_ssize_t startinpos;
2010 Py_ssize_t endinpos;
2011 Py_ssize_t outpos;
2012 PyUnicodeObject *unicode;
2013 Py_UNICODE *p;
2014#ifndef Py_UNICODE_WIDE
2015 int i, pairs;
2016#else
2017 const int pairs = 0;
2018#endif
2019 const unsigned char *q, *e;
2020 int bo = 0; /* assume native ordering by default */
2021 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002022 /* Offsets from q for retrieving bytes in the right order. */
2023#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2024 int iorder[] = {0, 1, 2, 3};
2025#else
2026 int iorder[] = {3, 2, 1, 0};
2027#endif
2028 PyObject *errorHandler = NULL;
2029 PyObject *exc = NULL;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002030 /* On narrow builds we split characters outside the BMP into two
2031 codepoints => count how much extra space we need. */
2032#ifndef Py_UNICODE_WIDE
2033 for (i = pairs = 0; i < size/4; i++)
2034 if (((Py_UCS4 *)s)[i] >= 0x10000)
2035 pairs++;
2036#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00002037
2038 /* This might be one to much, because of a BOM */
2039 unicode = _PyUnicode_New((size+3)/4+pairs);
2040 if (!unicode)
2041 return NULL;
2042 if (size == 0)
2043 return (PyObject *)unicode;
2044
2045 /* Unpack UTF-32 encoded data */
2046 p = unicode->str;
2047 q = (unsigned char *)s;
2048 e = q + size;
2049
2050 if (byteorder)
2051 bo = *byteorder;
2052
2053 /* Check for BOM marks (U+FEFF) in the input and adjust current
2054 byte order setting accordingly. In native mode, the leading BOM
2055 mark is skipped, in all other modes, it is copied to the output
2056 stream as-is (giving a ZWNBSP character). */
2057 if (bo == 0) {
2058 if (size >= 4) {
2059 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2060 (q[iorder[1]] << 8) | q[iorder[0]];
2061#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2062 if (bom == 0x0000FEFF) {
2063 q += 4;
2064 bo = -1;
2065 }
2066 else if (bom == 0xFFFE0000) {
2067 q += 4;
2068 bo = 1;
2069 }
2070#else
2071 if (bom == 0x0000FEFF) {
2072 q += 4;
2073 bo = 1;
2074 }
2075 else if (bom == 0xFFFE0000) {
2076 q += 4;
2077 bo = -1;
2078 }
2079#endif
2080 }
2081 }
2082
2083 if (bo == -1) {
2084 /* force LE */
2085 iorder[0] = 0;
2086 iorder[1] = 1;
2087 iorder[2] = 2;
2088 iorder[3] = 3;
2089 }
2090 else if (bo == 1) {
2091 /* force BE */
2092 iorder[0] = 3;
2093 iorder[1] = 2;
2094 iorder[2] = 1;
2095 iorder[3] = 0;
2096 }
2097
2098 while (q < e) {
2099 Py_UCS4 ch;
2100 /* remaining bytes at the end? (size should be divisible by 4) */
2101 if (e-q<4) {
2102 if (consumed)
2103 break;
2104 errmsg = "truncated data";
2105 startinpos = ((const char *)q)-starts;
2106 endinpos = ((const char *)e)-starts;
2107 goto utf32Error;
2108 /* The remaining input chars are ignored if the callback
2109 chooses to skip the input */
2110 }
2111 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2112 (q[iorder[1]] << 8) | q[iorder[0]];
2113
2114 if (ch >= 0x110000)
2115 {
2116 errmsg = "codepoint not in range(0x110000)";
2117 startinpos = ((const char *)q)-starts;
2118 endinpos = startinpos+4;
2119 goto utf32Error;
2120 }
2121#ifndef Py_UNICODE_WIDE
2122 if (ch >= 0x10000)
2123 {
2124 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2125 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2126 }
2127 else
2128#endif
2129 *p++ = ch;
2130 q += 4;
2131 continue;
2132 utf32Error:
2133 outpos = p-PyUnicode_AS_UNICODE(unicode);
2134 if (unicode_decode_call_errorhandler(
2135 errors, &errorHandler,
2136 "utf32", errmsg,
2137 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2138 (PyObject **)&unicode, &outpos, &p))
2139 goto onError;
2140 }
2141
2142 if (byteorder)
2143 *byteorder = bo;
2144
2145 if (consumed)
2146 *consumed = (const char *)q-starts;
2147
2148 /* Adjust length */
2149 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2150 goto onError;
2151
2152 Py_XDECREF(errorHandler);
2153 Py_XDECREF(exc);
2154 return (PyObject *)unicode;
2155
2156onError:
2157 Py_DECREF(unicode);
2158 Py_XDECREF(errorHandler);
2159 Py_XDECREF(exc);
2160 return NULL;
2161}
2162
2163PyObject *
2164PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2165 Py_ssize_t size,
2166 const char *errors,
2167 int byteorder)
2168{
2169 PyObject *v;
2170 unsigned char *p;
2171#ifndef Py_UNICODE_WIDE
2172 int i, pairs;
2173#else
2174 const int pairs = 0;
2175#endif
2176 /* Offsets from p for storing byte pairs in the right order. */
2177#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2178 int iorder[] = {0, 1, 2, 3};
2179#else
2180 int iorder[] = {3, 2, 1, 0};
2181#endif
2182
2183#define STORECHAR(CH) \
2184 do { \
2185 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2186 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2187 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2188 p[iorder[0]] = (CH) & 0xff; \
2189 p += 4; \
2190 } while(0)
2191
2192 /* In narrow builds we can output surrogate pairs as one codepoint,
2193 so we need less space. */
2194#ifndef Py_UNICODE_WIDE
2195 for (i = pairs = 0; i < size-1; i++)
2196 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2197 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2198 pairs++;
2199#endif
2200 v = PyBytes_FromStringAndSize(NULL,
2201 4 * (size - pairs + (byteorder == 0)));
2202 if (v == NULL)
2203 return NULL;
2204
2205 p = (unsigned char *)PyBytes_AS_STRING(v);
2206 if (byteorder == 0)
2207 STORECHAR(0xFEFF);
2208 if (size == 0)
2209 return v;
2210
2211 if (byteorder == -1) {
2212 /* force LE */
2213 iorder[0] = 0;
2214 iorder[1] = 1;
2215 iorder[2] = 2;
2216 iorder[3] = 3;
2217 }
2218 else if (byteorder == 1) {
2219 /* force BE */
2220 iorder[0] = 3;
2221 iorder[1] = 2;
2222 iorder[2] = 1;
2223 iorder[3] = 0;
2224 }
2225
2226 while (size-- > 0) {
2227 Py_UCS4 ch = *s++;
2228#ifndef Py_UNICODE_WIDE
2229 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2230 Py_UCS4 ch2 = *s;
2231 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2232 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2233 s++;
2234 size--;
2235 }
2236 }
2237#endif
2238 STORECHAR(ch);
2239 }
2240 return v;
2241#undef STORECHAR
2242}
2243
2244PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2245{
2246 if (!PyUnicode_Check(unicode)) {
2247 PyErr_BadArgument();
2248 return NULL;
2249 }
2250 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2251 PyUnicode_GET_SIZE(unicode),
2252 NULL,
2253 0);
2254}
2255
Guido van Rossumd57fd912000-03-10 22:53:23 +00002256/* --- UTF-16 Codec ------------------------------------------------------- */
2257
Tim Peters772747b2001-08-09 22:21:55 +00002258PyObject *
2259PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002260 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002261 const char *errors,
2262 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002263{
Walter Dörwald69652032004-09-07 20:24:22 +00002264 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2265}
2266
2267PyObject *
2268PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002269 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002270 const char *errors,
2271 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002272 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002273{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002274 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002275 Py_ssize_t startinpos;
2276 Py_ssize_t endinpos;
2277 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002278 PyUnicodeObject *unicode;
2279 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002280 const unsigned char *q, *e;
2281 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002282 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002283 /* Offsets from q for retrieving byte pairs in the right order. */
2284#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2285 int ihi = 1, ilo = 0;
2286#else
2287 int ihi = 0, ilo = 1;
2288#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002289 PyObject *errorHandler = NULL;
2290 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002291
2292 /* Note: size will always be longer than the resulting Unicode
2293 character count */
2294 unicode = _PyUnicode_New(size);
2295 if (!unicode)
2296 return NULL;
2297 if (size == 0)
2298 return (PyObject *)unicode;
2299
2300 /* Unpack UTF-16 encoded data */
2301 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002302 q = (unsigned char *)s;
2303 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002304
2305 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002306 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002307
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002308 /* Check for BOM marks (U+FEFF) in the input and adjust current
2309 byte order setting accordingly. In native mode, the leading BOM
2310 mark is skipped, in all other modes, it is copied to the output
2311 stream as-is (giving a ZWNBSP character). */
2312 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002313 if (size >= 2) {
2314 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002315#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002316 if (bom == 0xFEFF) {
2317 q += 2;
2318 bo = -1;
2319 }
2320 else if (bom == 0xFFFE) {
2321 q += 2;
2322 bo = 1;
2323 }
Tim Petersced69f82003-09-16 20:30:58 +00002324#else
Walter Dörwald69652032004-09-07 20:24:22 +00002325 if (bom == 0xFEFF) {
2326 q += 2;
2327 bo = 1;
2328 }
2329 else if (bom == 0xFFFE) {
2330 q += 2;
2331 bo = -1;
2332 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002333#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002334 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002335 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002336
Tim Peters772747b2001-08-09 22:21:55 +00002337 if (bo == -1) {
2338 /* force LE */
2339 ihi = 1;
2340 ilo = 0;
2341 }
2342 else if (bo == 1) {
2343 /* force BE */
2344 ihi = 0;
2345 ilo = 1;
2346 }
2347
2348 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002349 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002350 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002351 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002352 if (consumed)
2353 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002354 errmsg = "truncated data";
2355 startinpos = ((const char *)q)-starts;
2356 endinpos = ((const char *)e)-starts;
2357 goto utf16Error;
2358 /* The remaining input chars are ignored if the callback
2359 chooses to skip the input */
2360 }
2361 ch = (q[ihi] << 8) | q[ilo];
2362
Tim Peters772747b2001-08-09 22:21:55 +00002363 q += 2;
2364
Guido van Rossumd57fd912000-03-10 22:53:23 +00002365 if (ch < 0xD800 || ch > 0xDFFF) {
2366 *p++ = ch;
2367 continue;
2368 }
2369
2370 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002371 if (q >= e) {
2372 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002373 startinpos = (((const char *)q)-2)-starts;
2374 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002375 goto utf16Error;
2376 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002377 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002378 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2379 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002380 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002381#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002382 *p++ = ch;
2383 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002384#else
2385 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002386#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002387 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002388 }
2389 else {
2390 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002391 startinpos = (((const char *)q)-4)-starts;
2392 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002393 goto utf16Error;
2394 }
2395
Guido van Rossumd57fd912000-03-10 22:53:23 +00002396 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002397 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002398 startinpos = (((const char *)q)-2)-starts;
2399 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002400 /* Fall through to report the error */
2401
2402 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002403 outpos = p-PyUnicode_AS_UNICODE(unicode);
2404 if (unicode_decode_call_errorhandler(
2405 errors, &errorHandler,
2406 "utf16", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002407 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002408 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002409 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002410 }
2411
2412 if (byteorder)
2413 *byteorder = bo;
2414
Walter Dörwald69652032004-09-07 20:24:22 +00002415 if (consumed)
2416 *consumed = (const char *)q-starts;
2417
Guido van Rossumd57fd912000-03-10 22:53:23 +00002418 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002419 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002420 goto onError;
2421
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002422 Py_XDECREF(errorHandler);
2423 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002424 return (PyObject *)unicode;
2425
2426onError:
2427 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002428 Py_XDECREF(errorHandler);
2429 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002430 return NULL;
2431}
2432
Tim Peters772747b2001-08-09 22:21:55 +00002433PyObject *
2434PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002435 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002436 const char *errors,
2437 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002438{
2439 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002440 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002441#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00002442 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002443#else
2444 const int pairs = 0;
2445#endif
Tim Peters772747b2001-08-09 22:21:55 +00002446 /* Offsets from p for storing byte pairs in the right order. */
2447#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2448 int ihi = 1, ilo = 0;
2449#else
2450 int ihi = 0, ilo = 1;
2451#endif
2452
2453#define STORECHAR(CH) \
2454 do { \
2455 p[ihi] = ((CH) >> 8) & 0xff; \
2456 p[ilo] = (CH) & 0xff; \
2457 p += 2; \
2458 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002459
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002460#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002461 for (i = pairs = 0; i < size; i++)
2462 if (s[i] >= 0x10000)
2463 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002464#endif
Walter Dörwald3cc34522007-05-04 10:48:27 +00002465 v = PyBytes_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002466 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002467 if (v == NULL)
2468 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002469
Walter Dörwald3cc34522007-05-04 10:48:27 +00002470 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002471 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002472 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002473 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002474 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002475
2476 if (byteorder == -1) {
2477 /* force LE */
2478 ihi = 1;
2479 ilo = 0;
2480 }
2481 else if (byteorder == 1) {
2482 /* force BE */
2483 ihi = 0;
2484 ilo = 1;
2485 }
2486
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002487 while (size-- > 0) {
2488 Py_UNICODE ch = *s++;
2489 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002490#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002491 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002492 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2493 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002494 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002495#endif
Tim Peters772747b2001-08-09 22:21:55 +00002496 STORECHAR(ch);
2497 if (ch2)
2498 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002499 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002500 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002501#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002502}
2503
2504PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2505{
2506 if (!PyUnicode_Check(unicode)) {
2507 PyErr_BadArgument();
2508 return NULL;
2509 }
2510 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2511 PyUnicode_GET_SIZE(unicode),
2512 NULL,
2513 0);
2514}
2515
2516/* --- Unicode Escape Codec ----------------------------------------------- */
2517
Fredrik Lundh06d12682001-01-24 07:59:11 +00002518static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002519
Guido van Rossumd57fd912000-03-10 22:53:23 +00002520PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002521 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002522 const char *errors)
2523{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002524 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002525 Py_ssize_t startinpos;
2526 Py_ssize_t endinpos;
2527 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002528 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002529 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002530 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002531 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002532 char* message;
2533 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002534 PyObject *errorHandler = NULL;
2535 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002536
Guido van Rossumd57fd912000-03-10 22:53:23 +00002537 /* Escaped strings will always be longer than the resulting
2538 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002539 length after conversion to the true value.
2540 (but if the error callback returns a long replacement string
2541 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002542 v = _PyUnicode_New(size);
2543 if (v == NULL)
2544 goto onError;
2545 if (size == 0)
2546 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002547
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002548 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002549 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002550
Guido van Rossumd57fd912000-03-10 22:53:23 +00002551 while (s < end) {
2552 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002553 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002554 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002555
2556 /* Non-escape characters are interpreted as Unicode ordinals */
2557 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002558 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002559 continue;
2560 }
2561
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002562 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002563 /* \ - Escapes */
2564 s++;
2565 switch (*s++) {
2566
2567 /* \x escapes */
2568 case '\n': break;
2569 case '\\': *p++ = '\\'; break;
2570 case '\'': *p++ = '\''; break;
2571 case '\"': *p++ = '\"'; break;
2572 case 'b': *p++ = '\b'; break;
2573 case 'f': *p++ = '\014'; break; /* FF */
2574 case 't': *p++ = '\t'; break;
2575 case 'n': *p++ = '\n'; break;
2576 case 'r': *p++ = '\r'; break;
2577 case 'v': *p++ = '\013'; break; /* VT */
2578 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2579
2580 /* \OOO (octal) escapes */
2581 case '0': case '1': case '2': case '3':
2582 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002583 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002584 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002585 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002586 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002587 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002588 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002589 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002590 break;
2591
Fredrik Lundhccc74732001-02-18 22:13:49 +00002592 /* hex escapes */
2593 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002594 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002595 digits = 2;
2596 message = "truncated \\xXX escape";
2597 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002598
Fredrik Lundhccc74732001-02-18 22:13:49 +00002599 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002600 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002601 digits = 4;
2602 message = "truncated \\uXXXX escape";
2603 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002604
Fredrik Lundhccc74732001-02-18 22:13:49 +00002605 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002606 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002607 digits = 8;
2608 message = "truncated \\UXXXXXXXX escape";
2609 hexescape:
2610 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002611 outpos = p-PyUnicode_AS_UNICODE(v);
2612 if (s+digits>end) {
2613 endinpos = size;
2614 if (unicode_decode_call_errorhandler(
2615 errors, &errorHandler,
2616 "unicodeescape", "end of string in escape sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002617 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002618 (PyObject **)&v, &outpos, &p))
2619 goto onError;
2620 goto nextByte;
2621 }
2622 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002623 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002624 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002625 endinpos = (s+i+1)-starts;
2626 if (unicode_decode_call_errorhandler(
2627 errors, &errorHandler,
2628 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002629 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002630 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002631 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002632 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002633 }
2634 chr = (chr<<4) & ~0xF;
2635 if (c >= '0' && c <= '9')
2636 chr += c - '0';
2637 else if (c >= 'a' && c <= 'f')
2638 chr += 10 + c - 'a';
2639 else
2640 chr += 10 + c - 'A';
2641 }
2642 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002643 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002644 /* _decoding_error will have already written into the
2645 target buffer. */
2646 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002647 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002648 /* when we get here, chr is a 32-bit unicode character */
2649 if (chr <= 0xffff)
2650 /* UCS-2 character */
2651 *p++ = (Py_UNICODE) chr;
2652 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002653 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002654 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002655#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002656 *p++ = chr;
2657#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002658 chr -= 0x10000L;
2659 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002660 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002661#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002662 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002663 endinpos = s-starts;
2664 outpos = p-PyUnicode_AS_UNICODE(v);
2665 if (unicode_decode_call_errorhandler(
2666 errors, &errorHandler,
2667 "unicodeescape", "illegal Unicode character",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002668 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002669 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002670 goto onError;
2671 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002672 break;
2673
2674 /* \N{name} */
2675 case 'N':
2676 message = "malformed \\N character escape";
2677 if (ucnhash_CAPI == NULL) {
2678 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002679 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002680 m = PyImport_ImportModule("unicodedata");
2681 if (m == NULL)
2682 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002683 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002684 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002685 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002686 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002687 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002688 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002689 if (ucnhash_CAPI == NULL)
2690 goto ucnhashError;
2691 }
2692 if (*s == '{') {
2693 const char *start = s+1;
2694 /* look for the closing brace */
2695 while (*s != '}' && s < end)
2696 s++;
2697 if (s > start && s < end && *s == '}') {
2698 /* found a name. look it up in the unicode database */
2699 message = "unknown Unicode character name";
2700 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002701 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002702 goto store;
2703 }
2704 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002705 endinpos = s-starts;
2706 outpos = p-PyUnicode_AS_UNICODE(v);
2707 if (unicode_decode_call_errorhandler(
2708 errors, &errorHandler,
2709 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002710 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002711 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002712 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002713 break;
2714
2715 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002716 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002717 message = "\\ at end of string";
2718 s--;
2719 endinpos = s-starts;
2720 outpos = p-PyUnicode_AS_UNICODE(v);
2721 if (unicode_decode_call_errorhandler(
2722 errors, &errorHandler,
2723 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002724 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002725 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002726 goto onError;
2727 }
2728 else {
2729 *p++ = '\\';
2730 *p++ = (unsigned char)s[-1];
2731 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002732 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002733 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002734 nextByte:
2735 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002736 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002737 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002738 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002739 Py_XDECREF(errorHandler);
2740 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002741 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002742
Fredrik Lundhccc74732001-02-18 22:13:49 +00002743ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002744 PyErr_SetString(
2745 PyExc_UnicodeError,
2746 "\\N escapes not supported (can't load unicodedata module)"
2747 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002748 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002749 Py_XDECREF(errorHandler);
2750 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002751 return NULL;
2752
Fredrik Lundhccc74732001-02-18 22:13:49 +00002753onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002754 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002755 Py_XDECREF(errorHandler);
2756 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002757 return NULL;
2758}
2759
2760/* Return a Unicode-Escape string version of the Unicode object.
2761
2762 If quotes is true, the string is enclosed in u"" or u'' quotes as
2763 appropriate.
2764
2765*/
2766
Thomas Wouters477c8d52006-05-27 19:21:47 +00002767Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2768 Py_ssize_t size,
2769 Py_UNICODE ch)
2770{
2771 /* like wcschr, but doesn't stop at NULL characters */
2772
2773 while (size-- > 0) {
2774 if (*s == ch)
2775 return s;
2776 s++;
2777 }
2778
2779 return NULL;
2780}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002781
Walter Dörwald79e913e2007-05-12 11:08:06 +00002782static const char *hexdigits = "0123456789abcdef";
2783
2784PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2785 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002786{
2787 PyObject *repr;
2788 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002789
Thomas Wouters89f507f2006-12-13 04:49:30 +00002790 /* XXX(nnorwitz): rather than over-allocating, it would be
2791 better to choose a different scheme. Perhaps scan the
2792 first N-chars of the string and allocate based on that size.
2793 */
2794 /* Initial allocation is based on the longest-possible unichr
2795 escape.
2796
2797 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2798 unichr, so in this case it's the longest unichr escape. In
2799 narrow (UTF-16) builds this is five chars per source unichr
2800 since there are two unichrs in the surrogate pair, so in narrow
2801 (UTF-16) builds it's not the longest unichr escape.
2802
2803 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2804 so in the narrow (UTF-16) build case it's the longest unichr
2805 escape.
2806 */
2807
Walter Dörwald79e913e2007-05-12 11:08:06 +00002808 repr = PyBytes_FromStringAndSize(NULL,
Thomas Wouters89f507f2006-12-13 04:49:30 +00002809#ifdef Py_UNICODE_WIDE
2810 + 10*size
2811#else
2812 + 6*size
2813#endif
2814 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002815 if (repr == NULL)
2816 return NULL;
2817
Walter Dörwald79e913e2007-05-12 11:08:06 +00002818 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002819
Guido van Rossumd57fd912000-03-10 22:53:23 +00002820 while (size-- > 0) {
2821 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002822
Walter Dörwald79e913e2007-05-12 11:08:06 +00002823 /* Escape backslashes */
2824 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002825 *p++ = '\\';
2826 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00002827 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002828 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002829
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002830#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002831 /* Map 21-bit characters to '\U00xxxxxx' */
2832 else if (ch >= 0x10000) {
2833 *p++ = '\\';
2834 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002835 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
2836 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
2837 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
2838 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
2839 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
2840 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
2841 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
2842 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002843 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002844 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002845#else
2846 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002847 else if (ch >= 0xD800 && ch < 0xDC00) {
2848 Py_UNICODE ch2;
2849 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002850
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002851 ch2 = *s++;
2852 size--;
2853 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2854 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2855 *p++ = '\\';
2856 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002857 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
2858 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
2859 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
2860 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
2861 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
2862 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
2863 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
2864 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002865 continue;
2866 }
2867 /* Fall through: isolated surrogates are copied as-is */
2868 s--;
2869 size++;
2870 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002871#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002872
Guido van Rossumd57fd912000-03-10 22:53:23 +00002873 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002874 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002875 *p++ = '\\';
2876 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002877 *p++ = hexdigits[(ch >> 12) & 0x000F];
2878 *p++ = hexdigits[(ch >> 8) & 0x000F];
2879 *p++ = hexdigits[(ch >> 4) & 0x000F];
2880 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002881 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002882
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002883 /* Map special whitespace to '\t', \n', '\r' */
2884 else if (ch == '\t') {
2885 *p++ = '\\';
2886 *p++ = 't';
2887 }
2888 else if (ch == '\n') {
2889 *p++ = '\\';
2890 *p++ = 'n';
2891 }
2892 else if (ch == '\r') {
2893 *p++ = '\\';
2894 *p++ = 'r';
2895 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002896
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002897 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002898 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002899 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002900 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002901 *p++ = hexdigits[(ch >> 4) & 0x000F];
2902 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002903 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002904
Guido van Rossumd57fd912000-03-10 22:53:23 +00002905 /* Copy everything else as-is */
2906 else
2907 *p++ = (char) ch;
2908 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002909
2910 *p = '\0';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002911 if (PyBytes_Resize(repr, p - PyBytes_AS_STRING(repr))) {
2912 Py_DECREF(repr);
2913 return NULL;
2914 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002915 return repr;
2916}
2917
Guido van Rossumd57fd912000-03-10 22:53:23 +00002918PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2919{
Walter Dörwald79e913e2007-05-12 11:08:06 +00002920 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002921 if (!PyUnicode_Check(unicode)) {
2922 PyErr_BadArgument();
2923 return NULL;
2924 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00002925 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2926 PyUnicode_GET_SIZE(unicode));
2927
2928 if (!s)
2929 return NULL;
2930 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2931 PyBytes_GET_SIZE(s));
2932 Py_DECREF(s);
2933 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002934}
2935
2936/* --- Raw Unicode Escape Codec ------------------------------------------- */
2937
2938PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002939 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002940 const char *errors)
2941{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002942 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002943 Py_ssize_t startinpos;
2944 Py_ssize_t endinpos;
2945 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002946 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002947 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002948 const char *end;
2949 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002950 PyObject *errorHandler = NULL;
2951 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002952
Guido van Rossumd57fd912000-03-10 22:53:23 +00002953 /* Escaped strings will always be longer than the resulting
2954 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002955 length after conversion to the true value. (But decoding error
2956 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002957 v = _PyUnicode_New(size);
2958 if (v == NULL)
2959 goto onError;
2960 if (size == 0)
2961 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002962 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002963 end = s + size;
2964 while (s < end) {
2965 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002966 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002967 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002968 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002969
2970 /* Non-escape characters are interpreted as Unicode ordinals */
2971 if (*s != '\\') {
2972 *p++ = (unsigned char)*s++;
2973 continue;
2974 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002975 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002976
2977 /* \u-escapes are only interpreted iff the number of leading
2978 backslashes if odd */
2979 bs = s;
2980 for (;s < end;) {
2981 if (*s != '\\')
2982 break;
2983 *p++ = (unsigned char)*s++;
2984 }
2985 if (((s - bs) & 1) == 0 ||
2986 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002987 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002988 continue;
2989 }
2990 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002991 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002992 s++;
2993
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002994 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002995 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002996 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002997 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002998 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002999 endinpos = s-starts;
3000 if (unicode_decode_call_errorhandler(
3001 errors, &errorHandler,
3002 "rawunicodeescape", "truncated \\uXXXX",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003003 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003004 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003005 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003006 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003007 }
3008 x = (x<<4) & ~0xF;
3009 if (c >= '0' && c <= '9')
3010 x += c - '0';
3011 else if (c >= 'a' && c <= 'f')
3012 x += 10 + c - 'a';
3013 else
3014 x += 10 + c - 'A';
3015 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003016#ifndef Py_UNICODE_WIDE
3017 if (x > 0x10000) {
3018 if (unicode_decode_call_errorhandler(
3019 errors, &errorHandler,
3020 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003021 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003022 (PyObject **)&v, &outpos, &p))
3023 goto onError;
3024 }
3025#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003026 *p++ = x;
3027 nextByte:
3028 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003029 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003030 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003031 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003032 Py_XDECREF(errorHandler);
3033 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003034 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003035
Guido van Rossumd57fd912000-03-10 22:53:23 +00003036 onError:
3037 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003038 Py_XDECREF(errorHandler);
3039 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003040 return NULL;
3041}
3042
3043PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003044 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003045{
3046 PyObject *repr;
3047 char *p;
3048 char *q;
3049
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003050#ifdef Py_UNICODE_WIDE
Walter Dörwald711005d2007-05-12 12:03:26 +00003051 repr = PyBytes_FromStringAndSize(NULL, 10 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003052#else
Walter Dörwald711005d2007-05-12 12:03:26 +00003053 repr = PyBytes_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003054#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003055 if (repr == NULL)
3056 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003057 if (size == 0)
3058 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003059
Walter Dörwald711005d2007-05-12 12:03:26 +00003060 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003061 while (size-- > 0) {
3062 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003063#ifdef Py_UNICODE_WIDE
3064 /* Map 32-bit characters to '\Uxxxxxxxx' */
3065 if (ch >= 0x10000) {
3066 *p++ = '\\';
3067 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003068 *p++ = hexdigits[(ch >> 28) & 0xf];
3069 *p++ = hexdigits[(ch >> 24) & 0xf];
3070 *p++ = hexdigits[(ch >> 20) & 0xf];
3071 *p++ = hexdigits[(ch >> 16) & 0xf];
3072 *p++ = hexdigits[(ch >> 12) & 0xf];
3073 *p++ = hexdigits[(ch >> 8) & 0xf];
3074 *p++ = hexdigits[(ch >> 4) & 0xf];
3075 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003076 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003077 else
3078#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003079 /* Map 16-bit characters to '\uxxxx' */
3080 if (ch >= 256) {
3081 *p++ = '\\';
3082 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003083 *p++ = hexdigits[(ch >> 12) & 0xf];
3084 *p++ = hexdigits[(ch >> 8) & 0xf];
3085 *p++ = hexdigits[(ch >> 4) & 0xf];
3086 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003087 }
3088 /* Copy everything else as-is */
3089 else
3090 *p++ = (char) ch;
3091 }
3092 *p = '\0';
Walter Dörwald711005d2007-05-12 12:03:26 +00003093 if (PyBytes_Resize(repr, p - q)) {
3094 Py_DECREF(repr);
3095 return NULL;
3096 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003097 return repr;
3098}
3099
3100PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3101{
Walter Dörwald711005d2007-05-12 12:03:26 +00003102 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003103 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003104 PyErr_BadArgument();
3105 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003106 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003107 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3108 PyUnicode_GET_SIZE(unicode));
3109
3110 if (!s)
3111 return NULL;
3112 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
3113 PyBytes_GET_SIZE(s));
3114 Py_DECREF(s);
3115 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003116}
3117
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003118/* --- Unicode Internal Codec ------------------------------------------- */
3119
3120PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003121 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003122 const char *errors)
3123{
3124 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003125 Py_ssize_t startinpos;
3126 Py_ssize_t endinpos;
3127 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003128 PyUnicodeObject *v;
3129 Py_UNICODE *p;
3130 const char *end;
3131 const char *reason;
3132 PyObject *errorHandler = NULL;
3133 PyObject *exc = NULL;
3134
Neal Norwitzd43069c2006-01-08 01:12:10 +00003135#ifdef Py_UNICODE_WIDE
3136 Py_UNICODE unimax = PyUnicode_GetMax();
3137#endif
3138
Thomas Wouters89f507f2006-12-13 04:49:30 +00003139 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003140 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3141 if (v == NULL)
3142 goto onError;
3143 if (PyUnicode_GetSize((PyObject *)v) == 0)
3144 return (PyObject *)v;
3145 p = PyUnicode_AS_UNICODE(v);
3146 end = s + size;
3147
3148 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00003149 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003150 /* We have to sanity check the raw data, otherwise doom looms for
3151 some malformed UCS-4 data. */
3152 if (
3153 #ifdef Py_UNICODE_WIDE
3154 *p > unimax || *p < 0 ||
3155 #endif
3156 end-s < Py_UNICODE_SIZE
3157 )
3158 {
3159 startinpos = s - starts;
3160 if (end-s < Py_UNICODE_SIZE) {
3161 endinpos = end-starts;
3162 reason = "truncated input";
3163 }
3164 else {
3165 endinpos = s - starts + Py_UNICODE_SIZE;
3166 reason = "illegal code point (> 0x10FFFF)";
3167 }
3168 outpos = p - PyUnicode_AS_UNICODE(v);
3169 if (unicode_decode_call_errorhandler(
3170 errors, &errorHandler,
3171 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003172 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003173 (PyObject **)&v, &outpos, &p)) {
3174 goto onError;
3175 }
3176 }
3177 else {
3178 p++;
3179 s += Py_UNICODE_SIZE;
3180 }
3181 }
3182
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003183 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003184 goto onError;
3185 Py_XDECREF(errorHandler);
3186 Py_XDECREF(exc);
3187 return (PyObject *)v;
3188
3189 onError:
3190 Py_XDECREF(v);
3191 Py_XDECREF(errorHandler);
3192 Py_XDECREF(exc);
3193 return NULL;
3194}
3195
Guido van Rossumd57fd912000-03-10 22:53:23 +00003196/* --- Latin-1 Codec ------------------------------------------------------ */
3197
3198PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003199 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003200 const char *errors)
3201{
3202 PyUnicodeObject *v;
3203 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003204
Guido van Rossumd57fd912000-03-10 22:53:23 +00003205 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003206 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003207 Py_UNICODE r = *(unsigned char*)s;
3208 return PyUnicode_FromUnicode(&r, 1);
3209 }
3210
Guido van Rossumd57fd912000-03-10 22:53:23 +00003211 v = _PyUnicode_New(size);
3212 if (v == NULL)
3213 goto onError;
3214 if (size == 0)
3215 return (PyObject *)v;
3216 p = PyUnicode_AS_UNICODE(v);
3217 while (size-- > 0)
3218 *p++ = (unsigned char)*s++;
3219 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003220
Guido van Rossumd57fd912000-03-10 22:53:23 +00003221 onError:
3222 Py_XDECREF(v);
3223 return NULL;
3224}
3225
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003226/* create or adjust a UnicodeEncodeError */
3227static void make_encode_exception(PyObject **exceptionObject,
3228 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003229 const Py_UNICODE *unicode, Py_ssize_t size,
3230 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003231 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003232{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003233 if (*exceptionObject == NULL) {
3234 *exceptionObject = PyUnicodeEncodeError_Create(
3235 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003236 }
3237 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003238 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3239 goto onError;
3240 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3241 goto onError;
3242 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3243 goto onError;
3244 return;
3245 onError:
3246 Py_DECREF(*exceptionObject);
3247 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003248 }
3249}
3250
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003251/* raises a UnicodeEncodeError */
3252static void raise_encode_exception(PyObject **exceptionObject,
3253 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003254 const Py_UNICODE *unicode, Py_ssize_t size,
3255 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003256 const char *reason)
3257{
3258 make_encode_exception(exceptionObject,
3259 encoding, unicode, size, startpos, endpos, reason);
3260 if (*exceptionObject != NULL)
3261 PyCodec_StrictErrors(*exceptionObject);
3262}
3263
3264/* error handling callback helper:
3265 build arguments, call the callback and check the arguments,
3266 put the result into newpos and return the replacement string, which
3267 has to be freed by the caller */
3268static PyObject *unicode_encode_call_errorhandler(const char *errors,
3269 PyObject **errorHandler,
3270 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003271 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3272 Py_ssize_t startpos, Py_ssize_t endpos,
3273 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003274{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003275 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003276
3277 PyObject *restuple;
3278 PyObject *resunicode;
3279
3280 if (*errorHandler == NULL) {
3281 *errorHandler = PyCodec_LookupError(errors);
3282 if (*errorHandler == NULL)
3283 return NULL;
3284 }
3285
3286 make_encode_exception(exceptionObject,
3287 encoding, unicode, size, startpos, endpos, reason);
3288 if (*exceptionObject == NULL)
3289 return NULL;
3290
3291 restuple = PyObject_CallFunctionObjArgs(
3292 *errorHandler, *exceptionObject, NULL);
3293 if (restuple == NULL)
3294 return NULL;
3295 if (!PyTuple_Check(restuple)) {
3296 PyErr_Format(PyExc_TypeError, &argparse[4]);
3297 Py_DECREF(restuple);
3298 return NULL;
3299 }
3300 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3301 &resunicode, newpos)) {
3302 Py_DECREF(restuple);
3303 return NULL;
3304 }
3305 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003306 *newpos = size+*newpos;
3307 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003308 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003309 Py_DECREF(restuple);
3310 return NULL;
3311 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003312 Py_INCREF(resunicode);
3313 Py_DECREF(restuple);
3314 return resunicode;
3315}
3316
3317static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003318 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003319 const char *errors,
3320 int limit)
3321{
3322 /* output object */
3323 PyObject *res;
3324 /* pointers to the beginning and end+1 of input */
3325 const Py_UNICODE *startp = p;
3326 const Py_UNICODE *endp = p + size;
3327 /* pointer to the beginning of the unencodable characters */
3328 /* const Py_UNICODE *badp = NULL; */
3329 /* pointer into the output */
3330 char *str;
3331 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003332 Py_ssize_t respos = 0;
3333 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003334 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3335 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003336 PyObject *errorHandler = NULL;
3337 PyObject *exc = NULL;
3338 /* the following variable is used for caching string comparisons
3339 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3340 int known_errorHandler = -1;
3341
3342 /* allocate enough for a simple encoding without
3343 replacements, if we need more, we'll resize */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003344 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003345 if (res == NULL)
3346 goto onError;
3347 if (size == 0)
3348 return res;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003349 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003350 ressize = size;
3351
3352 while (p<endp) {
3353 Py_UNICODE c = *p;
3354
3355 /* can we encode this? */
3356 if (c<limit) {
3357 /* no overflow check, because we know that the space is enough */
3358 *str++ = (char)c;
3359 ++p;
3360 }
3361 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003362 Py_ssize_t unicodepos = p-startp;
3363 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003364 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003365 Py_ssize_t repsize;
3366 Py_ssize_t newpos;
3367 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003368 Py_UNICODE *uni2;
3369 /* startpos for collecting unencodable chars */
3370 const Py_UNICODE *collstart = p;
3371 const Py_UNICODE *collend = p;
3372 /* find all unecodable characters */
3373 while ((collend < endp) && ((*collend)>=limit))
3374 ++collend;
3375 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3376 if (known_errorHandler==-1) {
3377 if ((errors==NULL) || (!strcmp(errors, "strict")))
3378 known_errorHandler = 1;
3379 else if (!strcmp(errors, "replace"))
3380 known_errorHandler = 2;
3381 else if (!strcmp(errors, "ignore"))
3382 known_errorHandler = 3;
3383 else if (!strcmp(errors, "xmlcharrefreplace"))
3384 known_errorHandler = 4;
3385 else
3386 known_errorHandler = 0;
3387 }
3388 switch (known_errorHandler) {
3389 case 1: /* strict */
3390 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3391 goto onError;
3392 case 2: /* replace */
3393 while (collstart++<collend)
3394 *str++ = '?'; /* fall through */
3395 case 3: /* ignore */
3396 p = collend;
3397 break;
3398 case 4: /* xmlcharrefreplace */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003399 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003400 /* determine replacement size (temporarily (mis)uses p) */
3401 for (p = collstart, repsize = 0; p < collend; ++p) {
3402 if (*p<10)
3403 repsize += 2+1+1;
3404 else if (*p<100)
3405 repsize += 2+2+1;
3406 else if (*p<1000)
3407 repsize += 2+3+1;
3408 else if (*p<10000)
3409 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003410#ifndef Py_UNICODE_WIDE
3411 else
3412 repsize += 2+5+1;
3413#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003414 else if (*p<100000)
3415 repsize += 2+5+1;
3416 else if (*p<1000000)
3417 repsize += 2+6+1;
3418 else
3419 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003420#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003421 }
3422 requiredsize = respos+repsize+(endp-collend);
3423 if (requiredsize > ressize) {
3424 if (requiredsize<2*ressize)
3425 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003426 if (PyBytes_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003427 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003428 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003429 ressize = requiredsize;
3430 }
3431 /* generate replacement (temporarily (mis)uses p) */
3432 for (p = collstart; p < collend; ++p) {
3433 str += sprintf(str, "&#%d;", (int)*p);
3434 }
3435 p = collend;
3436 break;
3437 default:
3438 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3439 encoding, reason, startp, size, &exc,
3440 collstart-startp, collend-startp, &newpos);
3441 if (repunicode == NULL)
3442 goto onError;
3443 /* need more space? (at least enough for what we
3444 have+the replacement+the rest of the string, so
3445 we won't have to check space for encodable characters) */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003446 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003447 repsize = PyUnicode_GET_SIZE(repunicode);
3448 requiredsize = respos+repsize+(endp-collend);
3449 if (requiredsize > ressize) {
3450 if (requiredsize<2*ressize)
3451 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003452 if (PyBytes_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003453 Py_DECREF(repunicode);
3454 goto onError;
3455 }
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003456 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003457 ressize = requiredsize;
3458 }
3459 /* check if there is anything unencodable in the replacement
3460 and copy it to the output */
3461 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3462 c = *uni2;
3463 if (c >= limit) {
3464 raise_encode_exception(&exc, encoding, startp, size,
3465 unicodepos, unicodepos+1, reason);
3466 Py_DECREF(repunicode);
3467 goto onError;
3468 }
3469 *str = (char)c;
3470 }
3471 p = startp + newpos;
3472 Py_DECREF(repunicode);
3473 }
3474 }
3475 }
3476 /* Resize if we allocated to much */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003477 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003478 if (respos<ressize)
3479 /* If this falls res will be NULL */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003480 PyBytes_Resize(res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003481 Py_XDECREF(errorHandler);
3482 Py_XDECREF(exc);
3483 return res;
3484
3485 onError:
3486 Py_XDECREF(res);
3487 Py_XDECREF(errorHandler);
3488 Py_XDECREF(exc);
3489 return NULL;
3490}
3491
Guido van Rossumd57fd912000-03-10 22:53:23 +00003492PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003493 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003494 const char *errors)
3495{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003496 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003497}
3498
3499PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3500{
3501 if (!PyUnicode_Check(unicode)) {
3502 PyErr_BadArgument();
3503 return NULL;
3504 }
3505 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3506 PyUnicode_GET_SIZE(unicode),
3507 NULL);
3508}
3509
3510/* --- 7-bit ASCII Codec -------------------------------------------------- */
3511
Guido van Rossumd57fd912000-03-10 22:53:23 +00003512PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003513 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003514 const char *errors)
3515{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003516 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003517 PyUnicodeObject *v;
3518 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003519 Py_ssize_t startinpos;
3520 Py_ssize_t endinpos;
3521 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003522 const char *e;
3523 PyObject *errorHandler = NULL;
3524 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003525
Guido van Rossumd57fd912000-03-10 22:53:23 +00003526 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003527 if (size == 1 && *(unsigned char*)s < 128) {
3528 Py_UNICODE r = *(unsigned char*)s;
3529 return PyUnicode_FromUnicode(&r, 1);
3530 }
Tim Petersced69f82003-09-16 20:30:58 +00003531
Guido van Rossumd57fd912000-03-10 22:53:23 +00003532 v = _PyUnicode_New(size);
3533 if (v == NULL)
3534 goto onError;
3535 if (size == 0)
3536 return (PyObject *)v;
3537 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003538 e = s + size;
3539 while (s < e) {
3540 register unsigned char c = (unsigned char)*s;
3541 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003542 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003543 ++s;
3544 }
3545 else {
3546 startinpos = s-starts;
3547 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003548 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003549 if (unicode_decode_call_errorhandler(
3550 errors, &errorHandler,
3551 "ascii", "ordinal not in range(128)",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003552 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003553 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003554 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003555 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003556 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00003557 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003558 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003559 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003560 Py_XDECREF(errorHandler);
3561 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003562 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003563
Guido van Rossumd57fd912000-03-10 22:53:23 +00003564 onError:
3565 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003566 Py_XDECREF(errorHandler);
3567 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003568 return NULL;
3569}
3570
Guido van Rossumd57fd912000-03-10 22:53:23 +00003571PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003572 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003573 const char *errors)
3574{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003575 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003576}
3577
3578PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3579{
3580 if (!PyUnicode_Check(unicode)) {
3581 PyErr_BadArgument();
3582 return NULL;
3583 }
3584 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3585 PyUnicode_GET_SIZE(unicode),
3586 NULL);
3587}
3588
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003589#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003590
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003591/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003592
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003593#if SIZEOF_INT < SIZEOF_SSIZE_T
3594#define NEED_RETRY
3595#endif
3596
3597/* XXX This code is limited to "true" double-byte encodings, as
3598 a) it assumes an incomplete character consists of a single byte, and
3599 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3600 encodings, see IsDBCSLeadByteEx documentation. */
3601
/* Return 1 if the byte at s[offset] is treated as a DBCS lead byte,
   0 otherwise.  The byte must satisfy IsDBCSLeadByte(), and the
   character preceding it (as located by CharPrev) must either not
   exist, not begin with a lead byte, or be exactly two bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
3612
3613/*
3614 * Decode MBCS string into unicode object. If 'final' is set, converts
3615 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3616 */
3617static int decode_mbcs(PyUnicodeObject **v,
3618 const char *s, /* MBCS string */
3619 int size, /* sizeof MBCS string */
3620 int final)
3621{
3622 Py_UNICODE *p;
3623 Py_ssize_t n = 0;
3624 int usize = 0;
3625
3626 assert(size >= 0);
3627
3628 /* Skip trailing lead-byte unless 'final' is set */
3629 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3630 --size;
3631
3632 /* First get the size of the result */
3633 if (size > 0) {
3634 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3635 if (usize == 0) {
3636 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3637 return -1;
3638 }
3639 }
3640
3641 if (*v == NULL) {
3642 /* Create unicode object */
3643 *v = _PyUnicode_New(usize);
3644 if (*v == NULL)
3645 return -1;
3646 }
3647 else {
3648 /* Extend unicode object */
3649 n = PyUnicode_GET_SIZE(*v);
3650 if (_PyUnicode_Resize(v, n + usize) < 0)
3651 return -1;
3652 }
3653
3654 /* Do the conversion */
3655 if (size > 0) {
3656 p = PyUnicode_AS_UNICODE(*v) + n;
3657 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3658 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3659 return -1;
3660 }
3661 }
3662
3663 return size;
3664}
3665
3666PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3667 Py_ssize_t size,
3668 const char *errors,
3669 Py_ssize_t *consumed)
3670{
3671 PyUnicodeObject *v = NULL;
3672 int done;
3673
3674 if (consumed)
3675 *consumed = 0;
3676
3677#ifdef NEED_RETRY
3678 retry:
3679 if (size > INT_MAX)
3680 done = decode_mbcs(&v, s, INT_MAX, 0);
3681 else
3682#endif
3683 done = decode_mbcs(&v, s, (int)size, !consumed);
3684
3685 if (done < 0) {
3686 Py_XDECREF(v);
3687 return NULL;
3688 }
3689
3690 if (consumed)
3691 *consumed += done;
3692
3693#ifdef NEED_RETRY
3694 if (size > INT_MAX) {
3695 s += done;
3696 size -= done;
3697 goto retry;
3698 }
3699#endif
3700
3701 return (PyObject *)v;
3702}
3703
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003704PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003705 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003706 const char *errors)
3707{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003708 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3709}
3710
3711/*
3712 * Convert unicode into string object (MBCS).
3713 * Returns 0 if succeed, -1 otherwise.
3714 */
3715static int encode_mbcs(PyObject **repr,
3716 const Py_UNICODE *p, /* unicode */
3717 int size) /* size of unicode */
3718{
3719 int mbcssize = 0;
3720 Py_ssize_t n = 0;
3721
3722 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003723
3724 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003725 if (size > 0) {
3726 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3727 if (mbcssize == 0) {
3728 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3729 return -1;
3730 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003731 }
3732
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003733 if (*repr == NULL) {
3734 /* Create string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003735 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003736 if (*repr == NULL)
3737 return -1;
3738 }
3739 else {
3740 /* Extend string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003741 n = PyBytes_Size(*repr);
3742 if (PyBytes_Resize(*repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003743 return -1;
3744 }
3745
3746 /* Do the conversion */
3747 if (size > 0) {
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003748 char *s = PyBytes_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003749 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3750 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3751 return -1;
3752 }
3753 }
3754
3755 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003756}
3757
3758PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003759 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003760 const char *errors)
3761{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003762 PyObject *repr = NULL;
3763 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003764
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003765#ifdef NEED_RETRY
3766 retry:
3767 if (size > INT_MAX)
3768 ret = encode_mbcs(&repr, p, INT_MAX);
3769 else
3770#endif
3771 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003772
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003773 if (ret < 0) {
3774 Py_XDECREF(repr);
3775 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003776 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003777
3778#ifdef NEED_RETRY
3779 if (size > INT_MAX) {
3780 p += INT_MAX;
3781 size -= INT_MAX;
3782 goto retry;
3783 }
3784#endif
3785
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003786 return repr;
3787}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003788
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003789PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3790{
3791 if (!PyUnicode_Check(unicode)) {
3792 PyErr_BadArgument();
3793 return NULL;
3794 }
3795 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3796 PyUnicode_GET_SIZE(unicode),
3797 NULL);
3798}
3799
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003800#undef NEED_RETRY
3801
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003802#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003803
Guido van Rossumd57fd912000-03-10 22:53:23 +00003804/* --- Character Mapping Codec -------------------------------------------- */
3805
Guido van Rossumd57fd912000-03-10 22:53:23 +00003806PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003807 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003808 PyObject *mapping,
3809 const char *errors)
3810{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003811 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003812 Py_ssize_t startinpos;
3813 Py_ssize_t endinpos;
3814 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003815 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003816 PyUnicodeObject *v;
3817 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003818 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003819 PyObject *errorHandler = NULL;
3820 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003821 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003822 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003823
Guido van Rossumd57fd912000-03-10 22:53:23 +00003824 /* Default to Latin-1 */
3825 if (mapping == NULL)
3826 return PyUnicode_DecodeLatin1(s, size, errors);
3827
3828 v = _PyUnicode_New(size);
3829 if (v == NULL)
3830 goto onError;
3831 if (size == 0)
3832 return (PyObject *)v;
3833 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003834 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003835 if (PyUnicode_CheckExact(mapping)) {
3836 mapstring = PyUnicode_AS_UNICODE(mapping);
3837 maplen = PyUnicode_GET_SIZE(mapping);
3838 while (s < e) {
3839 unsigned char ch = *s;
3840 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003841
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003842 if (ch < maplen)
3843 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003844
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003845 if (x == 0xfffe) {
3846 /* undefined mapping */
3847 outpos = p-PyUnicode_AS_UNICODE(v);
3848 startinpos = s-starts;
3849 endinpos = startinpos+1;
3850 if (unicode_decode_call_errorhandler(
3851 errors, &errorHandler,
3852 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003853 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003854 (PyObject **)&v, &outpos, &p)) {
3855 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003856 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003857 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003858 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003859 *p++ = x;
3860 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003861 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003862 }
3863 else {
3864 while (s < e) {
3865 unsigned char ch = *s;
3866 PyObject *w, *x;
3867
3868 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3869 w = PyInt_FromLong((long)ch);
3870 if (w == NULL)
3871 goto onError;
3872 x = PyObject_GetItem(mapping, w);
3873 Py_DECREF(w);
3874 if (x == NULL) {
3875 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3876 /* No mapping found means: mapping is undefined. */
3877 PyErr_Clear();
3878 x = Py_None;
3879 Py_INCREF(x);
3880 } else
3881 goto onError;
3882 }
3883
3884 /* Apply mapping */
3885 if (PyInt_Check(x)) {
3886 long value = PyInt_AS_LONG(x);
3887 if (value < 0 || value > 65535) {
3888 PyErr_SetString(PyExc_TypeError,
3889 "character mapping must be in range(65536)");
3890 Py_DECREF(x);
3891 goto onError;
3892 }
3893 *p++ = (Py_UNICODE)value;
3894 }
3895 else if (x == Py_None) {
3896 /* undefined mapping */
3897 outpos = p-PyUnicode_AS_UNICODE(v);
3898 startinpos = s-starts;
3899 endinpos = startinpos+1;
3900 if (unicode_decode_call_errorhandler(
3901 errors, &errorHandler,
3902 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003903 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003904 (PyObject **)&v, &outpos, &p)) {
3905 Py_DECREF(x);
3906 goto onError;
3907 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003908 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003909 continue;
3910 }
3911 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003912 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003913
3914 if (targetsize == 1)
3915 /* 1-1 mapping */
3916 *p++ = *PyUnicode_AS_UNICODE(x);
3917
3918 else if (targetsize > 1) {
3919 /* 1-n mapping */
3920 if (targetsize > extrachars) {
3921 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003922 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3923 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003924 (targetsize << 2);
3925 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00003926 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003927 if (_PyUnicode_Resize(&v,
3928 PyUnicode_GET_SIZE(v) + needed) < 0) {
3929 Py_DECREF(x);
3930 goto onError;
3931 }
3932 p = PyUnicode_AS_UNICODE(v) + oldpos;
3933 }
3934 Py_UNICODE_COPY(p,
3935 PyUnicode_AS_UNICODE(x),
3936 targetsize);
3937 p += targetsize;
3938 extrachars -= targetsize;
3939 }
3940 /* 1-0 mapping: skip the character */
3941 }
3942 else {
3943 /* wrong return value */
3944 PyErr_SetString(PyExc_TypeError,
3945 "character mapping must return integer, None or unicode");
3946 Py_DECREF(x);
3947 goto onError;
3948 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003949 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003950 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003951 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003952 }
3953 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003954 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003955 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003956 Py_XDECREF(errorHandler);
3957 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003958 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003959
Guido van Rossumd57fd912000-03-10 22:53:23 +00003960 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003961 Py_XDECREF(errorHandler);
3962 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003963 Py_XDECREF(v);
3964 return NULL;
3965}
3966
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003967/* Charmap encoding: the lookup table */
3968
3969struct encoding_map{
3970 PyObject_HEAD
3971 unsigned char level1[32];
3972 int count2, count3;
3973 unsigned char level23[1];
3974};
3975
3976static PyObject*
3977encoding_map_size(PyObject *obj, PyObject* args)
3978{
3979 struct encoding_map *map = (struct encoding_map*)obj;
3980 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3981 128*map->count3);
3982}
3983
3984static PyMethodDef encoding_map_methods[] = {
3985 {"size", encoding_map_size, METH_NOARGS,
3986 PyDoc_STR("Return the size (in bytes) of this object") },
3987 { 0 }
3988};
3989
3990static void
3991encoding_map_dealloc(PyObject* o)
3992{
3993 PyObject_FREE(o);
3994}
3995
3996static PyTypeObject EncodingMapType = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00003997 PyVarObject_HEAD_INIT(NULL, 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003998 "EncodingMap", /*tp_name*/
3999 sizeof(struct encoding_map), /*tp_basicsize*/
4000 0, /*tp_itemsize*/
4001 /* methods */
4002 encoding_map_dealloc, /*tp_dealloc*/
4003 0, /*tp_print*/
4004 0, /*tp_getattr*/
4005 0, /*tp_setattr*/
4006 0, /*tp_compare*/
4007 0, /*tp_repr*/
4008 0, /*tp_as_number*/
4009 0, /*tp_as_sequence*/
4010 0, /*tp_as_mapping*/
4011 0, /*tp_hash*/
4012 0, /*tp_call*/
4013 0, /*tp_str*/
4014 0, /*tp_getattro*/
4015 0, /*tp_setattro*/
4016 0, /*tp_as_buffer*/
4017 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4018 0, /*tp_doc*/
4019 0, /*tp_traverse*/
4020 0, /*tp_clear*/
4021 0, /*tp_richcompare*/
4022 0, /*tp_weaklistoffset*/
4023 0, /*tp_iter*/
4024 0, /*tp_iternext*/
4025 encoding_map_methods, /*tp_methods*/
4026 0, /*tp_members*/
4027 0, /*tp_getset*/
4028 0, /*tp_base*/
4029 0, /*tp_dict*/
4030 0, /*tp_descr_get*/
4031 0, /*tp_descr_set*/
4032 0, /*tp_dictoffset*/
4033 0, /*tp_init*/
4034 0, /*tp_alloc*/
4035 0, /*tp_new*/
4036 0, /*tp_free*/
4037 0, /*tp_is_gc*/
4038};
4039
4040PyObject*
4041PyUnicode_BuildEncodingMap(PyObject* string)
4042{
4043 Py_UNICODE *decode;
4044 PyObject *result;
4045 struct encoding_map *mresult;
4046 int i;
4047 int need_dict = 0;
4048 unsigned char level1[32];
4049 unsigned char level2[512];
4050 unsigned char *mlevel1, *mlevel2, *mlevel3;
4051 int count2 = 0, count3 = 0;
4052
4053 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4054 PyErr_BadArgument();
4055 return NULL;
4056 }
4057 decode = PyUnicode_AS_UNICODE(string);
4058 memset(level1, 0xFF, sizeof level1);
4059 memset(level2, 0xFF, sizeof level2);
4060
4061 /* If there isn't a one-to-one mapping of NULL to \0,
4062 or if there are non-BMP characters, we need to use
4063 a mapping dictionary. */
4064 if (decode[0] != 0)
4065 need_dict = 1;
4066 for (i = 1; i < 256; i++) {
4067 int l1, l2;
4068 if (decode[i] == 0
4069 #ifdef Py_UNICODE_WIDE
4070 || decode[i] > 0xFFFF
4071 #endif
4072 ) {
4073 need_dict = 1;
4074 break;
4075 }
4076 if (decode[i] == 0xFFFE)
4077 /* unmapped character */
4078 continue;
4079 l1 = decode[i] >> 11;
4080 l2 = decode[i] >> 7;
4081 if (level1[l1] == 0xFF)
4082 level1[l1] = count2++;
4083 if (level2[l2] == 0xFF)
4084 level2[l2] = count3++;
4085 }
4086
4087 if (count2 >= 0xFF || count3 >= 0xFF)
4088 need_dict = 1;
4089
4090 if (need_dict) {
4091 PyObject *result = PyDict_New();
4092 PyObject *key, *value;
4093 if (!result)
4094 return NULL;
4095 for (i = 0; i < 256; i++) {
4096 key = value = NULL;
4097 key = PyInt_FromLong(decode[i]);
4098 value = PyInt_FromLong(i);
4099 if (!key || !value)
4100 goto failed1;
4101 if (PyDict_SetItem(result, key, value) == -1)
4102 goto failed1;
4103 Py_DECREF(key);
4104 Py_DECREF(value);
4105 }
4106 return result;
4107 failed1:
4108 Py_XDECREF(key);
4109 Py_XDECREF(value);
4110 Py_DECREF(result);
4111 return NULL;
4112 }
4113
4114 /* Create a three-level trie */
4115 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4116 16*count2 + 128*count3 - 1);
4117 if (!result)
4118 return PyErr_NoMemory();
4119 PyObject_Init(result, &EncodingMapType);
4120 mresult = (struct encoding_map*)result;
4121 mresult->count2 = count2;
4122 mresult->count3 = count3;
4123 mlevel1 = mresult->level1;
4124 mlevel2 = mresult->level23;
4125 mlevel3 = mresult->level23 + 16*count2;
4126 memcpy(mlevel1, level1, 32);
4127 memset(mlevel2, 0xFF, 16*count2);
4128 memset(mlevel3, 0, 128*count3);
4129 count3 = 0;
4130 for (i = 1; i < 256; i++) {
4131 int o1, o2, o3, i2, i3;
4132 if (decode[i] == 0xFFFE)
4133 /* unmapped character */
4134 continue;
4135 o1 = decode[i]>>11;
4136 o2 = (decode[i]>>7) & 0xF;
4137 i2 = 16*mlevel1[o1] + o2;
4138 if (mlevel2[i2] == 0xFF)
4139 mlevel2[i2] = count3++;
4140 o3 = decode[i] & 0x7F;
4141 i3 = 128*mlevel2[i2] + o3;
4142 mlevel3[i3] = i;
4143 }
4144 return result;
4145}
4146
4147static int
4148encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4149{
4150 struct encoding_map *map = (struct encoding_map*)mapping;
4151 int l1 = c>>11;
4152 int l2 = (c>>7) & 0xF;
4153 int l3 = c & 0x7F;
4154 int i;
4155
4156#ifdef Py_UNICODE_WIDE
4157 if (c > 0xFFFF) {
4158 return -1;
4159 }
4160#endif
4161 if (c == 0)
4162 return 0;
4163 /* level 1*/
4164 i = map->level1[l1];
4165 if (i == 0xFF) {
4166 return -1;
4167 }
4168 /* level 2*/
4169 i = map->level23[16*i+l2];
4170 if (i == 0xFF) {
4171 return -1;
4172 }
4173 /* level 3 */
4174 i = map->level23[16*map->count2 + 128*i + l3];
4175 if (i == 0) {
4176 return -1;
4177 }
4178 return i;
4179}
4180
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004181/* Lookup the character ch in the mapping. If the character
4182 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004183 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004184static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004185{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004186 PyObject *w = PyInt_FromLong((long)c);
4187 PyObject *x;
4188
4189 if (w == NULL)
4190 return NULL;
4191 x = PyObject_GetItem(mapping, w);
4192 Py_DECREF(w);
4193 if (x == NULL) {
4194 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4195 /* No mapping found means: mapping is undefined. */
4196 PyErr_Clear();
4197 x = Py_None;
4198 Py_INCREF(x);
4199 return x;
4200 } else
4201 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004202 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004203 else if (x == Py_None)
4204 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004205 else if (PyInt_Check(x)) {
4206 long value = PyInt_AS_LONG(x);
4207 if (value < 0 || value > 255) {
4208 PyErr_SetString(PyExc_TypeError,
4209 "character mapping must be in range(256)");
4210 Py_DECREF(x);
4211 return NULL;
4212 }
4213 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004214 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004215 else if (PyString_Check(x))
4216 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004217 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004218 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00004219 PyErr_Format(PyExc_TypeError,
4220 "character mapping must return integer, None or str8, not %.400s",
4221 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004222 Py_DECREF(x);
4223 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004224 }
4225}
4226
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004227static int
Walter Dörwald827b0552007-05-12 13:23:53 +00004228charmapencode_resize(PyObject *outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004229{
Walter Dörwald827b0552007-05-12 13:23:53 +00004230 Py_ssize_t outsize = PyBytes_GET_SIZE( outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004231 /* exponentially overallocate to minimize reallocations */
4232 if (requiredsize < 2*outsize)
4233 requiredsize = 2*outsize;
Walter Dörwald827b0552007-05-12 13:23:53 +00004234 if (PyBytes_Resize(outobj, requiredsize)) {
4235 Py_DECREF(outobj);
4236 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004237 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004238 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004239}
4240
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* character was written to the output */
    enc_FAILED,     /* character has no mapping */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004244/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00004245 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004246 space is available. Return a new reference to the object that
4247 was put in the output buffer, or Py_None, if the mapping was undefined
4248 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004249 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004250static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004251charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Walter Dörwald827b0552007-05-12 13:23:53 +00004252 PyObject *outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004253{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004254 PyObject *rep;
4255 char *outstart;
Walter Dörwald827b0552007-05-12 13:23:53 +00004256 Py_ssize_t outsize = PyBytes_GET_SIZE(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004257
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004258 if (Py_Type(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004259 int res = encoding_map_lookup(c, mapping);
4260 Py_ssize_t requiredsize = *outpos+1;
4261 if (res == -1)
4262 return enc_FAILED;
4263 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004264 if (charmapencode_resize(outobj, outpos, requiredsize))
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004265 return enc_EXCEPTION;
Walter Dörwald827b0552007-05-12 13:23:53 +00004266 outstart = PyBytes_AS_STRING(outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004267 outstart[(*outpos)++] = (char)res;
4268 return enc_SUCCESS;
4269 }
4270
4271 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004272 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004273 return enc_EXCEPTION;
4274 else if (rep==Py_None) {
4275 Py_DECREF(rep);
4276 return enc_FAILED;
4277 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004278 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004279 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004280 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004281 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004282 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004283 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004284 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004285 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004286 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4287 }
4288 else {
4289 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004290 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4291 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004292 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004293 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004294 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004295 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004296 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004297 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004298 memcpy(outstart + *outpos, repchars, repsize);
4299 *outpos += repsize;
4300 }
4301 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004302 Py_DECREF(rep);
4303 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004304}
4305
4306/* handle an error in PyUnicode_EncodeCharmap
4307 Return 0 on success, -1 on error */
4308static
4309int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004310 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004311 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004312 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004313 PyObject *res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004314{
4315 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004316 Py_ssize_t repsize;
4317 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004318 Py_UNICODE *uni2;
4319 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004320 Py_ssize_t collstartpos = *inpos;
4321 Py_ssize_t collendpos = *inpos+1;
4322 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004323 char *encoding = "charmap";
4324 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004325 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004326
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004327 /* find all unencodable characters */
4328 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004329 PyObject *rep;
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004330 if (Py_Type(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004331 int res = encoding_map_lookup(p[collendpos], mapping);
4332 if (res != -1)
4333 break;
4334 ++collendpos;
4335 continue;
4336 }
4337
4338 rep = charmapencode_lookup(p[collendpos], mapping);
4339 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004340 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004341 else if (rep!=Py_None) {
4342 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004343 break;
4344 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004345 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004346 ++collendpos;
4347 }
4348 /* cache callback name lookup
4349 * (if not done yet, i.e. it's the first error) */
4350 if (*known_errorHandler==-1) {
4351 if ((errors==NULL) || (!strcmp(errors, "strict")))
4352 *known_errorHandler = 1;
4353 else if (!strcmp(errors, "replace"))
4354 *known_errorHandler = 2;
4355 else if (!strcmp(errors, "ignore"))
4356 *known_errorHandler = 3;
4357 else if (!strcmp(errors, "xmlcharrefreplace"))
4358 *known_errorHandler = 4;
4359 else
4360 *known_errorHandler = 0;
4361 }
4362 switch (*known_errorHandler) {
4363 case 1: /* strict */
4364 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4365 return -1;
4366 case 2: /* replace */
4367 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4368 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004369 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004370 return -1;
4371 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004372 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004373 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4374 return -1;
4375 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004376 }
4377 /* fall through */
4378 case 3: /* ignore */
4379 *inpos = collendpos;
4380 break;
4381 case 4: /* xmlcharrefreplace */
4382 /* generate replacement (temporarily (mis)uses p) */
4383 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4384 char buffer[2+29+1+1];
4385 char *cp;
4386 sprintf(buffer, "&#%d;", (int)p[collpos]);
4387 for (cp = buffer; *cp; ++cp) {
4388 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004389 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004390 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004391 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004392 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4393 return -1;
4394 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004395 }
4396 }
4397 *inpos = collendpos;
4398 break;
4399 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004400 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004401 encoding, reason, p, size, exceptionObject,
4402 collstartpos, collendpos, &newpos);
4403 if (repunicode == NULL)
4404 return -1;
4405 /* generate replacement */
4406 repsize = PyUnicode_GET_SIZE(repunicode);
4407 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4408 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004409 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004410 return -1;
4411 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004412 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004413 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004414 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4415 return -1;
4416 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004417 }
4418 *inpos = newpos;
4419 Py_DECREF(repunicode);
4420 }
4421 return 0;
4422}
4423
Guido van Rossumd57fd912000-03-10 22:53:23 +00004424PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004425 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004426 PyObject *mapping,
4427 const char *errors)
4428{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004429 /* output object */
4430 PyObject *res = NULL;
4431 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004432 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004433 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004434 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004435 PyObject *errorHandler = NULL;
4436 PyObject *exc = NULL;
4437 /* the following variable is used for caching string comparisons
4438 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4439 * 3=ignore, 4=xmlcharrefreplace */
4440 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004441
4442 /* Default to Latin-1 */
4443 if (mapping == NULL)
4444 return PyUnicode_EncodeLatin1(p, size, errors);
4445
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004446 /* allocate enough for a simple encoding without
4447 replacements, if we need more, we'll resize */
Walter Dörwald827b0552007-05-12 13:23:53 +00004448 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004449 if (res == NULL)
4450 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004451 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004452 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004453
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004454 while (inpos<size) {
4455 /* try to encode it */
Walter Dörwald827b0552007-05-12 13:23:53 +00004456 charmapencode_result x = charmapencode_output(p[inpos], mapping, res, &respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004457 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004458 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004459 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004460 if (charmap_encoding_error(p, size, &inpos, mapping,
4461 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004462 &known_errorHandler, &errorHandler, errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004463 res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004464 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004465 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004466 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004467 else
4468 /* done with this character => adjust input position */
4469 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004470 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004471
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004472 /* Resize if we allocated to much */
Walter Dörwald827b0552007-05-12 13:23:53 +00004473 if (respos<PyBytes_GET_SIZE(res)) {
4474 if (PyBytes_Resize(res, respos))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004475 goto onError;
4476 }
4477 Py_XDECREF(exc);
4478 Py_XDECREF(errorHandler);
4479 return res;
4480
4481 onError:
4482 Py_XDECREF(res);
4483 Py_XDECREF(exc);
4484 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004485 return NULL;
4486}
4487
4488PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4489 PyObject *mapping)
4490{
4491 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4492 PyErr_BadArgument();
4493 return NULL;
4494 }
4495 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4496 PyUnicode_GET_SIZE(unicode),
4497 mapping,
4498 NULL);
4499}
4500
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004501/* create or adjust a UnicodeTranslateError */
4502static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004503 const Py_UNICODE *unicode, Py_ssize_t size,
4504 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004505 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004506{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004507 if (*exceptionObject == NULL) {
4508 *exceptionObject = PyUnicodeTranslateError_Create(
4509 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004510 }
4511 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004512 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4513 goto onError;
4514 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4515 goto onError;
4516 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4517 goto onError;
4518 return;
4519 onError:
4520 Py_DECREF(*exceptionObject);
4521 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004522 }
4523}
4524
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004525/* raises a UnicodeTranslateError */
4526static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004527 const Py_UNICODE *unicode, Py_ssize_t size,
4528 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004529 const char *reason)
4530{
4531 make_translate_exception(exceptionObject,
4532 unicode, size, startpos, endpos, reason);
4533 if (*exceptionObject != NULL)
4534 PyCodec_StrictErrors(*exceptionObject);
4535}
4536
4537/* error handling callback helper:
4538 build arguments, call the callback and check the arguments,
4539 put the result into newpos and return the replacement string, which
4540 has to be freed by the caller */
4541static PyObject *unicode_translate_call_errorhandler(const char *errors,
4542 PyObject **errorHandler,
4543 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004544 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4545 Py_ssize_t startpos, Py_ssize_t endpos,
4546 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004547{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004548 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004549
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004550 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004551 PyObject *restuple;
4552 PyObject *resunicode;
4553
4554 if (*errorHandler == NULL) {
4555 *errorHandler = PyCodec_LookupError(errors);
4556 if (*errorHandler == NULL)
4557 return NULL;
4558 }
4559
4560 make_translate_exception(exceptionObject,
4561 unicode, size, startpos, endpos, reason);
4562 if (*exceptionObject == NULL)
4563 return NULL;
4564
4565 restuple = PyObject_CallFunctionObjArgs(
4566 *errorHandler, *exceptionObject, NULL);
4567 if (restuple == NULL)
4568 return NULL;
4569 if (!PyTuple_Check(restuple)) {
4570 PyErr_Format(PyExc_TypeError, &argparse[4]);
4571 Py_DECREF(restuple);
4572 return NULL;
4573 }
4574 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004575 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004576 Py_DECREF(restuple);
4577 return NULL;
4578 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004579 if (i_newpos<0)
4580 *newpos = size+i_newpos;
4581 else
4582 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004583 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004584 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004585 Py_DECREF(restuple);
4586 return NULL;
4587 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004588 Py_INCREF(resunicode);
4589 Py_DECREF(restuple);
4590 return resunicode;
4591}
4592
4593/* Lookup the character ch in the mapping and put the result in result,
4594 which must be decrefed by the caller.
4595 Return 0 on success, -1 on error */
4596static
4597int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4598{
4599 PyObject *w = PyInt_FromLong((long)c);
4600 PyObject *x;
4601
4602 if (w == NULL)
4603 return -1;
4604 x = PyObject_GetItem(mapping, w);
4605 Py_DECREF(w);
4606 if (x == NULL) {
4607 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4608 /* No mapping found means: use 1:1 mapping. */
4609 PyErr_Clear();
4610 *result = NULL;
4611 return 0;
4612 } else
4613 return -1;
4614 }
4615 else if (x == Py_None) {
4616 *result = x;
4617 return 0;
4618 }
4619 else if (PyInt_Check(x)) {
4620 long value = PyInt_AS_LONG(x);
4621 long max = PyUnicode_GetMax();
4622 if (value < 0 || value > max) {
4623 PyErr_Format(PyExc_TypeError,
4624 "character mapping must be in range(0x%lx)", max+1);
4625 Py_DECREF(x);
4626 return -1;
4627 }
4628 *result = x;
4629 return 0;
4630 }
4631 else if (PyUnicode_Check(x)) {
4632 *result = x;
4633 return 0;
4634 }
4635 else {
4636 /* wrong return value */
4637 PyErr_SetString(PyExc_TypeError,
4638 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004639 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004640 return -1;
4641 }
4642}
4643/* ensure that *outobj is at least requiredsize characters long,
4644if not reallocate and adjust various state variables.
4645Return 0 on success, -1 on error */
4646static
Walter Dörwald4894c302003-10-24 14:25:28 +00004647int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004648 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004649{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004650 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004651 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004652 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004653 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004654 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004655 if (requiredsize < 2 * oldsize)
4656 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004657 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004658 return -1;
4659 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004660 }
4661 return 0;
4662}
4663/* lookup the character, put the result in the output string and adjust
4664 various state variables. Return a new reference to the object that
4665 was put in the output buffer in *result, or Py_None, if the mapping was
4666 undefined (in which case no character was written).
4667 The called must decref result.
4668 Return 0 on success, -1 on error. */
4669static
Walter Dörwald4894c302003-10-24 14:25:28 +00004670int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004671 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004672 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004673{
Walter Dörwald4894c302003-10-24 14:25:28 +00004674 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004675 return -1;
4676 if (*res==NULL) {
4677 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004678 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004679 }
4680 else if (*res==Py_None)
4681 ;
4682 else if (PyInt_Check(*res)) {
4683 /* no overflow check, because we know that the space is enough */
4684 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4685 }
4686 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004687 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004688 if (repsize==1) {
4689 /* no overflow check, because we know that the space is enough */
4690 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4691 }
4692 else if (repsize!=0) {
4693 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004694 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004695 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004696 repsize - 1;
4697 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004698 return -1;
4699 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4700 *outp += repsize;
4701 }
4702 }
4703 else
4704 return -1;
4705 return 0;
4706}
4707
4708PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004709 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004710 PyObject *mapping,
4711 const char *errors)
4712{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004713 /* output object */
4714 PyObject *res = NULL;
4715 /* pointers to the beginning and end+1 of input */
4716 const Py_UNICODE *startp = p;
4717 const Py_UNICODE *endp = p + size;
4718 /* pointer into the output */
4719 Py_UNICODE *str;
4720 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004721 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004722 char *reason = "character maps to <undefined>";
4723 PyObject *errorHandler = NULL;
4724 PyObject *exc = NULL;
4725 /* the following variable is used for caching string comparisons
4726 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4727 * 3=ignore, 4=xmlcharrefreplace */
4728 int known_errorHandler = -1;
4729
Guido van Rossumd57fd912000-03-10 22:53:23 +00004730 if (mapping == NULL) {
4731 PyErr_BadArgument();
4732 return NULL;
4733 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004734
4735 /* allocate enough for a simple 1:1 translation without
4736 replacements, if we need more, we'll resize */
4737 res = PyUnicode_FromUnicode(NULL, size);
4738 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004739 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004740 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004741 return res;
4742 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004743
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004744 while (p<endp) {
4745 /* try to encode it */
4746 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004747 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004748 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004749 goto onError;
4750 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004751 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004752 if (x!=Py_None) /* it worked => adjust input pointer */
4753 ++p;
4754 else { /* untranslatable character */
4755 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004756 Py_ssize_t repsize;
4757 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004758 Py_UNICODE *uni2;
4759 /* startpos for collecting untranslatable chars */
4760 const Py_UNICODE *collstart = p;
4761 const Py_UNICODE *collend = p+1;
4762 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004763
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004764 /* find all untranslatable characters */
4765 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004766 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004767 goto onError;
4768 Py_XDECREF(x);
4769 if (x!=Py_None)
4770 break;
4771 ++collend;
4772 }
4773 /* cache callback name lookup
4774 * (if not done yet, i.e. it's the first error) */
4775 if (known_errorHandler==-1) {
4776 if ((errors==NULL) || (!strcmp(errors, "strict")))
4777 known_errorHandler = 1;
4778 else if (!strcmp(errors, "replace"))
4779 known_errorHandler = 2;
4780 else if (!strcmp(errors, "ignore"))
4781 known_errorHandler = 3;
4782 else if (!strcmp(errors, "xmlcharrefreplace"))
4783 known_errorHandler = 4;
4784 else
4785 known_errorHandler = 0;
4786 }
4787 switch (known_errorHandler) {
4788 case 1: /* strict */
4789 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4790 goto onError;
4791 case 2: /* replace */
4792 /* No need to check for space, this is a 1:1 replacement */
4793 for (coll = collstart; coll<collend; ++coll)
4794 *str++ = '?';
4795 /* fall through */
4796 case 3: /* ignore */
4797 p = collend;
4798 break;
4799 case 4: /* xmlcharrefreplace */
4800 /* generate replacement (temporarily (mis)uses p) */
4801 for (p = collstart; p < collend; ++p) {
4802 char buffer[2+29+1+1];
4803 char *cp;
4804 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004805 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004806 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4807 goto onError;
4808 for (cp = buffer; *cp; ++cp)
4809 *str++ = *cp;
4810 }
4811 p = collend;
4812 break;
4813 default:
4814 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4815 reason, startp, size, &exc,
4816 collstart-startp, collend-startp, &newpos);
4817 if (repunicode == NULL)
4818 goto onError;
4819 /* generate replacement */
4820 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004821 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004822 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4823 Py_DECREF(repunicode);
4824 goto onError;
4825 }
4826 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4827 *str++ = *uni2;
4828 p = startp + newpos;
4829 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004830 }
4831 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004832 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004833 /* Resize if we allocated to much */
4834 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004835 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004836 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004837 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004838 }
4839 Py_XDECREF(exc);
4840 Py_XDECREF(errorHandler);
4841 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004842
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004843 onError:
4844 Py_XDECREF(res);
4845 Py_XDECREF(exc);
4846 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004847 return NULL;
4848}
4849
4850PyObject *PyUnicode_Translate(PyObject *str,
4851 PyObject *mapping,
4852 const char *errors)
4853{
4854 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004855
Guido van Rossumd57fd912000-03-10 22:53:23 +00004856 str = PyUnicode_FromObject(str);
4857 if (str == NULL)
4858 goto onError;
4859 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4860 PyUnicode_GET_SIZE(str),
4861 mapping,
4862 errors);
4863 Py_DECREF(str);
4864 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004865
Guido van Rossumd57fd912000-03-10 22:53:23 +00004866 onError:
4867 Py_XDECREF(str);
4868 return NULL;
4869}
Tim Petersced69f82003-09-16 20:30:58 +00004870
Guido van Rossum9e896b32000-04-05 20:11:21 +00004871/* --- Decimal Encoder ---------------------------------------------------- */
4872
4873int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004874 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004875 char *output,
4876 const char *errors)
4877{
4878 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004879 PyObject *errorHandler = NULL;
4880 PyObject *exc = NULL;
4881 const char *encoding = "decimal";
4882 const char *reason = "invalid decimal Unicode string";
4883 /* the following variable is used for caching string comparisons
4884 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4885 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004886
4887 if (output == NULL) {
4888 PyErr_BadArgument();
4889 return -1;
4890 }
4891
4892 p = s;
4893 end = s + length;
4894 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004895 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004896 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004897 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004898 Py_ssize_t repsize;
4899 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004900 Py_UNICODE *uni2;
4901 Py_UNICODE *collstart;
4902 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004903
Guido van Rossum9e896b32000-04-05 20:11:21 +00004904 if (Py_UNICODE_ISSPACE(ch)) {
4905 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004906 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004907 continue;
4908 }
4909 decimal = Py_UNICODE_TODECIMAL(ch);
4910 if (decimal >= 0) {
4911 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004912 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004913 continue;
4914 }
Guido van Rossumba477042000-04-06 18:18:10 +00004915 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004916 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004917 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004918 continue;
4919 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004920 /* All other characters are considered unencodable */
4921 collstart = p;
4922 collend = p+1;
4923 while (collend < end) {
4924 if ((0 < *collend && *collend < 256) ||
4925 !Py_UNICODE_ISSPACE(*collend) ||
4926 Py_UNICODE_TODECIMAL(*collend))
4927 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004928 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004929 /* cache callback name lookup
4930 * (if not done yet, i.e. it's the first error) */
4931 if (known_errorHandler==-1) {
4932 if ((errors==NULL) || (!strcmp(errors, "strict")))
4933 known_errorHandler = 1;
4934 else if (!strcmp(errors, "replace"))
4935 known_errorHandler = 2;
4936 else if (!strcmp(errors, "ignore"))
4937 known_errorHandler = 3;
4938 else if (!strcmp(errors, "xmlcharrefreplace"))
4939 known_errorHandler = 4;
4940 else
4941 known_errorHandler = 0;
4942 }
4943 switch (known_errorHandler) {
4944 case 1: /* strict */
4945 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4946 goto onError;
4947 case 2: /* replace */
4948 for (p = collstart; p < collend; ++p)
4949 *output++ = '?';
4950 /* fall through */
4951 case 3: /* ignore */
4952 p = collend;
4953 break;
4954 case 4: /* xmlcharrefreplace */
4955 /* generate replacement (temporarily (mis)uses p) */
4956 for (p = collstart; p < collend; ++p)
4957 output += sprintf(output, "&#%d;", (int)*p);
4958 p = collend;
4959 break;
4960 default:
4961 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4962 encoding, reason, s, length, &exc,
4963 collstart-s, collend-s, &newpos);
4964 if (repunicode == NULL)
4965 goto onError;
4966 /* generate replacement */
4967 repsize = PyUnicode_GET_SIZE(repunicode);
4968 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4969 Py_UNICODE ch = *uni2;
4970 if (Py_UNICODE_ISSPACE(ch))
4971 *output++ = ' ';
4972 else {
4973 decimal = Py_UNICODE_TODECIMAL(ch);
4974 if (decimal >= 0)
4975 *output++ = '0' + decimal;
4976 else if (0 < ch && ch < 256)
4977 *output++ = (char)ch;
4978 else {
4979 Py_DECREF(repunicode);
4980 raise_encode_exception(&exc, encoding,
4981 s, length, collstart-s, collend-s, reason);
4982 goto onError;
4983 }
4984 }
4985 }
4986 p = s + newpos;
4987 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004988 }
4989 }
4990 /* 0-terminate the output string */
4991 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004992 Py_XDECREF(exc);
4993 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004994 return 0;
4995
4996 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004997 Py_XDECREF(exc);
4998 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004999 return -1;
5000}
5001
Guido van Rossumd57fd912000-03-10 22:53:23 +00005002/* --- Helpers ------------------------------------------------------------ */
5003
Eric Smith8c663262007-08-25 02:26:07 +00005004#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005005
5006#include "stringlib/fastsearch.h"
5007
5008#include "stringlib/count.h"
5009#include "stringlib/find.h"
5010#include "stringlib/partition.h"
5011
/* Helper macro: normalize the slice bounds held in the local variables
   `start` and `end` against (obj)->length.  Negative values count from
   the end, and the results are clamped so that start >= 0 and
   0 <= end <= length (start is not clamped against length).
   Expands to plain statements, so use it as a statement of its own. */
#define FIX_START_END(obj)                      \
    if (start < 0) {                            \
        start += (obj)->length;                 \
        if (start < 0)                          \
            start = 0;                          \
    }                                           \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    else if (end < 0) {                         \
        end += (obj)->length;                   \
        if (end < 0)                            \
            end = 0;                            \
    }
5024
Martin v. Löwis18e16552006-02-15 17:27:45 +00005025Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005026 PyObject *substr,
5027 Py_ssize_t start,
5028 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005029{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005030 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005031 PyUnicodeObject* str_obj;
5032 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005033
Thomas Wouters477c8d52006-05-27 19:21:47 +00005034 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5035 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005036 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005037 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5038 if (!sub_obj) {
5039 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005040 return -1;
5041 }
Tim Petersced69f82003-09-16 20:30:58 +00005042
Thomas Wouters477c8d52006-05-27 19:21:47 +00005043 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005044
Thomas Wouters477c8d52006-05-27 19:21:47 +00005045 result = stringlib_count(
5046 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5047 );
5048
5049 Py_DECREF(sub_obj);
5050 Py_DECREF(str_obj);
5051
Guido van Rossumd57fd912000-03-10 22:53:23 +00005052 return result;
5053}
5054
Martin v. Löwis18e16552006-02-15 17:27:45 +00005055Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005056 PyObject *sub,
5057 Py_ssize_t start,
5058 Py_ssize_t end,
5059 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005060{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005061 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005062
Guido van Rossumd57fd912000-03-10 22:53:23 +00005063 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005064 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005065 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005066 sub = PyUnicode_FromObject(sub);
5067 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00005068 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005069 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005070 }
Tim Petersced69f82003-09-16 20:30:58 +00005071
Thomas Wouters477c8d52006-05-27 19:21:47 +00005072 if (direction > 0)
5073 result = stringlib_find_slice(
5074 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5075 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5076 start, end
5077 );
5078 else
5079 result = stringlib_rfind_slice(
5080 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5081 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5082 start, end
5083 );
5084
Guido van Rossumd57fd912000-03-10 22:53:23 +00005085 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005086 Py_DECREF(sub);
5087
Guido van Rossumd57fd912000-03-10 22:53:23 +00005088 return result;
5089}
5090
Tim Petersced69f82003-09-16 20:30:58 +00005091static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005092int tailmatch(PyUnicodeObject *self,
5093 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005094 Py_ssize_t start,
5095 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005096 int direction)
5097{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005098 if (substring->length == 0)
5099 return 1;
5100
Thomas Wouters477c8d52006-05-27 19:21:47 +00005101 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005102
5103 end -= substring->length;
5104 if (end < start)
5105 return 0;
5106
5107 if (direction > 0) {
5108 if (Py_UNICODE_MATCH(self, end, substring))
5109 return 1;
5110 } else {
5111 if (Py_UNICODE_MATCH(self, start, substring))
5112 return 1;
5113 }
5114
5115 return 0;
5116}
5117
Martin v. Löwis18e16552006-02-15 17:27:45 +00005118Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005119 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005120 Py_ssize_t start,
5121 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005122 int direction)
5123{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005124 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005125
Guido van Rossumd57fd912000-03-10 22:53:23 +00005126 str = PyUnicode_FromObject(str);
5127 if (str == NULL)
5128 return -1;
5129 substr = PyUnicode_FromObject(substr);
5130 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005131 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005132 return -1;
5133 }
Tim Petersced69f82003-09-16 20:30:58 +00005134
Guido van Rossumd57fd912000-03-10 22:53:23 +00005135 result = tailmatch((PyUnicodeObject *)str,
5136 (PyUnicodeObject *)substr,
5137 start, end, direction);
5138 Py_DECREF(str);
5139 Py_DECREF(substr);
5140 return result;
5141}
5142
Guido van Rossumd57fd912000-03-10 22:53:23 +00005143/* Apply fixfct filter to the Unicode object self and return a
5144 reference to the modified object */
5145
Tim Petersced69f82003-09-16 20:30:58 +00005146static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005147PyObject *fixup(PyUnicodeObject *self,
5148 int (*fixfct)(PyUnicodeObject *s))
5149{
5150
5151 PyUnicodeObject *u;
5152
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005153 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005154 if (u == NULL)
5155 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005156
5157 Py_UNICODE_COPY(u->str, self->str, self->length);
5158
Tim Peters7a29bd52001-09-12 03:03:31 +00005159 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005160 /* fixfct should return TRUE if it modified the buffer. If
5161 FALSE, return a reference to the original buffer instead
5162 (to save space, not time) */
5163 Py_INCREF(self);
5164 Py_DECREF(u);
5165 return (PyObject*) self;
5166 }
5167 return (PyObject*) u;
5168}
5169
Tim Petersced69f82003-09-16 20:30:58 +00005170static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005171int fixupper(PyUnicodeObject *self)
5172{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005173 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005174 Py_UNICODE *s = self->str;
5175 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005176
Guido van Rossumd57fd912000-03-10 22:53:23 +00005177 while (len-- > 0) {
5178 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005179
Guido van Rossumd57fd912000-03-10 22:53:23 +00005180 ch = Py_UNICODE_TOUPPER(*s);
5181 if (ch != *s) {
5182 status = 1;
5183 *s = ch;
5184 }
5185 s++;
5186 }
5187
5188 return status;
5189}
5190
Tim Petersced69f82003-09-16 20:30:58 +00005191static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005192int fixlower(PyUnicodeObject *self)
5193{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005194 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005195 Py_UNICODE *s = self->str;
5196 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005197
Guido van Rossumd57fd912000-03-10 22:53:23 +00005198 while (len-- > 0) {
5199 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005200
Guido van Rossumd57fd912000-03-10 22:53:23 +00005201 ch = Py_UNICODE_TOLOWER(*s);
5202 if (ch != *s) {
5203 status = 1;
5204 *s = ch;
5205 }
5206 s++;
5207 }
5208
5209 return status;
5210}
5211
Tim Petersced69f82003-09-16 20:30:58 +00005212static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005213int fixswapcase(PyUnicodeObject *self)
5214{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005215 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005216 Py_UNICODE *s = self->str;
5217 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005218
Guido van Rossumd57fd912000-03-10 22:53:23 +00005219 while (len-- > 0) {
5220 if (Py_UNICODE_ISUPPER(*s)) {
5221 *s = Py_UNICODE_TOLOWER(*s);
5222 status = 1;
5223 } else if (Py_UNICODE_ISLOWER(*s)) {
5224 *s = Py_UNICODE_TOUPPER(*s);
5225 status = 1;
5226 }
5227 s++;
5228 }
5229
5230 return status;
5231}
5232
Tim Petersced69f82003-09-16 20:30:58 +00005233static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005234int fixcapitalize(PyUnicodeObject *self)
5235{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005236 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005237 Py_UNICODE *s = self->str;
5238 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005239
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005240 if (len == 0)
5241 return 0;
5242 if (Py_UNICODE_ISLOWER(*s)) {
5243 *s = Py_UNICODE_TOUPPER(*s);
5244 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005245 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005246 s++;
5247 while (--len > 0) {
5248 if (Py_UNICODE_ISUPPER(*s)) {
5249 *s = Py_UNICODE_TOLOWER(*s);
5250 status = 1;
5251 }
5252 s++;
5253 }
5254 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005255}
5256
5257static
5258int fixtitle(PyUnicodeObject *self)
5259{
5260 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5261 register Py_UNICODE *e;
5262 int previous_is_cased;
5263
5264 /* Shortcut for single character strings */
5265 if (PyUnicode_GET_SIZE(self) == 1) {
5266 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5267 if (*p != ch) {
5268 *p = ch;
5269 return 1;
5270 }
5271 else
5272 return 0;
5273 }
Tim Petersced69f82003-09-16 20:30:58 +00005274
Guido van Rossumd57fd912000-03-10 22:53:23 +00005275 e = p + PyUnicode_GET_SIZE(self);
5276 previous_is_cased = 0;
5277 for (; p < e; p++) {
5278 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005279
Guido van Rossumd57fd912000-03-10 22:53:23 +00005280 if (previous_is_cased)
5281 *p = Py_UNICODE_TOLOWER(ch);
5282 else
5283 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005284
5285 if (Py_UNICODE_ISLOWER(ch) ||
5286 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005287 Py_UNICODE_ISTITLE(ch))
5288 previous_is_cased = 1;
5289 else
5290 previous_is_cased = 0;
5291 }
5292 return 1;
5293}
5294
Tim Peters8ce9f162004-08-27 01:49:32 +00005295PyObject *
5296PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005297{
Tim Peters8ce9f162004-08-27 01:49:32 +00005298 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005299 const Py_UNICODE blank = ' ';
5300 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005301 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005302 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005303 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5304 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005305 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5306 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005307 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005308 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005309 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005310
Tim Peters05eba1f2004-08-27 21:32:02 +00005311 fseq = PySequence_Fast(seq, "");
5312 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005313 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005314 }
5315
Tim Peters91879ab2004-08-27 22:35:44 +00005316 /* Grrrr. A codec may be invoked to convert str objects to
5317 * Unicode, and so it's possible to call back into Python code
5318 * during PyUnicode_FromObject(), and so it's possible for a sick
5319 * codec to change the size of fseq (if seq is a list). Therefore
5320 * we have to keep refetching the size -- can't assume seqlen
5321 * is invariant.
5322 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005323 seqlen = PySequence_Fast_GET_SIZE(fseq);
5324 /* If empty sequence, return u"". */
5325 if (seqlen == 0) {
5326 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5327 goto Done;
5328 }
5329 /* If singleton sequence with an exact Unicode, return that. */
5330 if (seqlen == 1) {
5331 item = PySequence_Fast_GET_ITEM(fseq, 0);
5332 if (PyUnicode_CheckExact(item)) {
5333 Py_INCREF(item);
5334 res = (PyUnicodeObject *)item;
5335 goto Done;
5336 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005337 }
5338
Tim Peters05eba1f2004-08-27 21:32:02 +00005339 /* At least two items to join, or one that isn't exact Unicode. */
5340 if (seqlen > 1) {
5341 /* Set up sep and seplen -- they're needed. */
5342 if (separator == NULL) {
5343 sep = &blank;
5344 seplen = 1;
5345 }
5346 else {
5347 internal_separator = PyUnicode_FromObject(separator);
5348 if (internal_separator == NULL)
5349 goto onError;
5350 sep = PyUnicode_AS_UNICODE(internal_separator);
5351 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00005352 /* In case PyUnicode_FromObject() mutated seq. */
5353 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005354 }
5355 }
5356
5357 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005358 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005359 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005360 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005361 res_p = PyUnicode_AS_UNICODE(res);
5362 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005363
Tim Peters05eba1f2004-08-27 21:32:02 +00005364 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005365 Py_ssize_t itemlen;
5366 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005367
5368 item = PySequence_Fast_GET_ITEM(fseq, i);
5369 /* Convert item to Unicode. */
5370 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5371 PyErr_Format(PyExc_TypeError,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005372 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00005373 " %.80s found",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00005374 i, Py_Type(item)->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00005375 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005376 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005377 item = PyUnicode_FromObject(item);
5378 if (item == NULL)
5379 goto onError;
5380 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005381
Tim Peters91879ab2004-08-27 22:35:44 +00005382 /* In case PyUnicode_FromObject() mutated seq. */
5383 seqlen = PySequence_Fast_GET_SIZE(fseq);
5384
Tim Peters8ce9f162004-08-27 01:49:32 +00005385 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005386 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005387 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005388 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005389 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005390 if (i < seqlen - 1) {
5391 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005392 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00005393 goto Overflow;
5394 }
5395 if (new_res_used > res_alloc) {
5396 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00005397 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00005398 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005399 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005400 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005401 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005402 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005403 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005404 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005405 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005406 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005407 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005408
5409 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005410 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005411 res_p += itemlen;
5412 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005413 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005414 res_p += seplen;
5415 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005416 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005417 res_used = new_res_used;
5418 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005419
Tim Peters05eba1f2004-08-27 21:32:02 +00005420 /* Shrink res to match the used area; this probably can't fail,
5421 * but it's cheap to check.
5422 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005423 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005424 goto onError;
5425
5426 Done:
5427 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005428 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005429 return (PyObject *)res;
5430
Tim Peters8ce9f162004-08-27 01:49:32 +00005431 Overflow:
5432 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005433 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005434 Py_DECREF(item);
5435 /* fall through */
5436
Guido van Rossumd57fd912000-03-10 22:53:23 +00005437 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005438 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005439 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005440 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005441 return NULL;
5442}
5443
Tim Petersced69f82003-09-16 20:30:58 +00005444static
5445PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005446 Py_ssize_t left,
5447 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005448 Py_UNICODE fill)
5449{
5450 PyUnicodeObject *u;
5451
5452 if (left < 0)
5453 left = 0;
5454 if (right < 0)
5455 right = 0;
5456
Tim Peters7a29bd52001-09-12 03:03:31 +00005457 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005458 Py_INCREF(self);
5459 return self;
5460 }
5461
5462 u = _PyUnicode_New(left + self->length + right);
5463 if (u) {
5464 if (left)
5465 Py_UNICODE_FILL(u->str, fill, left);
5466 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5467 if (right)
5468 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5469 }
5470
5471 return u;
5472}
5473
/* Append the slice data[left:right] to `list` as a new Unicode object.

   Relies on the enclosing function providing a `PyObject *str` scratch
   variable, a `list` variable and an `onError` label, which is jumped to
   when either the slice cannot be created or the append fails.  The
   temporary reference held in `str` is released on both the success and
   the failure path.

   Wrapped in do { } while (0) so the macro behaves as a single statement
   wherever it is used. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
5484
5485static
5486PyObject *split_whitespace(PyUnicodeObject *self,
5487 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005488 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005489{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005490 register Py_ssize_t i;
5491 register Py_ssize_t j;
5492 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005493 PyObject *str;
5494
5495 for (i = j = 0; i < len; ) {
5496 /* find a token */
5497 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5498 i++;
5499 j = i;
5500 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
5501 i++;
5502 if (j < i) {
5503 if (maxcount-- <= 0)
5504 break;
5505 SPLIT_APPEND(self->str, j, i);
5506 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5507 i++;
5508 j = i;
5509 }
5510 }
5511 if (j < len) {
5512 SPLIT_APPEND(self->str, j, len);
5513 }
5514 return list;
5515
5516 onError:
5517 Py_DECREF(list);
5518 return NULL;
5519}
5520
5521PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005522 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005523{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005524 register Py_ssize_t i;
5525 register Py_ssize_t j;
5526 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005527 PyObject *list;
5528 PyObject *str;
5529 Py_UNICODE *data;
5530
5531 string = PyUnicode_FromObject(string);
5532 if (string == NULL)
5533 return NULL;
5534 data = PyUnicode_AS_UNICODE(string);
5535 len = PyUnicode_GET_SIZE(string);
5536
Guido van Rossumd57fd912000-03-10 22:53:23 +00005537 list = PyList_New(0);
5538 if (!list)
5539 goto onError;
5540
5541 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005542 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005543
Guido van Rossumd57fd912000-03-10 22:53:23 +00005544 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005545 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005546 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005547
5548 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005549 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005550 if (i < len) {
5551 if (data[i] == '\r' && i + 1 < len &&
5552 data[i+1] == '\n')
5553 i += 2;
5554 else
5555 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005556 if (keepends)
5557 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005558 }
Guido van Rossum86662912000-04-11 15:38:46 +00005559 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005560 j = i;
5561 }
5562 if (j < len) {
5563 SPLIT_APPEND(data, j, len);
5564 }
5565
5566 Py_DECREF(string);
5567 return list;
5568
5569 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005570 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005571 Py_DECREF(string);
5572 return NULL;
5573}
5574
Tim Petersced69f82003-09-16 20:30:58 +00005575static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005576PyObject *split_char(PyUnicodeObject *self,
5577 PyObject *list,
5578 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005579 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005580{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005581 register Py_ssize_t i;
5582 register Py_ssize_t j;
5583 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005584 PyObject *str;
5585
5586 for (i = j = 0; i < len; ) {
5587 if (self->str[i] == ch) {
5588 if (maxcount-- <= 0)
5589 break;
5590 SPLIT_APPEND(self->str, j, i);
5591 i = j = i + 1;
5592 } else
5593 i++;
5594 }
5595 if (j <= len) {
5596 SPLIT_APPEND(self->str, j, len);
5597 }
5598 return list;
5599
5600 onError:
5601 Py_DECREF(list);
5602 return NULL;
5603}
5604
Tim Petersced69f82003-09-16 20:30:58 +00005605static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005606PyObject *split_substring(PyUnicodeObject *self,
5607 PyObject *list,
5608 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005609 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005610{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005611 register Py_ssize_t i;
5612 register Py_ssize_t j;
5613 Py_ssize_t len = self->length;
5614 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005615 PyObject *str;
5616
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005617 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005618 if (Py_UNICODE_MATCH(self, i, substring)) {
5619 if (maxcount-- <= 0)
5620 break;
5621 SPLIT_APPEND(self->str, j, i);
5622 i = j = i + sublen;
5623 } else
5624 i++;
5625 }
5626 if (j <= len) {
5627 SPLIT_APPEND(self->str, j, len);
5628 }
5629 return list;
5630
5631 onError:
5632 Py_DECREF(list);
5633 return NULL;
5634}
5635
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005636static
5637PyObject *rsplit_whitespace(PyUnicodeObject *self,
5638 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005639 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005640{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005641 register Py_ssize_t i;
5642 register Py_ssize_t j;
5643 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005644 PyObject *str;
5645
5646 for (i = j = len - 1; i >= 0; ) {
5647 /* find a token */
5648 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5649 i--;
5650 j = i;
5651 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
5652 i--;
5653 if (j > i) {
5654 if (maxcount-- <= 0)
5655 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005656 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005657 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5658 i--;
5659 j = i;
5660 }
5661 }
5662 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005663 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005664 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005665 if (PyList_Reverse(list) < 0)
5666 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005667 return list;
5668
5669 onError:
5670 Py_DECREF(list);
5671 return NULL;
5672}
5673
5674static
5675PyObject *rsplit_char(PyUnicodeObject *self,
5676 PyObject *list,
5677 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005678 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005679{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005680 register Py_ssize_t i;
5681 register Py_ssize_t j;
5682 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005683 PyObject *str;
5684
5685 for (i = j = len - 1; i >= 0; ) {
5686 if (self->str[i] == ch) {
5687 if (maxcount-- <= 0)
5688 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005689 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005690 j = i = i - 1;
5691 } else
5692 i--;
5693 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005694 if (j >= -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005695 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005696 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005697 if (PyList_Reverse(list) < 0)
5698 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005699 return list;
5700
5701 onError:
5702 Py_DECREF(list);
5703 return NULL;
5704}
5705
5706static
5707PyObject *rsplit_substring(PyUnicodeObject *self,
5708 PyObject *list,
5709 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005710 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005711{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005712 register Py_ssize_t i;
5713 register Py_ssize_t j;
5714 Py_ssize_t len = self->length;
5715 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005716 PyObject *str;
5717
5718 for (i = len - sublen, j = len; i >= 0; ) {
5719 if (Py_UNICODE_MATCH(self, i, substring)) {
5720 if (maxcount-- <= 0)
5721 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005722 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005723 j = i;
5724 i -= sublen;
5725 } else
5726 i--;
5727 }
5728 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005729 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005730 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005731 if (PyList_Reverse(list) < 0)
5732 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005733 return list;
5734
5735 onError:
5736 Py_DECREF(list);
5737 return NULL;
5738}
5739
Guido van Rossumd57fd912000-03-10 22:53:23 +00005740#undef SPLIT_APPEND
5741
5742static
5743PyObject *split(PyUnicodeObject *self,
5744 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005745 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005746{
5747 PyObject *list;
5748
5749 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005750 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005751
5752 list = PyList_New(0);
5753 if (!list)
5754 return NULL;
5755
5756 if (substring == NULL)
5757 return split_whitespace(self,list,maxcount);
5758
5759 else if (substring->length == 1)
5760 return split_char(self,list,substring->str[0],maxcount);
5761
5762 else if (substring->length == 0) {
5763 Py_DECREF(list);
5764 PyErr_SetString(PyExc_ValueError, "empty separator");
5765 return NULL;
5766 }
5767 else
5768 return split_substring(self,list,substring,maxcount);
5769}
5770
Tim Petersced69f82003-09-16 20:30:58 +00005771static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005772PyObject *rsplit(PyUnicodeObject *self,
5773 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005774 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005775{
5776 PyObject *list;
5777
5778 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005779 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005780
5781 list = PyList_New(0);
5782 if (!list)
5783 return NULL;
5784
5785 if (substring == NULL)
5786 return rsplit_whitespace(self,list,maxcount);
5787
5788 else if (substring->length == 1)
5789 return rsplit_char(self,list,substring->str[0],maxcount);
5790
5791 else if (substring->length == 0) {
5792 Py_DECREF(list);
5793 PyErr_SetString(PyExc_ValueError, "empty separator");
5794 return NULL;
5795 }
5796 else
5797 return rsplit_substring(self,list,substring,maxcount);
5798}
5799
5800static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005801PyObject *replace(PyUnicodeObject *self,
5802 PyUnicodeObject *str1,
5803 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005804 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005805{
5806 PyUnicodeObject *u;
5807
5808 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005809 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005810
Thomas Wouters477c8d52006-05-27 19:21:47 +00005811 if (str1->length == str2->length) {
5812 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005813 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005814 if (str1->length == 1) {
5815 /* replace characters */
5816 Py_UNICODE u1, u2;
5817 if (!findchar(self->str, self->length, str1->str[0]))
5818 goto nothing;
5819 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5820 if (!u)
5821 return NULL;
5822 Py_UNICODE_COPY(u->str, self->str, self->length);
5823 u1 = str1->str[0];
5824 u2 = str2->str[0];
5825 for (i = 0; i < u->length; i++)
5826 if (u->str[i] == u1) {
5827 if (--maxcount < 0)
5828 break;
5829 u->str[i] = u2;
5830 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005831 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005832 i = fastsearch(
5833 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005834 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00005835 if (i < 0)
5836 goto nothing;
5837 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5838 if (!u)
5839 return NULL;
5840 Py_UNICODE_COPY(u->str, self->str, self->length);
5841 while (i <= self->length - str1->length)
5842 if (Py_UNICODE_MATCH(self, i, str1)) {
5843 if (--maxcount < 0)
5844 break;
5845 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5846 i += str1->length;
5847 } else
5848 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005849 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005850 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005851
5852 Py_ssize_t n, i, j, e;
5853 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005854 Py_UNICODE *p;
5855
5856 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005857 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005858 if (n > maxcount)
5859 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005860 if (n == 0)
5861 goto nothing;
5862 /* new_size = self->length + n * (str2->length - str1->length)); */
5863 delta = (str2->length - str1->length);
5864 if (delta == 0) {
5865 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005866 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005867 product = n * (str2->length - str1->length);
5868 if ((product / (str2->length - str1->length)) != n) {
5869 PyErr_SetString(PyExc_OverflowError,
5870 "replace string is too long");
5871 return NULL;
5872 }
5873 new_size = self->length + product;
5874 if (new_size < 0) {
5875 PyErr_SetString(PyExc_OverflowError,
5876 "replace string is too long");
5877 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005878 }
5879 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005880 u = _PyUnicode_New(new_size);
5881 if (!u)
5882 return NULL;
5883 i = 0;
5884 p = u->str;
5885 e = self->length - str1->length;
5886 if (str1->length > 0) {
5887 while (n-- > 0) {
5888 /* look for next match */
5889 j = i;
5890 while (j <= e) {
5891 if (Py_UNICODE_MATCH(self, j, str1))
5892 break;
5893 j++;
5894 }
5895 if (j > i) {
5896 if (j > e)
5897 break;
5898 /* copy unchanged part [i:j] */
5899 Py_UNICODE_COPY(p, self->str+i, j-i);
5900 p += j - i;
5901 }
5902 /* copy substitution string */
5903 if (str2->length > 0) {
5904 Py_UNICODE_COPY(p, str2->str, str2->length);
5905 p += str2->length;
5906 }
5907 i = j + str1->length;
5908 }
5909 if (i < self->length)
5910 /* copy tail [i:] */
5911 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5912 } else {
5913 /* interleave */
5914 while (n > 0) {
5915 Py_UNICODE_COPY(p, str2->str, str2->length);
5916 p += str2->length;
5917 if (--n <= 0)
5918 break;
5919 *p++ = self->str[i++];
5920 }
5921 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5922 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005923 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005924 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005925
5926nothing:
5927 /* nothing to replace; return original string (when possible) */
5928 if (PyUnicode_CheckExact(self)) {
5929 Py_INCREF(self);
5930 return (PyObject *) self;
5931 }
5932 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005933}
5934
5935/* --- Unicode Object Methods --------------------------------------------- */
5936
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005937PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005938"S.title() -> unicode\n\
5939\n\
5940Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005941characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005942
5943static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005944unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005945{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005946 return fixup(self, fixtitle);
5947}
5948
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005949PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005950"S.capitalize() -> unicode\n\
5951\n\
5952Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005953have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005954
5955static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005956unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005957{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005958 return fixup(self, fixcapitalize);
5959}
5960
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split on whitespace,
   capitalize every word in place in the list, then join the words again.
   Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *item;
    Py_ssize_t idx;

    /* Break the string into whitespace-separated words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Swap every word for its capitalized form */
    idx = 0;
    while (idx < PyList_GET_SIZE(words)) {
        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                     fixcapitalize);
        if (item == NULL)
            goto onError;
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, item);
        idx++;
    }

    /* Glue the words back together */
    item = PyUnicode_Join(NULL, words);

onError:
    /* item is NULL here when a capitalization step failed */
    Py_DECREF(words);
    return (PyObject *)item;
}
#endif
5998
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005999/* Argument converter. Coerces to a single unicode character */
6000
6001static int
6002convert_uc(PyObject *obj, void *addr)
6003{
6004 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6005 PyObject *uniobj;
6006 Py_UNICODE *unistr;
6007
6008 uniobj = PyUnicode_FromObject(obj);
6009 if (uniobj == NULL) {
6010 PyErr_SetString(PyExc_TypeError,
6011 "The fill character cannot be converted to Unicode");
6012 return 0;
6013 }
6014 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6015 PyErr_SetString(PyExc_TypeError,
6016 "The fill character must be exactly one character long");
6017 Py_DECREF(uniobj);
6018 return 0;
6019 }
6020 unistr = PyUnicode_AS_UNICODE(uniobj);
6021 *fillcharloc = unistr[0];
6022 Py_DECREF(uniobj);
6023 return 1;
6024}
6025
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006026PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006027"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006029Return S centered in a Unicode string of length width. Padding is\n\
6030done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006031
6032static PyObject *
6033unicode_center(PyUnicodeObject *self, PyObject *args)
6034{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006035 Py_ssize_t marg, left;
6036 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006037 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006038
Thomas Woutersde017742006-02-16 19:34:37 +00006039 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006040 return NULL;
6041
Tim Peters7a29bd52001-09-12 03:03:31 +00006042 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006043 Py_INCREF(self);
6044 return (PyObject*) self;
6045 }
6046
6047 marg = width - self->length;
6048 left = marg / 2 + (marg & width & 1);
6049
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006050 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006051}
6052
Marc-André Lemburge5034372000-08-08 08:04:29 +00006053#if 0
6054
6055/* This code should go into some future Unicode collation support
6056 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006057 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006058
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006059/* speedy UTF-16 code point order comparison */
6060/* gleaned from: */
6061/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6062
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006063static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006064{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006065 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006066 0, 0, 0, 0, 0, 0, 0, 0,
6067 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006068 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006069};
6070
Guido van Rossumd57fd912000-03-10 22:53:23 +00006071static int
6072unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6073{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006074 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006075
Guido van Rossumd57fd912000-03-10 22:53:23 +00006076 Py_UNICODE *s1 = str1->str;
6077 Py_UNICODE *s2 = str2->str;
6078
6079 len1 = str1->length;
6080 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006081
Guido van Rossumd57fd912000-03-10 22:53:23 +00006082 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006083 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006084
6085 c1 = *s1++;
6086 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006087
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006088 if (c1 > (1<<11) * 26)
6089 c1 += utf16Fixup[c1>>11];
6090 if (c2 > (1<<11) * 26)
6091 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006092 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006093
6094 if (c1 != c2)
6095 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006096
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006097 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006098 }
6099
6100 return (len1 < len2) ? -1 : (len1 != len2);
6101}
6102
Marc-André Lemburge5034372000-08-08 08:04:29 +00006103#else
6104
6105static int
6106unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6107{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006108 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006109
6110 Py_UNICODE *s1 = str1->str;
6111 Py_UNICODE *s2 = str2->str;
6112
6113 len1 = str1->length;
6114 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006115
Marc-André Lemburge5034372000-08-08 08:04:29 +00006116 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006117 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006118
Fredrik Lundh45714e92001-06-26 16:39:36 +00006119 c1 = *s1++;
6120 c2 = *s2++;
6121
6122 if (c1 != c2)
6123 return (c1 < c2) ? -1 : 1;
6124
Marc-André Lemburge5034372000-08-08 08:04:29 +00006125 len1--; len2--;
6126 }
6127
6128 return (len1 < len2) ? -1 : (len1 != len2);
6129}
6130
6131#endif
6132
Guido van Rossumd57fd912000-03-10 22:53:23 +00006133int PyUnicode_Compare(PyObject *left,
6134 PyObject *right)
6135{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006136 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6137 return unicode_compare((PyUnicodeObject *)left,
6138 (PyUnicodeObject *)right);
6139 if ((PyString_Check(left) && PyUnicode_Check(right)) ||
6140 (PyUnicode_Check(left) && PyString_Check(right))) {
6141 if (PyUnicode_Check(left))
6142 left = _PyUnicode_AsDefaultEncodedString(left, NULL);
6143 if (PyUnicode_Check(right))
6144 right = _PyUnicode_AsDefaultEncodedString(right, NULL);
6145 assert(PyString_Check(left));
6146 assert(PyString_Check(right));
6147 return PyObject_Compare(left, right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006148 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006149 PyErr_Format(PyExc_TypeError,
6150 "Can't compare %.100s and %.100s",
6151 left->ob_type->tp_name,
6152 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006153 return -1;
6154}
6155
Martin v. Löwis5b222132007-06-10 09:51:05 +00006156int
6157PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6158{
6159 int i;
6160 Py_UNICODE *id;
6161 assert(PyUnicode_Check(uni));
6162 id = PyUnicode_AS_UNICODE(uni);
6163 /* Compare Unicode string and source character set string */
6164 for (i = 0; id[i] && str[i]; i++)
6165 if (id[i] != str[i])
6166 return ((int)id[i] < (int)str[i]) ? -1 : 1;
6167 if (id[i])
6168 return 1; /* uni is longer */
6169 if (str[i])
6170 return -1; /* str is longer */
6171 return 0;
6172}
6173
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006174PyObject *PyUnicode_RichCompare(PyObject *left,
6175 PyObject *right,
6176 int op)
6177{
6178 int result;
6179
6180 result = PyUnicode_Compare(left, right);
6181 if (result == -1 && PyErr_Occurred())
6182 goto onError;
6183
6184 /* Convert the return value to a Boolean */
6185 switch (op) {
6186 case Py_EQ:
6187 result = (result == 0);
6188 break;
6189 case Py_NE:
6190 result = (result != 0);
6191 break;
6192 case Py_LE:
6193 result = (result <= 0);
6194 break;
6195 case Py_GE:
6196 result = (result >= 0);
6197 break;
6198 case Py_LT:
6199 result = (result == -1);
6200 break;
6201 case Py_GT:
6202 result = (result == 1);
6203 break;
6204 }
6205 return PyBool_FromLong(result);
6206
6207 onError:
6208
6209 /* Standard case
6210
6211 Type errors mean that PyUnicode_FromObject() could not convert
6212 one of the arguments (usually the right hand side) to Unicode,
6213 ie. we can't handle the comparison request. However, it is
6214 possible that the other object knows a comparison method, which
6215 is why we return Py_NotImplemented to give the other object a
6216 chance.
6217
6218 */
6219 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6220 PyErr_Clear();
6221 Py_INCREF(Py_NotImplemented);
6222 return Py_NotImplemented;
6223 }
6224 if (op != Py_EQ && op != Py_NE)
6225 return NULL;
6226
6227 /* Equality comparison.
6228
6229 This is a special case: we silence any PyExc_UnicodeDecodeError
6230 and instead turn it into a PyErr_UnicodeWarning.
6231
6232 */
6233 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6234 return NULL;
6235 PyErr_Clear();
Skip Montanaro46fc3372007-08-12 11:44:53 +00006236 if (PyErr_WarnEx(PyExc_UnicodeWarning,
6237 (op == Py_EQ) ?
6238 "Unicode equal comparison "
6239 "failed to convert both arguments to Unicode - "
6240 "interpreting them as being unequal"
6241 :
6242 "Unicode unequal comparison "
6243 "failed to convert both arguments to Unicode - "
6244 "interpreting them as being unequal",
6245 1) < 0)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006246 return NULL;
6247 result = (op == Py_NE);
6248 return PyBool_FromLong(result);
6249}
6250
Guido van Rossum403d68b2000-03-13 15:55:09 +00006251int PyUnicode_Contains(PyObject *container,
6252 PyObject *element)
6253{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006254 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006255 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006256
6257 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006258 sub = PyUnicode_FromObject(element);
6259 if (!sub) {
Walter Dörwald26e0f512007-06-12 16:51:31 +00006260 PyErr_Format(PyExc_TypeError,
6261 "'in <string>' requires string as left operand, not %s",
6262 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006263 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006264 }
6265
Thomas Wouters477c8d52006-05-27 19:21:47 +00006266 str = PyUnicode_FromObject(container);
6267 if (!str) {
6268 Py_DECREF(sub);
6269 return -1;
6270 }
6271
6272 result = stringlib_contains_obj(str, sub);
6273
6274 Py_DECREF(str);
6275 Py_DECREF(sub);
6276
Guido van Rossum403d68b2000-03-13 15:55:09 +00006277 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006278}
6279
Guido van Rossumd57fd912000-03-10 22:53:23 +00006280/* Concat to string or Unicode object giving a new Unicode object. */
6281
6282PyObject *PyUnicode_Concat(PyObject *left,
6283 PyObject *right)
6284{
6285 PyUnicodeObject *u = NULL, *v = NULL, *w;
6286
Guido van Rossum84d79dd2007-04-13 02:23:57 +00006287 if (PyBytes_Check(left) || PyBytes_Check(right))
6288 return PyBytes_Concat(left, right);
6289
Guido van Rossumd57fd912000-03-10 22:53:23 +00006290 /* Coerce the two arguments */
6291 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6292 if (u == NULL)
6293 goto onError;
6294 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6295 if (v == NULL)
6296 goto onError;
6297
6298 /* Shortcuts */
6299 if (v == unicode_empty) {
6300 Py_DECREF(v);
6301 return (PyObject *)u;
6302 }
6303 if (u == unicode_empty) {
6304 Py_DECREF(u);
6305 return (PyObject *)v;
6306 }
6307
6308 /* Concat the two Unicode strings */
6309 w = _PyUnicode_New(u->length + v->length);
6310 if (w == NULL)
6311 goto onError;
6312 Py_UNICODE_COPY(w->str, u->str, u->length);
6313 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6314
6315 Py_DECREF(u);
6316 Py_DECREF(v);
6317 return (PyObject *)w;
6318
6319onError:
6320 Py_XDECREF(u);
6321 Py_XDECREF(v);
6322 return NULL;
6323}
6324
Walter Dörwald1ab83302007-05-18 17:15:44 +00006325void
6326PyUnicode_Append(PyObject **pleft, PyObject *right)
6327{
6328 PyObject *new;
6329 if (*pleft == NULL)
6330 return;
6331 if (right == NULL || !PyUnicode_Check(*pleft)) {
6332 Py_DECREF(*pleft);
6333 *pleft = NULL;
6334 return;
6335 }
6336 new = PyUnicode_Concat(*pleft, right);
6337 Py_DECREF(*pleft);
6338 *pleft = new;
6339}
6340
6341void
6342PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6343{
6344 PyUnicode_Append(pleft, right);
6345 Py_XDECREF(right);
6346}
6347
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006348PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006349"S.count(sub[, start[, end]]) -> int\n\
6350\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006351Return the number of non-overlapping occurrences of substring sub in\n\
6352Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006353interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006354
6355static PyObject *
6356unicode_count(PyUnicodeObject *self, PyObject *args)
6357{
6358 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006359 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006360 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006361 PyObject *result;
6362
Guido van Rossumb8872e62000-05-09 14:14:27 +00006363 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6364 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006365 return NULL;
6366
6367 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006368 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006369 if (substring == NULL)
6370 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006371
Thomas Wouters477c8d52006-05-27 19:21:47 +00006372 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006373
Thomas Wouters477c8d52006-05-27 19:21:47 +00006374 result = PyInt_FromSsize_t(
6375 stringlib_count(self->str + start, end - start,
6376 substring->str, substring->length)
6377 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006378
6379 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006380
Guido van Rossumd57fd912000-03-10 22:53:23 +00006381 return result;
6382}
6383
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006384PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006385"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006386\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006387Encodes S using the codec registered for encoding. encoding defaults\n\
6388to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006389handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006390a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6391'xmlcharrefreplace' as well as any other name registered with\n\
6392codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006393
6394static PyObject *
6395unicode_encode(PyUnicodeObject *self, PyObject *args)
6396{
6397 char *encoding = NULL;
6398 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006399 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00006400
Guido van Rossumd57fd912000-03-10 22:53:23 +00006401 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6402 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006403 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006404 if (v == NULL)
6405 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006406 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006407 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006408 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006409 "(type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00006410 Py_Type(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006411 Py_DECREF(v);
6412 return NULL;
6413 }
6414 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006415
6416 onError:
6417 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006418}
6419
6420PyDoc_STRVAR(decode__doc__,
6421"S.decode([encoding[,errors]]) -> string or unicode\n\
6422\n\
6423Decodes S using the codec registered for encoding. encoding defaults\n\
6424to the default encoding. errors may be given to set a different error\n\
6425handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6426a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6427as well as any other name registerd with codecs.register_error that is\n\
6428able to handle UnicodeDecodeErrors.");
6429
6430static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00006431unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006432{
Guido van Rossuma74184e2007-08-29 04:05:57 +00006433 PyErr_Format(PyExc_TypeError, "decoding str is not supported");
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006434 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006435}
6436
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006437PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006438"S.expandtabs([tabsize]) -> unicode\n\
6439\n\
6440Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006441If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006442
6443static PyObject*
6444unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6445{
6446 Py_UNICODE *e;
6447 Py_UNICODE *p;
6448 Py_UNICODE *q;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006449 Py_ssize_t i, j, old_j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006450 PyUnicodeObject *u;
6451 int tabsize = 8;
6452
6453 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6454 return NULL;
6455
Thomas Wouters7e474022000-07-16 12:04:32 +00006456 /* First pass: determine size of output string */
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006457 i = j = old_j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006458 e = self->str + self->length;
6459 for (p = self->str; p < e; p++)
6460 if (*p == '\t') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006461 if (tabsize > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006462 j += tabsize - (j % tabsize);
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006463 if (old_j > j) {
6464 PyErr_SetString(PyExc_OverflowError,
6465 "new string is too long");
6466 return NULL;
6467 }
6468 old_j = j;
6469 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006470 }
6471 else {
6472 j++;
6473 if (*p == '\n' || *p == '\r') {
6474 i += j;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006475 old_j = j = 0;
6476 if (i < 0) {
6477 PyErr_SetString(PyExc_OverflowError,
6478 "new string is too long");
6479 return NULL;
6480 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006481 }
6482 }
6483
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006484 if ((i + j) < 0) {
6485 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6486 return NULL;
6487 }
6488
Guido van Rossumd57fd912000-03-10 22:53:23 +00006489 /* Second pass: create output string and fill it */
6490 u = _PyUnicode_New(i + j);
6491 if (!u)
6492 return NULL;
6493
6494 j = 0;
6495 q = u->str;
6496
6497 for (p = self->str; p < e; p++)
6498 if (*p == '\t') {
6499 if (tabsize > 0) {
6500 i = tabsize - (j % tabsize);
6501 j += i;
6502 while (i--)
6503 *q++ = ' ';
6504 }
6505 }
6506 else {
6507 j++;
6508 *q++ = *p;
6509 if (*p == '\n' || *p == '\r')
6510 j = 0;
6511 }
6512
6513 return (PyObject*) u;
6514}
6515
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006516PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006517"S.find(sub [,start [,end]]) -> int\n\
6518\n\
6519Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00006520such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006521arguments start and end are interpreted as in slice notation.\n\
6522\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006523Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006524
6525static PyObject *
6526unicode_find(PyUnicodeObject *self, PyObject *args)
6527{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006528 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006529 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006530 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006531 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006532
Guido van Rossumb8872e62000-05-09 14:14:27 +00006533 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
6534 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006535 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006536 substring = PyUnicode_FromObject(substring);
6537 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006538 return NULL;
6539
Thomas Wouters477c8d52006-05-27 19:21:47 +00006540 result = stringlib_find_slice(
6541 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6542 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6543 start, end
6544 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006545
6546 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006547
6548 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006549}
6550
6551static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006552unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006553{
6554 if (index < 0 || index >= self->length) {
6555 PyErr_SetString(PyExc_IndexError, "string index out of range");
6556 return NULL;
6557 }
6558
6559 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6560}
6561
6562static long
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006563unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006564{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006565 /* Since Unicode objects compare equal to their UTF-8 string
6566 counterparts, we hash the UTF-8 string. */
6567 PyObject *v = _PyUnicode_AsDefaultEncodedString(self, NULL);
6568 return PyObject_Hash(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006569}
6570
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006571PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006572"S.index(sub [,start [,end]]) -> int\n\
6573\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006574Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006575
6576static PyObject *
6577unicode_index(PyUnicodeObject *self, PyObject *args)
6578{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006579 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006580 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006581 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006582 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006583
Guido van Rossumb8872e62000-05-09 14:14:27 +00006584 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
6585 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006586 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006587 substring = PyUnicode_FromObject(substring);
6588 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006589 return NULL;
6590
Thomas Wouters477c8d52006-05-27 19:21:47 +00006591 result = stringlib_find_slice(
6592 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6593 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6594 start, end
6595 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006596
6597 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006598
Guido van Rossumd57fd912000-03-10 22:53:23 +00006599 if (result < 0) {
6600 PyErr_SetString(PyExc_ValueError, "substring not found");
6601 return NULL;
6602 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006603
Martin v. Löwis18e16552006-02-15 17:27:45 +00006604 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006605}
6606
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006607PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006608"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006609\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006610Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006611at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006612
6613static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006614unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006615{
6616 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6617 register const Py_UNICODE *e;
6618 int cased;
6619
Guido van Rossumd57fd912000-03-10 22:53:23 +00006620 /* Shortcut for single character strings */
6621 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006622 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006623
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006624 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006625 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006626 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006627
Guido van Rossumd57fd912000-03-10 22:53:23 +00006628 e = p + PyUnicode_GET_SIZE(self);
6629 cased = 0;
6630 for (; p < e; p++) {
6631 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006632
Guido van Rossumd57fd912000-03-10 22:53:23 +00006633 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006634 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006635 else if (!cased && Py_UNICODE_ISLOWER(ch))
6636 cased = 1;
6637 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006638 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006639}
6640
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006641PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006642"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006643\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006644Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006645at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006646
6647static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006648unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006649{
6650 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6651 register const Py_UNICODE *e;
6652 int cased;
6653
Guido van Rossumd57fd912000-03-10 22:53:23 +00006654 /* Shortcut for single character strings */
6655 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006656 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006657
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006658 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006659 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006660 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006661
Guido van Rossumd57fd912000-03-10 22:53:23 +00006662 e = p + PyUnicode_GET_SIZE(self);
6663 cased = 0;
6664 for (; p < e; p++) {
6665 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006666
Guido van Rossumd57fd912000-03-10 22:53:23 +00006667 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006668 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006669 else if (!cased && Py_UNICODE_ISUPPER(ch))
6670 cased = 1;
6671 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006672 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006673}
6674
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006675PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006676"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006677\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006678Return True if S is a titlecased string and there is at least one\n\
6679character in S, i.e. upper- and titlecase characters may only\n\
6680follow uncased characters and lowercase characters only cased ones.\n\
6681Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006682
6683static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006684unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006685{
6686 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6687 register const Py_UNICODE *e;
6688 int cased, previous_is_cased;
6689
Guido van Rossumd57fd912000-03-10 22:53:23 +00006690 /* Shortcut for single character strings */
6691 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006692 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6693 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006694
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006695 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006696 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006697 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006698
Guido van Rossumd57fd912000-03-10 22:53:23 +00006699 e = p + PyUnicode_GET_SIZE(self);
6700 cased = 0;
6701 previous_is_cased = 0;
6702 for (; p < e; p++) {
6703 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006704
Guido van Rossumd57fd912000-03-10 22:53:23 +00006705 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6706 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006707 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006708 previous_is_cased = 1;
6709 cased = 1;
6710 }
6711 else if (Py_UNICODE_ISLOWER(ch)) {
6712 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006713 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006714 previous_is_cased = 1;
6715 cased = 1;
6716 }
6717 else
6718 previous_is_cased = 0;
6719 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006720 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006721}
6722
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006723PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006724"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006725\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006726Return True if all characters in S are whitespace\n\
6727and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006728
6729static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006730unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006731{
6732 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6733 register const Py_UNICODE *e;
6734
Guido van Rossumd57fd912000-03-10 22:53:23 +00006735 /* Shortcut for single character strings */
6736 if (PyUnicode_GET_SIZE(self) == 1 &&
6737 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006738 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006739
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006740 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006741 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006742 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006743
Guido van Rossumd57fd912000-03-10 22:53:23 +00006744 e = p + PyUnicode_GET_SIZE(self);
6745 for (; p < e; p++) {
6746 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006747 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006748 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006749 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006750}
6751
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006752PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006753"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006754\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006755Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006756and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006757
6758static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006759unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006760{
6761 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6762 register const Py_UNICODE *e;
6763
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006764 /* Shortcut for single character strings */
6765 if (PyUnicode_GET_SIZE(self) == 1 &&
6766 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006767 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006768
6769 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006770 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006771 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006772
6773 e = p + PyUnicode_GET_SIZE(self);
6774 for (; p < e; p++) {
6775 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006776 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006777 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006778 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006779}
6780
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006781PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006782"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006783\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006784Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006785and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006786
6787static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006788unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006789{
6790 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6791 register const Py_UNICODE *e;
6792
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006793 /* Shortcut for single character strings */
6794 if (PyUnicode_GET_SIZE(self) == 1 &&
6795 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006796 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006797
6798 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006799 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006800 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006801
6802 e = p + PyUnicode_GET_SIZE(self);
6803 for (; p < e; p++) {
6804 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006805 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006806 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006807 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006808}
6809
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006810PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006811"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006812\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006813Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006814False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006815
6816static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006817unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006818{
6819 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6820 register const Py_UNICODE *e;
6821
Guido van Rossumd57fd912000-03-10 22:53:23 +00006822 /* Shortcut for single character strings */
6823 if (PyUnicode_GET_SIZE(self) == 1 &&
6824 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006825 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006826
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006827 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006828 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006829 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006830
Guido van Rossumd57fd912000-03-10 22:53:23 +00006831 e = p + PyUnicode_GET_SIZE(self);
6832 for (; p < e; p++) {
6833 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006834 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006835 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006836 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006837}
6838
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006839PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006840"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006841\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006842Return True if all characters in S are digits\n\
6843and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006844
6845static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006846unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006847{
6848 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6849 register const Py_UNICODE *e;
6850
Guido van Rossumd57fd912000-03-10 22:53:23 +00006851 /* Shortcut for single character strings */
6852 if (PyUnicode_GET_SIZE(self) == 1 &&
6853 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006854 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006855
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006856 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006857 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006858 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006859
Guido van Rossumd57fd912000-03-10 22:53:23 +00006860 e = p + PyUnicode_GET_SIZE(self);
6861 for (; p < e; p++) {
6862 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006863 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006864 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006865 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006866}
6867
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006868PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006869"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006870\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006871Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006872False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006873
6874static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006875unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006876{
6877 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6878 register const Py_UNICODE *e;
6879
Guido van Rossumd57fd912000-03-10 22:53:23 +00006880 /* Shortcut for single character strings */
6881 if (PyUnicode_GET_SIZE(self) == 1 &&
6882 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006883 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006884
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006885 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006886 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006887 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006888
Guido van Rossumd57fd912000-03-10 22:53:23 +00006889 e = p + PyUnicode_GET_SIZE(self);
6890 for (; p < e; p++) {
6891 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006892 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006893 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006894 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006895}
6896
Martin v. Löwis47383402007-08-15 07:32:56 +00006897int
6898PyUnicode_IsIdentifier(PyObject *self)
6899{
6900 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
6901 register const Py_UNICODE *e;
6902
6903 /* Special case for empty strings */
6904 if (PyUnicode_GET_SIZE(self) == 0)
6905 return 0;
6906
6907 /* PEP 3131 says that the first character must be in
6908 XID_Start and subsequent characters in XID_Continue,
6909 and for the ASCII range, the 2.x rules apply (i.e
6910 start with letters and underscore, continue with
6911 letters, digits, underscore). However, given the current
6912 definition of XID_Start and XID_Continue, it is sufficient
6913 to check just for these, except that _ must be allowed
6914 as starting an identifier. */
6915 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
6916 return 0;
6917
6918 e = p + PyUnicode_GET_SIZE(self);
6919 for (p++; p < e; p++) {
6920 if (!_PyUnicode_IsXidContinue(*p))
6921 return 0;
6922 }
6923 return 1;
6924}
6925
6926PyDoc_STRVAR(isidentifier__doc__,
6927"S.isidentifier() -> bool\n\
6928\n\
6929Return True if S is a valid identifier according\n\
6930to the language definition.");
6931
6932static PyObject*
6933unicode_isidentifier(PyObject *self)
6934{
6935 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
6936}
6937
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006938PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006939"S.join(sequence) -> unicode\n\
6940\n\
6941Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006942sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006943
6944static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006945unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006946{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006947 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006948}
6949
Martin v. Löwis18e16552006-02-15 17:27:45 +00006950static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006951unicode_length(PyUnicodeObject *self)
6952{
6953 return self->length;
6954}
6955
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006956PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006957"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006958\n\
6959Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006960done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006961
6962static PyObject *
6963unicode_ljust(PyUnicodeObject *self, PyObject *args)
6964{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006965 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006966 Py_UNICODE fillchar = ' ';
6967
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006968 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006969 return NULL;
6970
Tim Peters7a29bd52001-09-12 03:03:31 +00006971 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006972 Py_INCREF(self);
6973 return (PyObject*) self;
6974 }
6975
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006976 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006977}
6978
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006979PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006980"S.lower() -> unicode\n\
6981\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006982Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006983
6984static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006985unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006986{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006987 return fixup(self, fixlower);
6988}
6989
/* Strip modes accepted by the strip helpers below: which end(s) of the
   string get characters removed. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Skips the first three characters ("|O:") of the format string,
   leaving the bare method name for use in error messages. */
#define STRIPNAME(i) (stripformat[i]+3)
6998
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006999/* externally visible for str.strip(unicode) */
7000PyObject *
7001_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7002{
7003 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007004 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007005 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007006 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7007 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007008
Thomas Wouters477c8d52006-05-27 19:21:47 +00007009 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7010
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007011 i = 0;
7012 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007013 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7014 i++;
7015 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007016 }
7017
7018 j = len;
7019 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007020 do {
7021 j--;
7022 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7023 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007024 }
7025
7026 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007027 Py_INCREF(self);
7028 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007029 }
7030 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00007031 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007032}
7033
Guido van Rossumd57fd912000-03-10 22:53:23 +00007034
7035static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007036do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007037{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007038 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007039 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007040
7041 i = 0;
7042 if (striptype != RIGHTSTRIP) {
7043 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7044 i++;
7045 }
7046 }
7047
7048 j = len;
7049 if (striptype != LEFTSTRIP) {
7050 do {
7051 j--;
7052 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7053 j++;
7054 }
7055
7056 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7057 Py_INCREF(self);
7058 return (PyObject*)self;
7059 }
7060 else
7061 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007062}
7063
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007064
7065static PyObject *
7066do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7067{
7068 PyObject *sep = NULL;
7069
7070 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7071 return NULL;
7072
7073 if (sep != NULL && sep != Py_None) {
7074 if (PyUnicode_Check(sep))
7075 return _PyUnicode_XStrip(self, striptype, sep);
7076 else if (PyString_Check(sep)) {
7077 PyObject *res;
7078 sep = PyUnicode_FromObject(sep);
7079 if (sep==NULL)
7080 return NULL;
7081 res = _PyUnicode_XStrip(self, striptype, sep);
7082 Py_DECREF(sep);
7083 return res;
7084 }
7085 else {
7086 PyErr_Format(PyExc_TypeError,
7087 "%s arg must be None, unicode or str",
7088 STRIPNAME(striptype));
7089 return NULL;
7090 }
7091 }
7092
7093 return do_strip(self, striptype);
7094}
7095
7096
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007097PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007098"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007099\n\
7100Return a copy of the string S with leading and trailing\n\
7101whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007102If chars is given and not None, remove characters in chars instead.\n\
7103If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007104
7105static PyObject *
7106unicode_strip(PyUnicodeObject *self, PyObject *args)
7107{
7108 if (PyTuple_GET_SIZE(args) == 0)
7109 return do_strip(self, BOTHSTRIP); /* Common case */
7110 else
7111 return do_argstrip(self, BOTHSTRIP, args);
7112}
7113
7114
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007115PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007116"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007117\n\
7118Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007119If chars is given and not None, remove characters in chars instead.\n\
7120If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007121
7122static PyObject *
7123unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7124{
7125 if (PyTuple_GET_SIZE(args) == 0)
7126 return do_strip(self, LEFTSTRIP); /* Common case */
7127 else
7128 return do_argstrip(self, LEFTSTRIP, args);
7129}
7130
7131
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007132PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007133"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007134\n\
7135Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007136If chars is given and not None, remove characters in chars instead.\n\
7137If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007138
7139static PyObject *
7140unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7141{
7142 if (PyTuple_GET_SIZE(args) == 0)
7143 return do_strip(self, RIGHTSTRIP); /* Common case */
7144 else
7145 return do_argstrip(self, RIGHTSTRIP, args);
7146}
7147
7148
Guido van Rossumd57fd912000-03-10 22:53:23 +00007149static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007150unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007151{
7152 PyUnicodeObject *u;
7153 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007154 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007155 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007156
7157 if (len < 0)
7158 len = 0;
7159
Tim Peters7a29bd52001-09-12 03:03:31 +00007160 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007161 /* no repeat, return original string */
7162 Py_INCREF(str);
7163 return (PyObject*) str;
7164 }
Tim Peters8f422462000-09-09 06:13:41 +00007165
7166 /* ensure # of chars needed doesn't overflow int and # of bytes
7167 * needed doesn't overflow size_t
7168 */
7169 nchars = len * str->length;
7170 if (len && nchars / len != str->length) {
7171 PyErr_SetString(PyExc_OverflowError,
7172 "repeated string is too long");
7173 return NULL;
7174 }
7175 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7176 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7177 PyErr_SetString(PyExc_OverflowError,
7178 "repeated string is too long");
7179 return NULL;
7180 }
7181 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007182 if (!u)
7183 return NULL;
7184
7185 p = u->str;
7186
Thomas Wouters477c8d52006-05-27 19:21:47 +00007187 if (str->length == 1 && len > 0) {
7188 Py_UNICODE_FILL(p, str->str[0], len);
7189 } else {
7190 Py_ssize_t done = 0; /* number of characters copied this far */
7191 if (done < nchars) {
7192 Py_UNICODE_COPY(p, str->str, str->length);
7193 done = str->length;
7194 }
7195 while (done < nchars) {
7196 int n = (done <= nchars-done) ? done : nchars-done;
7197 Py_UNICODE_COPY(p+done, p, n);
7198 done += n;
7199 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007200 }
7201
7202 return (PyObject*) u;
7203}
7204
7205PyObject *PyUnicode_Replace(PyObject *obj,
7206 PyObject *subobj,
7207 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007208 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007209{
7210 PyObject *self;
7211 PyObject *str1;
7212 PyObject *str2;
7213 PyObject *result;
7214
7215 self = PyUnicode_FromObject(obj);
7216 if (self == NULL)
7217 return NULL;
7218 str1 = PyUnicode_FromObject(subobj);
7219 if (str1 == NULL) {
7220 Py_DECREF(self);
7221 return NULL;
7222 }
7223 str2 = PyUnicode_FromObject(replobj);
7224 if (str2 == NULL) {
7225 Py_DECREF(self);
7226 Py_DECREF(str1);
7227 return NULL;
7228 }
Tim Petersced69f82003-09-16 20:30:58 +00007229 result = replace((PyUnicodeObject *)self,
7230 (PyUnicodeObject *)str1,
7231 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007232 maxcount);
7233 Py_DECREF(self);
7234 Py_DECREF(str1);
7235 Py_DECREF(str2);
7236 return result;
7237}
7238
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007239PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007240"S.replace (old, new[, maxsplit]) -> unicode\n\
7241\n\
7242Return a copy of S with all occurrences of substring\n\
7243old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007244given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007245
7246static PyObject*
7247unicode_replace(PyUnicodeObject *self, PyObject *args)
7248{
7249 PyUnicodeObject *str1;
7250 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007251 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007252 PyObject *result;
7253
Martin v. Löwis18e16552006-02-15 17:27:45 +00007254 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007255 return NULL;
7256 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7257 if (str1 == NULL)
7258 return NULL;
7259 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007260 if (str2 == NULL) {
7261 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007262 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007263 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007264
7265 result = replace(self, str1, str2, maxcount);
7266
7267 Py_DECREF(str1);
7268 Py_DECREF(str2);
7269 return result;
7270}
7271
7272static
7273PyObject *unicode_repr(PyObject *unicode)
7274{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007275 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007276 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007277 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7278 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7279
7280 /* XXX(nnorwitz): rather than over-allocating, it would be
7281 better to choose a different scheme. Perhaps scan the
7282 first N-chars of the string and allocate based on that size.
7283 */
7284 /* Initial allocation is based on the longest-possible unichr
7285 escape.
7286
7287 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7288 unichr, so in this case it's the longest unichr escape. In
7289 narrow (UTF-16) builds this is five chars per source unichr
7290 since there are two unichrs in the surrogate pair, so in narrow
7291 (UTF-16) builds it's not the longest unichr escape.
7292
7293 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7294 so in the narrow (UTF-16) build case it's the longest unichr
7295 escape.
7296 */
7297
Walter Dörwald1ab83302007-05-18 17:15:44 +00007298 repr = PyUnicode_FromUnicode(NULL,
Walter Dörwald79e913e2007-05-12 11:08:06 +00007299 2 /* quotes */
7300#ifdef Py_UNICODE_WIDE
7301 + 10*size
7302#else
7303 + 6*size
7304#endif
7305 + 1);
7306 if (repr == NULL)
7307 return NULL;
7308
Walter Dörwald1ab83302007-05-18 17:15:44 +00007309 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007310
7311 /* Add quote */
7312 *p++ = (findchar(s, size, '\'') &&
7313 !findchar(s, size, '"')) ? '"' : '\'';
7314 while (size-- > 0) {
7315 Py_UNICODE ch = *s++;
7316
7317 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007318 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007319 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007320 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007321 continue;
7322 }
7323
7324#ifdef Py_UNICODE_WIDE
7325 /* Map 21-bit characters to '\U00xxxxxx' */
7326 else if (ch >= 0x10000) {
7327 *p++ = '\\';
7328 *p++ = 'U';
7329 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
7330 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
7331 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
7332 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
7333 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
7334 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
7335 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
7336 *p++ = hexdigits[ch & 0x0000000F];
7337 continue;
7338 }
7339#else
7340 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
7341 else if (ch >= 0xD800 && ch < 0xDC00) {
7342 Py_UNICODE ch2;
7343 Py_UCS4 ucs;
7344
7345 ch2 = *s++;
7346 size--;
7347 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
7348 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
7349 *p++ = '\\';
7350 *p++ = 'U';
7351 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7352 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7353 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7354 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7355 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7356 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7357 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7358 *p++ = hexdigits[ucs & 0x0000000F];
7359 continue;
7360 }
7361 /* Fall through: isolated surrogates are copied as-is */
7362 s--;
7363 size++;
7364 }
7365#endif
7366
7367 /* Map 16-bit characters to '\uxxxx' */
7368 if (ch >= 256) {
7369 *p++ = '\\';
7370 *p++ = 'u';
7371 *p++ = hexdigits[(ch >> 12) & 0x000F];
7372 *p++ = hexdigits[(ch >> 8) & 0x000F];
7373 *p++ = hexdigits[(ch >> 4) & 0x000F];
7374 *p++ = hexdigits[ch & 0x000F];
7375 }
7376
7377 /* Map special whitespace to '\t', \n', '\r' */
7378 else if (ch == '\t') {
7379 *p++ = '\\';
7380 *p++ = 't';
7381 }
7382 else if (ch == '\n') {
7383 *p++ = '\\';
7384 *p++ = 'n';
7385 }
7386 else if (ch == '\r') {
7387 *p++ = '\\';
7388 *p++ = 'r';
7389 }
7390
7391 /* Map non-printable US ASCII to '\xhh' */
7392 else if (ch < ' ' || ch >= 0x7F) {
7393 *p++ = '\\';
7394 *p++ = 'x';
7395 *p++ = hexdigits[(ch >> 4) & 0x000F];
7396 *p++ = hexdigits[ch & 0x000F];
7397 }
7398
7399 /* Copy everything else as-is */
7400 else
7401 *p++ = (char) ch;
7402 }
7403 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007404 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00007405
7406 *p = '\0';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007407 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00007408 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007409}
7410
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007411PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007412"S.rfind(sub [,start [,end]]) -> int\n\
7413\n\
7414Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007415such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007416arguments start and end are interpreted as in slice notation.\n\
7417\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007418Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007419
7420static PyObject *
7421unicode_rfind(PyUnicodeObject *self, PyObject *args)
7422{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007423 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007424 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007425 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007426 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007427
Guido van Rossumb8872e62000-05-09 14:14:27 +00007428 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
7429 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007430 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007431 substring = PyUnicode_FromObject(substring);
7432 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007433 return NULL;
7434
Thomas Wouters477c8d52006-05-27 19:21:47 +00007435 result = stringlib_rfind_slice(
7436 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7437 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7438 start, end
7439 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007440
7441 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007442
7443 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007444}
7445
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007446PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007447"S.rindex(sub [,start [,end]]) -> int\n\
7448\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007449Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007450
7451static PyObject *
7452unicode_rindex(PyUnicodeObject *self, PyObject *args)
7453{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007454 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007455 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007456 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007457 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007458
Guido van Rossumb8872e62000-05-09 14:14:27 +00007459 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
7460 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007461 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007462 substring = PyUnicode_FromObject(substring);
7463 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007464 return NULL;
7465
Thomas Wouters477c8d52006-05-27 19:21:47 +00007466 result = stringlib_rfind_slice(
7467 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7468 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7469 start, end
7470 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007471
7472 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007473
Guido van Rossumd57fd912000-03-10 22:53:23 +00007474 if (result < 0) {
7475 PyErr_SetString(PyExc_ValueError, "substring not found");
7476 return NULL;
7477 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007478 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007479}
7480
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007481PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007482"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007483\n\
7484Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007485done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007486
7487static PyObject *
7488unicode_rjust(PyUnicodeObject *self, PyObject *args)
7489{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007490 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007491 Py_UNICODE fillchar = ' ';
7492
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007493 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007494 return NULL;
7495
Tim Peters7a29bd52001-09-12 03:03:31 +00007496 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007497 Py_INCREF(self);
7498 return (PyObject*) self;
7499 }
7500
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007501 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007502}
7503
Guido van Rossumd57fd912000-03-10 22:53:23 +00007504static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007505unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007506{
7507 /* standard clamping */
7508 if (start < 0)
7509 start = 0;
7510 if (end < 0)
7511 end = 0;
7512 if (end > self->length)
7513 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007514 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007515 /* full slice, return original string */
7516 Py_INCREF(self);
7517 return (PyObject*) self;
7518 }
7519 if (start > end)
7520 start = end;
7521 /* copy slice */
7522 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7523 end - start);
7524}
7525
7526PyObject *PyUnicode_Split(PyObject *s,
7527 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007528 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007529{
7530 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007531
Guido van Rossumd57fd912000-03-10 22:53:23 +00007532 s = PyUnicode_FromObject(s);
7533 if (s == NULL)
7534 return NULL;
7535 if (sep != NULL) {
7536 sep = PyUnicode_FromObject(sep);
7537 if (sep == NULL) {
7538 Py_DECREF(s);
7539 return NULL;
7540 }
7541 }
7542
7543 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7544
7545 Py_DECREF(s);
7546 Py_XDECREF(sep);
7547 return result;
7548}
7549
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007550PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007551"S.split([sep [,maxsplit]]) -> list of strings\n\
7552\n\
7553Return a list of the words in S, using sep as the\n\
7554delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00007555splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00007556any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007557
7558static PyObject*
7559unicode_split(PyUnicodeObject *self, PyObject *args)
7560{
7561 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007562 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007563
Martin v. Löwis18e16552006-02-15 17:27:45 +00007564 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007565 return NULL;
7566
7567 if (substring == Py_None)
7568 return split(self, NULL, maxcount);
7569 else if (PyUnicode_Check(substring))
7570 return split(self, (PyUnicodeObject *)substring, maxcount);
7571 else
7572 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7573}
7574
Thomas Wouters477c8d52006-05-27 19:21:47 +00007575PyObject *
7576PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7577{
7578 PyObject* str_obj;
7579 PyObject* sep_obj;
7580 PyObject* out;
7581
7582 str_obj = PyUnicode_FromObject(str_in);
7583 if (!str_obj)
7584 return NULL;
7585 sep_obj = PyUnicode_FromObject(sep_in);
7586 if (!sep_obj) {
7587 Py_DECREF(str_obj);
7588 return NULL;
7589 }
7590
7591 out = stringlib_partition(
7592 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7593 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7594 );
7595
7596 Py_DECREF(sep_obj);
7597 Py_DECREF(str_obj);
7598
7599 return out;
7600}
7601
7602
7603PyObject *
7604PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7605{
7606 PyObject* str_obj;
7607 PyObject* sep_obj;
7608 PyObject* out;
7609
7610 str_obj = PyUnicode_FromObject(str_in);
7611 if (!str_obj)
7612 return NULL;
7613 sep_obj = PyUnicode_FromObject(sep_in);
7614 if (!sep_obj) {
7615 Py_DECREF(str_obj);
7616 return NULL;
7617 }
7618
7619 out = stringlib_rpartition(
7620 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7621 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7622 );
7623
7624 Py_DECREF(sep_obj);
7625 Py_DECREF(str_obj);
7626
7627 return out;
7628}
7629
7630PyDoc_STRVAR(partition__doc__,
7631"S.partition(sep) -> (head, sep, tail)\n\
7632\n\
7633Searches for the separator sep in S, and returns the part before it,\n\
7634the separator itself, and the part after it. If the separator is not\n\
7635found, returns S and two empty strings.");
7636
7637static PyObject*
7638unicode_partition(PyUnicodeObject *self, PyObject *separator)
7639{
7640 return PyUnicode_Partition((PyObject *)self, separator);
7641}
7642
7643PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007644"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007645\n\
7646Searches for the separator sep in S, starting at the end of S, and returns\n\
7647the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00007648separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007649
7650static PyObject*
7651unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7652{
7653 return PyUnicode_RPartition((PyObject *)self, separator);
7654}
7655
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007656PyObject *PyUnicode_RSplit(PyObject *s,
7657 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007658 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007659{
7660 PyObject *result;
7661
7662 s = PyUnicode_FromObject(s);
7663 if (s == NULL)
7664 return NULL;
7665 if (sep != NULL) {
7666 sep = PyUnicode_FromObject(sep);
7667 if (sep == NULL) {
7668 Py_DECREF(s);
7669 return NULL;
7670 }
7671 }
7672
7673 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7674
7675 Py_DECREF(s);
7676 Py_XDECREF(sep);
7677 return result;
7678}
7679
7680PyDoc_STRVAR(rsplit__doc__,
7681"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7682\n\
7683Return a list of the words in S, using sep as the\n\
7684delimiter string, starting at the end of the string and\n\
7685working to the front. If maxsplit is given, at most maxsplit\n\
7686splits are done. If sep is not specified, any whitespace string\n\
7687is a separator.");
7688
7689static PyObject*
7690unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7691{
7692 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007693 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007694
Martin v. Löwis18e16552006-02-15 17:27:45 +00007695 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007696 return NULL;
7697
7698 if (substring == Py_None)
7699 return rsplit(self, NULL, maxcount);
7700 else if (PyUnicode_Check(substring))
7701 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7702 else
7703 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7704}
7705
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007706PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007707"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007708\n\
7709Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007710Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007711is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007712
7713static PyObject*
7714unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7715{
Guido van Rossum86662912000-04-11 15:38:46 +00007716 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007717
Guido van Rossum86662912000-04-11 15:38:46 +00007718 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007719 return NULL;
7720
Guido van Rossum86662912000-04-11 15:38:46 +00007721 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007722}
7723
7724static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007725PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007726{
Walter Dörwald346737f2007-05-31 10:44:43 +00007727 if (PyUnicode_CheckExact(self)) {
7728 Py_INCREF(self);
7729 return self;
7730 } else
7731 /* Subtype -- return genuine unicode string with the same value. */
7732 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
7733 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007734}
7735
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007736PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007737"S.swapcase() -> unicode\n\
7738\n\
7739Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007740and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007741
7742static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007743unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007744{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007745 return fixup(self, fixswapcase);
7746}
7747
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007748PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007749"S.translate(table) -> unicode\n\
7750\n\
7751Return a copy of the string S, where all characters have been mapped\n\
7752through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007753Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7754Unmapped characters are left untouched. Characters mapped to None\n\
7755are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007756
7757static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007758unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007759{
Tim Petersced69f82003-09-16 20:30:58 +00007760 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007761 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00007762 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007763 "ignore");
7764}
7765
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007766PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007767"S.upper() -> unicode\n\
7768\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007769Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007770
7771static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007772unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007773{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007774 return fixup(self, fixupper);
7775}
7776
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007777PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007778"S.zfill(width) -> unicode\n\
7779\n\
7780Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007781of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007782
7783static PyObject *
7784unicode_zfill(PyUnicodeObject *self, PyObject *args)
7785{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007786 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007787 PyUnicodeObject *u;
7788
Martin v. Löwis18e16552006-02-15 17:27:45 +00007789 Py_ssize_t width;
7790 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007791 return NULL;
7792
7793 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007794 if (PyUnicode_CheckExact(self)) {
7795 Py_INCREF(self);
7796 return (PyObject*) self;
7797 }
7798 else
7799 return PyUnicode_FromUnicode(
7800 PyUnicode_AS_UNICODE(self),
7801 PyUnicode_GET_SIZE(self)
7802 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007803 }
7804
7805 fill = width - self->length;
7806
7807 u = pad(self, fill, 0, '0');
7808
Walter Dörwald068325e2002-04-15 13:36:47 +00007809 if (u == NULL)
7810 return NULL;
7811
Guido van Rossumd57fd912000-03-10 22:53:23 +00007812 if (u->str[fill] == '+' || u->str[fill] == '-') {
7813 /* move sign to beginning of string */
7814 u->str[0] = u->str[fill];
7815 u->str[fill] = '0';
7816 }
7817
7818 return (PyObject*) u;
7819}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007820
#if 0
/* Debugging aid (compiled out): reports unicode_freelist_size as a
   Python integer. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
7828
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007829PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007830"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007831\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007832Return True if S starts with the specified prefix, False otherwise.\n\
7833With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007834With optional end, stop comparing S at that position.\n\
7835prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007836
7837static PyObject *
7838unicode_startswith(PyUnicodeObject *self,
7839 PyObject *args)
7840{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007841 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007842 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007843 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007844 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007845 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007846
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007847 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007848 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007849 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007850 if (PyTuple_Check(subobj)) {
7851 Py_ssize_t i;
7852 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7853 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7854 PyTuple_GET_ITEM(subobj, i));
7855 if (substring == NULL)
7856 return NULL;
7857 result = tailmatch(self, substring, start, end, -1);
7858 Py_DECREF(substring);
7859 if (result) {
7860 Py_RETURN_TRUE;
7861 }
7862 }
7863 /* nothing matched */
7864 Py_RETURN_FALSE;
7865 }
7866 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007867 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007868 return NULL;
7869 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007870 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007871 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007872}
7873
7874
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007875PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007876"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007877\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007878Return True if S ends with the specified suffix, False otherwise.\n\
7879With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007880With optional end, stop comparing S at that position.\n\
7881suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007882
7883static PyObject *
7884unicode_endswith(PyUnicodeObject *self,
7885 PyObject *args)
7886{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007887 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007888 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007889 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007890 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007891 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007892
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007893 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7894 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007895 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007896 if (PyTuple_Check(subobj)) {
7897 Py_ssize_t i;
7898 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7899 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7900 PyTuple_GET_ITEM(subobj, i));
7901 if (substring == NULL)
7902 return NULL;
7903 result = tailmatch(self, substring, start, end, +1);
7904 Py_DECREF(substring);
7905 if (result) {
7906 Py_RETURN_TRUE;
7907 }
7908 }
7909 Py_RETURN_FALSE;
7910 }
7911 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007912 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007913 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007914
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007915 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007916 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007917 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007918}
7919
Eric Smith8c663262007-08-25 02:26:07 +00007920#include "stringlib/string_format.h"
7921
7922PyDoc_STRVAR(format__doc__,
7923"S.format(*args, **kwargs) -> unicode\n\
7924\n\
7925");
7926
7927static PyObject *
7928unicode_format(PyObject *self, PyObject *args, PyObject *kwds)
7929{
7930 /* this calls into stringlib/string_format.h because it can be
7931 included for either string or unicode. this is needed for
7932 python 2.6. */
7933 return do_string_format(self, args, kwds);
7934}
7935
7936
7937PyDoc_STRVAR(p_format__doc__,
7938"S.__format__(format_spec) -> unicode\n\
7939\n\
7940");
7941
7942static PyObject *
7943unicode__format__(PyObject *self, PyObject *args)
7944{
7945 return unicode_unicode__format__(self, args);
7946}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007947
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007948
7949static PyObject *
7950unicode_getnewargs(PyUnicodeObject *v)
7951{
7952 return Py_BuildValue("(u#)", v->str, v->length);
7953}
7954
7955
Guido van Rossumd57fd912000-03-10 22:53:23 +00007956static PyMethodDef unicode_methods[] = {
7957
7958 /* Order is according to common usage: often used methods should
7959 appear first, since lookup is done sequentially. */
7960
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007961 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
7962 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7963 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007964 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007965 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7966 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7967 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7968 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7969 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7970 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7971 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007972 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007973 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7974 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7975 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007976 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007977 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007978/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7979 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7980 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7981 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007982 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007983 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007984 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007985 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007986 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7987 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7988 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7989 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7990 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7991 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7992 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7993 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7994 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7995 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7996 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7997 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7998 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7999 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008000 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008001 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith8c663262007-08-25 02:26:07 +00008002 {"format", (PyCFunction) unicode_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
8003 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008004 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8005 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Walter Dörwald068325e2002-04-15 13:36:47 +00008006#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008007 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008008#endif
8009
8010#if 0
8011 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008012 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008013#endif
8014
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008015 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008016 {NULL, NULL}
8017};
8018
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008019static PyObject *
8020unicode_mod(PyObject *v, PyObject *w)
8021{
8022 if (!PyUnicode_Check(v)) {
8023 Py_INCREF(Py_NotImplemented);
8024 return Py_NotImplemented;
8025 }
8026 return PyUnicode_Format(v, w);
8027}
8028
8029static PyNumberMethods unicode_as_number = {
8030 0, /*nb_add*/
8031 0, /*nb_subtract*/
8032 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008033 unicode_mod, /*nb_remainder*/
8034};
8035
Guido van Rossumd57fd912000-03-10 22:53:23 +00008036static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008037 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008038 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008039 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8040 (ssizeargfunc) unicode_getitem, /* sq_item */
8041 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008042 0, /* sq_ass_item */
8043 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008044 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008045};
8046
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008047static PyObject*
8048unicode_subscript(PyUnicodeObject* self, PyObject* item)
8049{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008050 if (PyIndex_Check(item)) {
8051 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008052 if (i == -1 && PyErr_Occurred())
8053 return NULL;
8054 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008055 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008056 return unicode_getitem(self, i);
8057 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008058 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008059 Py_UNICODE* source_buf;
8060 Py_UNICODE* result_buf;
8061 PyObject* result;
8062
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008063 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008064 &start, &stop, &step, &slicelength) < 0) {
8065 return NULL;
8066 }
8067
8068 if (slicelength <= 0) {
8069 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008070 } else if (start == 0 && step == 1 && slicelength == self->length &&
8071 PyUnicode_CheckExact(self)) {
8072 Py_INCREF(self);
8073 return (PyObject *)self;
8074 } else if (step == 1) {
8075 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008076 } else {
8077 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008078 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
8079 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008080
8081 if (result_buf == NULL)
8082 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008083
8084 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8085 result_buf[i] = source_buf[cur];
8086 }
Tim Petersced69f82003-09-16 20:30:58 +00008087
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008088 result = PyUnicode_FromUnicode(result_buf, slicelength);
8089 PyMem_FREE(result_buf);
8090 return result;
8091 }
8092 } else {
8093 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8094 return NULL;
8095 }
8096}
8097
8098static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008099 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008100 (binaryfunc)unicode_subscript, /* mp_subscript */
8101 (objobjargproc)0, /* mp_ass_subscript */
8102};
8103
Guido van Rossumd57fd912000-03-10 22:53:23 +00008104
8105static int
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00008106unicode_buffer_getbuffer(PyUnicodeObject *self, PyBuffer *view, int flags)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008107{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008108
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00008109 if (flags & PyBUF_CHARACTER) {
Guido van Rossuma74184e2007-08-29 04:05:57 +00008110 PyErr_SetString(PyExc_SystemError, "can't use str as char buffer");
8111 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008112 }
Guido van Rossuma74184e2007-08-29 04:05:57 +00008113 return PyBuffer_FillInfo(view, (void *)self->str,
8114 PyUnicode_GET_DATA_SIZE(self), 1, flags);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008115}
8116
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00008117
Guido van Rossumd57fd912000-03-10 22:53:23 +00008118/* Helpers for PyUnicode_Format() */
8119
8120static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008121getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008122{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008123 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008124 if (argidx < arglen) {
8125 (*p_argidx)++;
8126 if (arglen < 0)
8127 return args;
8128 else
8129 return PyTuple_GetItem(args, argidx);
8130 }
8131 PyErr_SetString(PyExc_TypeError,
8132 "not enough arguments for format string");
8133 return NULL;
8134}
8135
/* Bit flags collected while parsing a format specifier.  F_ALT selects the
   '#' alternate form in formatfloat()/formatint(); the others presumably
   mirror the printf flags '-', '+', ' ' and '0' -- confirm against the
   parser in PyUnicode_Format(). */
#define F_LJUST 0x01
#define F_SIGN  0x02
#define F_BLANK 0x04
#define F_ALT   0x08
#define F_ZERO  0x10
8141
Martin v. Löwis18e16552006-02-15 17:27:45 +00008142static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008143strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008144{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008145 register Py_ssize_t i;
8146 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008147 for (i = len - 1; i >= 0; i--)
8148 buffer[i] = (Py_UNICODE) charbuffer[i];
8149
Guido van Rossumd57fd912000-03-10 22:53:23 +00008150 return len;
8151}
8152
Neal Norwitzfc76d632006-01-10 06:03:13 +00008153static int
8154doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8155{
Tim Peters15231542006-02-16 01:08:01 +00008156 Py_ssize_t result;
8157
Neal Norwitzfc76d632006-01-10 06:03:13 +00008158 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008159 result = strtounicode(buffer, (char *)buffer);
8160 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008161}
8162
8163static int
8164longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8165{
Tim Peters15231542006-02-16 01:08:01 +00008166 Py_ssize_t result;
8167
Neal Norwitzfc76d632006-01-10 06:03:13 +00008168 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008169 result = strtounicode(buffer, (char *)buffer);
8170 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008171}
8172
Guido van Rossum078151d2002-08-11 04:24:12 +00008173/* XXX To save some code duplication, formatfloat/long/int could have been
8174 shared with stringobject.c, converting from 8-bit to Unicode after the
8175 formatting is done. */
8176
Guido van Rossumd57fd912000-03-10 22:53:23 +00008177static int
8178formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008179 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008180 int flags,
8181 int prec,
8182 int type,
8183 PyObject *v)
8184{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008185 /* fmt = '%#.' + `prec` + `type`
8186 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00008187 char fmt[20];
8188 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008189
Guido van Rossumd57fd912000-03-10 22:53:23 +00008190 x = PyFloat_AsDouble(v);
8191 if (x == -1.0 && PyErr_Occurred())
8192 return -1;
8193 if (prec < 0)
8194 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008195 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8196 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008197 /* Worst case length calc to ensure no buffer overrun:
8198
8199 'g' formats:
8200 fmt = %#.<prec>g
8201 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8202 for any double rep.)
8203 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8204
8205 'f' formats:
8206 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8207 len = 1 + 50 + 1 + prec = 52 + prec
8208
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008209 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008210 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008211
8212 */
Guido van Rossumb5a755e2007-07-18 18:15:48 +00008213 if (((type == 'g' || type == 'G') &&
8214 buflen <= (size_t)10 + (size_t)prec) ||
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008215 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008216 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008217 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008218 return -1;
8219 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008220 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8221 (flags&F_ALT) ? "#" : "",
8222 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008223 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008224}
8225
Tim Peters38fd5b62000-09-21 05:43:11 +00008226static PyObject*
8227formatlong(PyObject *val, int flags, int prec, int type)
8228{
8229 char *buf;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008230 int len;
Tim Peters38fd5b62000-09-21 05:43:11 +00008231 PyObject *str; /* temporary string object. */
Walter Dörwald63a28be2007-06-20 15:11:12 +00008232 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008233
8234 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8235 if (!str)
8236 return NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008237 result = PyUnicode_FromStringAndSize(buf, len);
Tim Peters38fd5b62000-09-21 05:43:11 +00008238 Py_DECREF(str);
Walter Dörwald63a28be2007-06-20 15:11:12 +00008239 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008240}
8241
Guido van Rossumd57fd912000-03-10 22:53:23 +00008242static int
8243formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008244 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008245 int flags,
8246 int prec,
8247 int type,
8248 PyObject *v)
8249{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008250 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008251 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8252 * + 1 + 1
8253 * = 24
8254 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008255 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008256 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008257 long x;
8258
8259 x = PyInt_AsLong(v);
8260 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008261 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008262 if (x < 0 && type == 'u') {
8263 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008264 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008265 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8266 sign = "-";
8267 else
8268 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008269 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008270 prec = 1;
8271
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008272 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8273 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008274 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008275 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008276 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008277 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008278 return -1;
8279 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008280
8281 if ((flags & F_ALT) &&
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008282 (type == 'x' || type == 'X' || type == 'o')) {
8283 /* When converting under %#o, %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008284 * of issues that cause pain:
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008285 * - for %#o, we want a different base marker than C
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008286 * - when 0 is being converted, the C standard leaves off
8287 * the '0x' or '0X', which is inconsistent with other
8288 * %#x/%#X conversions and inconsistent with Python's
8289 * hex() function
8290 * - there are platforms that violate the standard and
8291 * convert 0 with the '0x' or '0X'
8292 * (Metrowerks, Compaq Tru64)
8293 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008294 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008295 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008296 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008297 * We can achieve the desired consistency by inserting our
8298 * own '0x' or '0X' prefix, and substituting %x/%X in place
8299 * of %#x/%#X.
8300 *
8301 * Note that this is the same approach as used in
8302 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008303 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008304 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8305 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008306 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008307 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008308 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8309 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008310 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008311 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008312 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008313 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008314 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008315 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008316}
8317
8318static int
8319formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008320 size_t buflen,
8321 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008322{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008323 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008324 if (PyUnicode_Check(v)) {
8325 if (PyUnicode_GET_SIZE(v) != 1)
8326 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008327 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008328 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008329
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008330 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00008331 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008332 goto onError;
8333 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
8334 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008335
8336 else {
8337 /* Integer input truncated to a character */
8338 long x;
8339 x = PyInt_AsLong(v);
8340 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008341 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008342#ifdef Py_UNICODE_WIDE
8343 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008344 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008345 "%c arg not in range(0x110000) "
8346 "(wide Python build)");
8347 return -1;
8348 }
8349#else
8350 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008351 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008352 "%c arg not in range(0x10000) "
8353 "(narrow Python build)");
8354 return -1;
8355 }
8356#endif
8357 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008358 }
8359 buf[1] = '\0';
8360 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008361
8362 onError:
8363 PyErr_SetString(PyExc_TypeError,
8364 "%c requires int or char");
8365 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008366}
8367
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008368/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8369
8370 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
8371 chars are formatted. XXX This is a magic number. Each formatting
8372 routine does bounds checking to ensure no overflow, but a better
8373 solution may be to malloc a buffer of appropriate size for each
8374 format. For now, the current solution is sufficient.
8375*/
8376#define FORMATBUFLEN (size_t)120
8377
Guido van Rossumd57fd912000-03-10 22:53:23 +00008378PyObject *PyUnicode_Format(PyObject *format,
8379 PyObject *args)
8380{
8381 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008382 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008383 int args_owned = 0;
8384 PyUnicodeObject *result = NULL;
8385 PyObject *dict = NULL;
8386 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008387
Guido van Rossumd57fd912000-03-10 22:53:23 +00008388 if (format == NULL || args == NULL) {
8389 PyErr_BadInternalCall();
8390 return NULL;
8391 }
8392 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008393 if (uformat == NULL)
8394 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008395 fmt = PyUnicode_AS_UNICODE(uformat);
8396 fmtcnt = PyUnicode_GET_SIZE(uformat);
8397
8398 reslen = rescnt = fmtcnt + 100;
8399 result = _PyUnicode_New(reslen);
8400 if (result == NULL)
8401 goto onError;
8402 res = PyUnicode_AS_UNICODE(result);
8403
8404 if (PyTuple_Check(args)) {
8405 arglen = PyTuple_Size(args);
8406 argidx = 0;
8407 }
8408 else {
8409 arglen = -1;
8410 argidx = -2;
8411 }
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00008412 if (Py_Type(args)->tp_as_mapping && !PyTuple_Check(args) &&
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008413 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008414 dict = args;
8415
8416 while (--fmtcnt >= 0) {
8417 if (*fmt != '%') {
8418 if (--rescnt < 0) {
8419 rescnt = fmtcnt + 100;
8420 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008421 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008422 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008423 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8424 --rescnt;
8425 }
8426 *res++ = *fmt++;
8427 }
8428 else {
8429 /* Got a format specifier */
8430 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008431 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008432 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008433 Py_UNICODE c = '\0';
8434 Py_UNICODE fill;
8435 PyObject *v = NULL;
8436 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008437 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008438 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008439 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008440 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008441
8442 fmt++;
8443 if (*fmt == '(') {
8444 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008445 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008446 PyObject *key;
8447 int pcount = 1;
8448
8449 if (dict == NULL) {
8450 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008451 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008452 goto onError;
8453 }
8454 ++fmt;
8455 --fmtcnt;
8456 keystart = fmt;
8457 /* Skip over balanced parentheses */
8458 while (pcount > 0 && --fmtcnt >= 0) {
8459 if (*fmt == ')')
8460 --pcount;
8461 else if (*fmt == '(')
8462 ++pcount;
8463 fmt++;
8464 }
8465 keylen = fmt - keystart - 1;
8466 if (fmtcnt < 0 || pcount > 0) {
8467 PyErr_SetString(PyExc_ValueError,
8468 "incomplete format key");
8469 goto onError;
8470 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008471#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008472 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008473 then looked up since Python uses strings to hold
8474 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008475 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008476 key = PyUnicode_EncodeUTF8(keystart,
8477 keylen,
8478 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008479#else
8480 key = PyUnicode_FromUnicode(keystart, keylen);
8481#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008482 if (key == NULL)
8483 goto onError;
8484 if (args_owned) {
8485 Py_DECREF(args);
8486 args_owned = 0;
8487 }
8488 args = PyObject_GetItem(dict, key);
8489 Py_DECREF(key);
8490 if (args == NULL) {
8491 goto onError;
8492 }
8493 args_owned = 1;
8494 arglen = -1;
8495 argidx = -2;
8496 }
8497 while (--fmtcnt >= 0) {
8498 switch (c = *fmt++) {
8499 case '-': flags |= F_LJUST; continue;
8500 case '+': flags |= F_SIGN; continue;
8501 case ' ': flags |= F_BLANK; continue;
8502 case '#': flags |= F_ALT; continue;
8503 case '0': flags |= F_ZERO; continue;
8504 }
8505 break;
8506 }
8507 if (c == '*') {
8508 v = getnextarg(args, arglen, &argidx);
8509 if (v == NULL)
8510 goto onError;
8511 if (!PyInt_Check(v)) {
8512 PyErr_SetString(PyExc_TypeError,
8513 "* wants int");
8514 goto onError;
8515 }
8516 width = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008517 if (width == -1 && PyErr_Occurred())
8518 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008519 if (width < 0) {
8520 flags |= F_LJUST;
8521 width = -width;
8522 }
8523 if (--fmtcnt >= 0)
8524 c = *fmt++;
8525 }
8526 else if (c >= '0' && c <= '9') {
8527 width = c - '0';
8528 while (--fmtcnt >= 0) {
8529 c = *fmt++;
8530 if (c < '0' || c > '9')
8531 break;
8532 if ((width*10) / 10 != width) {
8533 PyErr_SetString(PyExc_ValueError,
8534 "width too big");
8535 goto onError;
8536 }
8537 width = width*10 + (c - '0');
8538 }
8539 }
8540 if (c == '.') {
8541 prec = 0;
8542 if (--fmtcnt >= 0)
8543 c = *fmt++;
8544 if (c == '*') {
8545 v = getnextarg(args, arglen, &argidx);
8546 if (v == NULL)
8547 goto onError;
8548 if (!PyInt_Check(v)) {
8549 PyErr_SetString(PyExc_TypeError,
8550 "* wants int");
8551 goto onError;
8552 }
8553 prec = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008554 if (prec == -1 && PyErr_Occurred())
8555 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008556 if (prec < 0)
8557 prec = 0;
8558 if (--fmtcnt >= 0)
8559 c = *fmt++;
8560 }
8561 else if (c >= '0' && c <= '9') {
8562 prec = c - '0';
8563 while (--fmtcnt >= 0) {
8564 c = Py_CHARMASK(*fmt++);
8565 if (c < '0' || c > '9')
8566 break;
8567 if ((prec*10) / 10 != prec) {
8568 PyErr_SetString(PyExc_ValueError,
8569 "prec too big");
8570 goto onError;
8571 }
8572 prec = prec*10 + (c - '0');
8573 }
8574 }
8575 } /* prec */
8576 if (fmtcnt >= 0) {
8577 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008578 if (--fmtcnt >= 0)
8579 c = *fmt++;
8580 }
8581 }
8582 if (fmtcnt < 0) {
8583 PyErr_SetString(PyExc_ValueError,
8584 "incomplete format");
8585 goto onError;
8586 }
8587 if (c != '%') {
8588 v = getnextarg(args, arglen, &argidx);
8589 if (v == NULL)
8590 goto onError;
8591 }
8592 sign = 0;
8593 fill = ' ';
8594 switch (c) {
8595
8596 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008597 pbuf = formatbuf;
8598 /* presume that buffer length is at least 1 */
8599 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008600 len = 1;
8601 break;
8602
8603 case 's':
8604 case 'r':
8605 if (PyUnicode_Check(v) && c == 's') {
8606 temp = v;
8607 Py_INCREF(temp);
8608 }
8609 else {
8610 PyObject *unicode;
8611 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008612 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008613 else
8614 temp = PyObject_Repr(v);
8615 if (temp == NULL)
8616 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008617 if (PyUnicode_Check(temp))
8618 /* nothing to do */;
8619 else if (PyString_Check(temp)) {
8620 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00008621 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00008622 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00008623 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008624 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008625 Py_DECREF(temp);
8626 temp = unicode;
8627 if (temp == NULL)
8628 goto onError;
8629 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008630 else {
8631 Py_DECREF(temp);
8632 PyErr_SetString(PyExc_TypeError,
8633 "%s argument has non-string str()");
8634 goto onError;
8635 }
8636 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008637 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008638 len = PyUnicode_GET_SIZE(temp);
8639 if (prec >= 0 && len > prec)
8640 len = prec;
8641 break;
8642
8643 case 'i':
8644 case 'd':
8645 case 'u':
8646 case 'o':
8647 case 'x':
8648 case 'X':
8649 if (c == 'i')
8650 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00008651 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008652 temp = formatlong(v, flags, prec, c);
8653 if (!temp)
8654 goto onError;
8655 pbuf = PyUnicode_AS_UNICODE(temp);
8656 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00008657 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008658 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008659 else {
8660 pbuf = formatbuf;
8661 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8662 flags, prec, c, v);
8663 if (len < 0)
8664 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008665 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008666 }
8667 if (flags & F_ZERO)
8668 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008669 break;
8670
8671 case 'e':
8672 case 'E':
8673 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008674 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008675 case 'g':
8676 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008677 if (c == 'F')
8678 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008679 pbuf = formatbuf;
8680 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8681 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008682 if (len < 0)
8683 goto onError;
8684 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008685 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008686 fill = '0';
8687 break;
8688
8689 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008690 pbuf = formatbuf;
8691 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008692 if (len < 0)
8693 goto onError;
8694 break;
8695
8696 default:
8697 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008698 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00008699 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008700 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008701 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00008702 (Py_ssize_t)(fmt - 1 -
8703 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008704 goto onError;
8705 }
8706 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008707 if (*pbuf == '-' || *pbuf == '+') {
8708 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008709 len--;
8710 }
8711 else if (flags & F_SIGN)
8712 sign = '+';
8713 else if (flags & F_BLANK)
8714 sign = ' ';
8715 else
8716 sign = 0;
8717 }
8718 if (width < len)
8719 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008720 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008721 reslen -= rescnt;
8722 rescnt = width + fmtcnt + 100;
8723 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008724 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008725 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008726 PyErr_NoMemory();
8727 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008728 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008729 if (_PyUnicode_Resize(&result, reslen) < 0) {
8730 Py_XDECREF(temp);
8731 goto onError;
8732 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008733 res = PyUnicode_AS_UNICODE(result)
8734 + reslen - rescnt;
8735 }
8736 if (sign) {
8737 if (fill != ' ')
8738 *res++ = sign;
8739 rescnt--;
8740 if (width > len)
8741 width--;
8742 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008743 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008744 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008745 assert(pbuf[1] == c);
8746 if (fill != ' ') {
8747 *res++ = *pbuf++;
8748 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008749 }
Tim Petersfff53252001-04-12 18:38:48 +00008750 rescnt -= 2;
8751 width -= 2;
8752 if (width < 0)
8753 width = 0;
8754 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008755 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008756 if (width > len && !(flags & F_LJUST)) {
8757 do {
8758 --rescnt;
8759 *res++ = fill;
8760 } while (--width > len);
8761 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008762 if (fill == ' ') {
8763 if (sign)
8764 *res++ = sign;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008765 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008766 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008767 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008768 *res++ = *pbuf++;
8769 *res++ = *pbuf++;
8770 }
8771 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008772 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008773 res += len;
8774 rescnt -= len;
8775 while (--width >= len) {
8776 --rescnt;
8777 *res++ = ' ';
8778 }
8779 if (dict && (argidx < arglen) && c != '%') {
8780 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008781 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008782 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008783 goto onError;
8784 }
8785 Py_XDECREF(temp);
8786 } /* '%' */
8787 } /* until end */
8788 if (argidx < arglen && !dict) {
8789 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008790 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008791 goto onError;
8792 }
8793
Thomas Woutersa96affe2006-03-12 00:29:36 +00008794 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8795 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008796 if (args_owned) {
8797 Py_DECREF(args);
8798 }
8799 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008800 return (PyObject *)result;
8801
8802 onError:
8803 Py_XDECREF(result);
8804 Py_DECREF(uformat);
8805 if (args_owned) {
8806 Py_DECREF(args);
8807 }
8808 return NULL;
8809}
8810
8811static PyBufferProcs unicode_as_buffer = {
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00008812 (getbufferproc) unicode_buffer_getbuffer,
8813 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008814};
8815
Jeremy Hylton938ace62002-07-17 16:30:39 +00008816static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008817unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8818
Tim Peters6d6c1a32001-08-02 04:15:00 +00008819static PyObject *
8820unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8821{
8822 PyObject *x = NULL;
Guido van Rossum55b4a7b2007-07-11 09:28:11 +00008823 static char *kwlist[] = {"object", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008824 char *encoding = NULL;
8825 char *errors = NULL;
8826
Guido van Rossume023fe02001-08-30 03:12:59 +00008827 if (type != &PyUnicode_Type)
8828 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008829 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8830 kwlist, &x, &encoding, &errors))
8831 return NULL;
8832 if (x == NULL)
8833 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008834 if (encoding == NULL && errors == NULL)
8835 return PyObject_Unicode(x);
8836 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008837 return PyUnicode_FromEncodedObject(x, encoding, errors);
8838}
8839
Guido van Rossume023fe02001-08-30 03:12:59 +00008840static PyObject *
8841unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8842{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008843 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008844 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008845
8846 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8847 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8848 if (tmp == NULL)
8849 return NULL;
8850 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008851 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008852 if (pnew == NULL) {
8853 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008854 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008855 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008856 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
8857 if (pnew->str == NULL) {
8858 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008859 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008860 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008861 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008862 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008863 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8864 pnew->length = n;
8865 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008866 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008867 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008868}
8869
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008870PyDoc_STRVAR(unicode_doc,
Collin Winterd474ce82007-08-07 19:42:11 +00008871"str(string [, encoding[, errors]]) -> object\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00008872\n\
Collin Winterd474ce82007-08-07 19:42:11 +00008873Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008874encoding defaults to the current default string encoding.\n\
8875errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008876
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008877static PyObject *unicode_iter(PyObject *seq);
8878
Guido van Rossumd57fd912000-03-10 22:53:23 +00008879PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00008880 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossum84fc66d2007-05-03 17:18:26 +00008881 "str", /* tp_name */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008882 sizeof(PyUnicodeObject), /* tp_size */
8883 0, /* tp_itemsize */
8884 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00008885 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008886 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008887 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008888 0, /* tp_setattr */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008889 0, /* tp_compare */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008890 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008891 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008892 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008893 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008894 (hashfunc) unicode_hash, /* tp_hash*/
8895 0, /* tp_call*/
8896 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008897 PyObject_GenericGetAttr, /* tp_getattro */
8898 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008899 &unicode_as_buffer, /* tp_as_buffer */
Thomas Wouters27d517b2007-02-25 20:39:11 +00008900 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
8901 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008902 unicode_doc, /* tp_doc */
8903 0, /* tp_traverse */
8904 0, /* tp_clear */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008905 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008906 0, /* tp_weaklistoffset */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008907 unicode_iter, /* tp_iter */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008908 0, /* tp_iternext */
8909 unicode_methods, /* tp_methods */
8910 0, /* tp_members */
8911 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00008912 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008913 0, /* tp_dict */
8914 0, /* tp_descr_get */
8915 0, /* tp_descr_set */
8916 0, /* tp_dictoffset */
8917 0, /* tp_init */
8918 0, /* tp_alloc */
8919 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008920 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008921};
8922
8923/* Initialize the Unicode implementation */
8924
Thomas Wouters78890102000-07-22 19:25:51 +00008925void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008926{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008927 int i;
8928
Thomas Wouters477c8d52006-05-27 19:21:47 +00008929 /* XXX - move this array to unicodectype.c ? */
8930 Py_UNICODE linebreak[] = {
8931 0x000A, /* LINE FEED */
8932 0x000D, /* CARRIAGE RETURN */
8933 0x001C, /* FILE SEPARATOR */
8934 0x001D, /* GROUP SEPARATOR */
8935 0x001E, /* RECORD SEPARATOR */
8936 0x0085, /* NEXT LINE */
8937 0x2028, /* LINE SEPARATOR */
8938 0x2029, /* PARAGRAPH SEPARATOR */
8939 };
8940
Fred Drakee4315f52000-05-09 19:53:39 +00008941 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008942 unicode_freelist = NULL;
8943 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008944 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008945 if (!unicode_empty)
8946 return;
8947
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008948 for (i = 0; i < 256; i++)
8949 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008950 if (PyType_Ready(&PyUnicode_Type) < 0)
8951 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008952
8953 /* initialize the linebreak bloom filter */
8954 bloom_linebreak = make_bloom_mask(
8955 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8956 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008957
8958 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008959}
8960
8961/* Finalize the Unicode implementation */
8962
8963void
Thomas Wouters78890102000-07-22 19:25:51 +00008964_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008965{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008966 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008967 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008968
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008969 Py_XDECREF(unicode_empty);
8970 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008971
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008972 for (i = 0; i < 256; i++) {
8973 if (unicode_latin1[i]) {
8974 Py_DECREF(unicode_latin1[i]);
8975 unicode_latin1[i] = NULL;
8976 }
8977 }
8978
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008979 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008980 PyUnicodeObject *v = u;
8981 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00008982 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00008983 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008984 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008985 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008986 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008987 unicode_freelist = NULL;
8988 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008989}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008990
Walter Dörwald16807132007-05-25 13:52:07 +00008991void
8992PyUnicode_InternInPlace(PyObject **p)
8993{
8994 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
8995 PyObject *t;
8996 if (s == NULL || !PyUnicode_Check(s))
8997 Py_FatalError(
8998 "PyUnicode_InternInPlace: unicode strings only please!");
8999 /* If it's a subclass, we don't really know what putting
9000 it in the interned dict might do. */
9001 if (!PyUnicode_CheckExact(s))
9002 return;
9003 if (PyUnicode_CHECK_INTERNED(s))
9004 return;
9005 if (interned == NULL) {
9006 interned = PyDict_New();
9007 if (interned == NULL) {
9008 PyErr_Clear(); /* Don't leave an exception */
9009 return;
9010 }
9011 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009012 /* It might be that the GetItem call fails even
9013 though the key is present in the dictionary,
9014 namely when this happens during a stack overflow. */
9015 Py_ALLOW_RECURSION
Walter Dörwald16807132007-05-25 13:52:07 +00009016 t = PyDict_GetItem(interned, (PyObject *)s);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009017 Py_END_ALLOW_RECURSION
9018
Walter Dörwald16807132007-05-25 13:52:07 +00009019 if (t) {
9020 Py_INCREF(t);
9021 Py_DECREF(*p);
9022 *p = t;
9023 return;
9024 }
9025
Martin v. Löwis5b222132007-06-10 09:51:05 +00009026 PyThreadState_GET()->recursion_critical = 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009027 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9028 PyErr_Clear();
Martin v. Löwis5b222132007-06-10 09:51:05 +00009029 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009030 return;
9031 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009032 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009033 /* The two references in interned are not counted by refcnt.
9034 The deallocator will take care of this */
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00009035 Py_Refcnt(s) -= 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009036 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
9037}
9038
9039void
9040PyUnicode_InternImmortal(PyObject **p)
9041{
9042 PyUnicode_InternInPlace(p);
9043 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9044 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9045 Py_INCREF(*p);
9046 }
9047}
9048
9049PyObject *
9050PyUnicode_InternFromString(const char *cp)
9051{
9052 PyObject *s = PyUnicode_FromString(cp);
9053 if (s == NULL)
9054 return NULL;
9055 PyUnicode_InternInPlace(&s);
9056 return s;
9057}
9058
9059void _Py_ReleaseInternedUnicodeStrings(void)
9060{
9061 PyObject *keys;
9062 PyUnicodeObject *s;
9063 Py_ssize_t i, n;
9064 Py_ssize_t immortal_size = 0, mortal_size = 0;
9065
9066 if (interned == NULL || !PyDict_Check(interned))
9067 return;
9068 keys = PyDict_Keys(interned);
9069 if (keys == NULL || !PyList_Check(keys)) {
9070 PyErr_Clear();
9071 return;
9072 }
9073
9074 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9075 detector, interned unicode strings are not forcibly deallocated;
9076 rather, we give them their stolen references back, and then clear
9077 and DECREF the interned dict. */
9078
9079 n = PyList_GET_SIZE(keys);
9080 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
9081 n);
9082 for (i = 0; i < n; i++) {
9083 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9084 switch (s->state) {
9085 case SSTATE_NOT_INTERNED:
9086 /* XXX Shouldn't happen */
9087 break;
9088 case SSTATE_INTERNED_IMMORTAL:
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00009089 Py_Refcnt(s) += 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009090 immortal_size += s->length;
9091 break;
9092 case SSTATE_INTERNED_MORTAL:
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00009093 Py_Refcnt(s) += 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009094 mortal_size += s->length;
9095 break;
9096 default:
9097 Py_FatalError("Inconsistent interned string state.");
9098 }
9099 s->state = SSTATE_NOT_INTERNED;
9100 }
9101 fprintf(stderr, "total size of all interned strings: "
9102 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9103 "mortal/immortal\n", mortal_size, immortal_size);
9104 Py_DECREF(keys);
9105 PyDict_Clear(interned);
9106 Py_DECREF(interned);
9107 interned = NULL;
9108}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009109
9110
9111/********************* Unicode Iterator **************************/
9112
9113typedef struct {
9114 PyObject_HEAD
Guido van Rossum49d6b072006-08-17 21:11:47 +00009115 Py_ssize_t it_index;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009116 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
9117} unicodeiterobject;
9118
9119static void
9120unicodeiter_dealloc(unicodeiterobject *it)
9121{
9122 _PyObject_GC_UNTRACK(it);
9123 Py_XDECREF(it->it_seq);
9124 PyObject_GC_Del(it);
9125}
9126
9127static int
9128unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9129{
9130 Py_VISIT(it->it_seq);
9131 return 0;
9132}
9133
9134static PyObject *
9135unicodeiter_next(unicodeiterobject *it)
9136{
9137 PyUnicodeObject *seq;
9138 PyObject *item;
9139
9140 assert(it != NULL);
9141 seq = it->it_seq;
9142 if (seq == NULL)
9143 return NULL;
9144 assert(PyUnicode_Check(seq));
9145
9146 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009147 item = PyUnicode_FromUnicode(
9148 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009149 if (item != NULL)
9150 ++it->it_index;
9151 return item;
9152 }
9153
9154 Py_DECREF(seq);
9155 it->it_seq = NULL;
9156 return NULL;
9157}
9158
9159static PyObject *
9160unicodeiter_len(unicodeiterobject *it)
9161{
9162 Py_ssize_t len = 0;
9163 if (it->it_seq)
9164 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9165 return PyInt_FromSsize_t(len);
9166}
9167
9168PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9169
9170static PyMethodDef unicodeiter_methods[] = {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009171 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
9172 length_hint_doc},
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009173 {NULL, NULL} /* sentinel */
9174};
9175
9176PyTypeObject PyUnicodeIter_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009177 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009178 "unicodeiterator", /* tp_name */
9179 sizeof(unicodeiterobject), /* tp_basicsize */
9180 0, /* tp_itemsize */
9181 /* methods */
Guido van Rossum49d6b072006-08-17 21:11:47 +00009182 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009183 0, /* tp_print */
9184 0, /* tp_getattr */
9185 0, /* tp_setattr */
9186 0, /* tp_compare */
9187 0, /* tp_repr */
9188 0, /* tp_as_number */
9189 0, /* tp_as_sequence */
9190 0, /* tp_as_mapping */
9191 0, /* tp_hash */
9192 0, /* tp_call */
9193 0, /* tp_str */
9194 PyObject_GenericGetAttr, /* tp_getattro */
9195 0, /* tp_setattro */
9196 0, /* tp_as_buffer */
9197 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9198 0, /* tp_doc */
9199 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9200 0, /* tp_clear */
9201 0, /* tp_richcompare */
9202 0, /* tp_weaklistoffset */
9203 PyObject_SelfIter, /* tp_iter */
9204 (iternextfunc)unicodeiter_next, /* tp_iternext */
9205 unicodeiter_methods, /* tp_methods */
9206 0,
9207};
9208
9209static PyObject *
9210unicode_iter(PyObject *seq)
9211{
9212 unicodeiterobject *it;
9213
9214 if (!PyUnicode_Check(seq)) {
9215 PyErr_BadInternalCall();
9216 return NULL;
9217 }
9218 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9219 if (it == NULL)
9220 return NULL;
9221 it->it_index = 0;
9222 Py_INCREF(seq);
9223 it->it_seq = (PyUnicodeObject *)seq;
9224 _PyObject_GC_TRACK(it);
9225 return (PyObject *)it;
9226}
9227
Martin v. Löwis5b222132007-06-10 09:51:05 +00009228size_t
9229Py_UNICODE_strlen(const Py_UNICODE *u)
9230{
9231 int res = 0;
9232 while(*u++)
9233 res++;
9234 return res;
9235}
9236
9237Py_UNICODE*
9238Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9239{
9240 Py_UNICODE *u = s1;
9241 while ((*u++ = *s2++));
9242 return s1;
9243}
9244
9245Py_UNICODE*
9246Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9247{
9248 Py_UNICODE *u = s1;
9249 while ((*u++ = *s2++))
9250 if (n-- == 0)
9251 break;
9252 return s1;
9253}
9254
9255int
9256Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9257{
9258 while (*s1 && *s2 && *s1 == *s2)
9259 s1++, s2++;
9260 if (*s1 && *s2)
9261 return (*s1 < *s2) ? -1 : +1;
9262 if (*s1)
9263 return 1;
9264 if (*s2)
9265 return -1;
9266 return 0;
9267}
9268
9269Py_UNICODE*
9270Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9271{
9272 const Py_UNICODE *p;
9273 for (p = s; *p; p++)
9274 if (*p == c)
9275 return (Py_UNICODE*)p;
9276 return NULL;
9277}
9278
9279
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009280#ifdef __cplusplus
9281}
9282#endif
9283
9284
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009285/*
9286Local variables:
9287c-basic-offset: 4
9288indent-tabs-mode: nil
9289End:
9290*/