blob: 9ea9a886285edfcbb1cbc25d58391d53e68d84e5 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Eric Smith8c663262007-08-25 02:26:07 +000049#include "formatter_unicode.h"
50
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000051#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000052#include <windows.h>
53#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000054
Guido van Rossumd57fd912000-03-10 22:53:23 +000055/* Limit for the Unicode object free list */
56
Christian Heimes2202f872008-02-06 14:31:34 +000057#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000058
59/* Limit for the Unicode object free list stay alive optimization.
60
61 The implementation will keep allocated Unicode memory intact for
62 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000063 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000064
Christian Heimes2202f872008-02-06 14:31:34 +000065 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000066 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000067 malloc()-overhead) bytes of unused garbage.
68
69 Setting the limit to 0 effectively turns the feature off.
70
Guido van Rossumfd4b9572000-04-10 13:51:10 +000071 Note: This is an experimental feature ! If you get core dumps when
72 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000073
74*/
75
Guido van Rossumfd4b9572000-04-10 13:51:10 +000076#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000077
78/* Endianness switches; defaults to little endian */
79
80#ifdef WORDS_BIGENDIAN
81# define BYTEORDER_IS_BIG_ENDIAN
82#else
83# define BYTEORDER_IS_LITTLE_ENDIAN
84#endif
85
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000086/* --- Globals ------------------------------------------------------------
87
88 The globals are initialized by the _PyUnicode_Init() API and should
89 not be used before calling that API.
90
91*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000092
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000093
94#ifdef __cplusplus
95extern "C" {
96#endif
97
Walter Dörwald16807132007-05-25 13:52:07 +000098/* This dictionary holds all interned unicode strings. Note that references
99 to strings in this dictionary are *not* counted in the string's ob_refcnt.
100 When the interned string reaches a refcnt of 0 the string deallocation
101 function will delete the reference from this dictionary.
102
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000105*/
106static PyObject *interned;
107
Guido van Rossumd57fd912000-03-10 22:53:23 +0000108/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000109static PyUnicodeObject *free_list;
110static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000111
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000112/* The empty Unicode object is shared to improve performance. */
113static PyUnicodeObject *unicode_empty;
114
115/* Single character Unicode strings in the Latin-1 range are being
116 shared as well. */
117static PyUnicodeObject *unicode_latin1[256];
118
Fred Drakee4315f52000-05-09 19:53:39 +0000119/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000120 parameter; it is fixed to "utf-8". Always use the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000121 PyUnicode_GetDefaultEncoding() API to access this global.
122
Alexandre Vassalotti3d2fd7f2007-10-16 00:26:33 +0000123 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000124 hard coded default!
125*/
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000126static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000127
Christian Heimes190d79e2008-01-30 11:58:22 +0000128/* Fast detection of the most frequent whitespace characters */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
       0x09 HORIZONTAL TABULATION, 0x0A LINE FEED,
       0x0B VERTICAL TABULATION,   0x0C FORM FEED,
       0x0D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
       0x1C FILE SEPARATOR,   0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F: no whitespace */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F: no whitespace */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
158
159/* Same for linebreaks */
static unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F: 0x0A LINE FEED, 0x0D CARRIAGE RETURN */
    0, 0, 1, 0, 0, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
       0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F: no line breaks */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F: no line breaks */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
184
185
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000186Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000187PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000188{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000189#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000190 return 0x10FFFF;
191#else
192 /* This is actually an illegal character, so it should
193 not be passed to unichr. */
194 return 0xFFFF;
195#endif
196}
197
Thomas Wouters477c8d52006-05-27 19:21:47 +0000198/* --- Bloom Filters ----------------------------------------------------- */
199
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low 5
   bits of each Unicode character as the bit index. */
203
/* The linebreak mask is set up by _PyUnicode_Init() below. */
205
/* Integer type used to hold a bloom mask: one bit for each possible
   value of the low 5 bits of a character. */
#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by the low 5 bits of `ch` is set in `mask`.
   The shifted constant is an unsigned long so that a shift count of 31 is
   well defined and cannot sign-extend into the upper bits of the mask;
   `mask` is parenthesized so expression arguments bind as intended. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Line-break test: characters below 128 are looked up directly in
   ascii_linebreak; for everything else the bloom mask is consulted first
   and Py_UNICODE_ISLINEBREAK() only runs when the mask bit is set. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000215
216Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
217{
218 /* calculate simple bloom-style bitmask for a given unicode string */
219
220 long mask;
221 Py_ssize_t i;
222
223 mask = 0;
224 for (i = 0; i < len; i++)
225 mask |= (1 << (ptr[i] & 0x1F));
226
227 return mask;
228}
229
230Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
231{
232 Py_ssize_t i;
233
234 for (i = 0; i < setlen; i++)
235 if (set[i] == chr)
236 return 1;
237
238 return 0;
239}
240
/* Non-zero if `chr` is a member of `set` (of length `setlen`).  The bloom
   mask is tested first so unicode_member() only runs when the mask bit
   for `chr` is set.  The whole expansion is parenthesized so the macro
   can be combined safely with `!`, `||` and other operators. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
243
Guido van Rossumd57fd912000-03-10 22:53:23 +0000244/* --- Unicode Object ----------------------------------------------------- */
245
246static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000247int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000248 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000249{
250 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000251
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000252 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000254 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000255
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000256 /* Resizing shared object (unicode_empty or single character
257 objects) in-place is not allowed. Use PyUnicode_Resize()
258 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000259
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000260 if (unicode == unicode_empty ||
261 (unicode->length == 1 &&
262 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000263 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000265 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 return -1;
267 }
268
Thomas Wouters477c8d52006-05-27 19:21:47 +0000269 /* We allocate one more byte to make sure the string is Ux0000 terminated.
270 The overallocation is also used by fastsearch, which assumes that it's
271 safe to look at str[length] (without making any assumptions about what
272 it contains). */
273
Guido van Rossumd57fd912000-03-10 22:53:23 +0000274 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000275 unicode->str = PyObject_REALLOC(unicode->str,
276 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000278 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000279 PyErr_NoMemory();
280 return -1;
281 }
282 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000283 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000285 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000287 if (unicode->defenc) {
288 Py_DECREF(unicode->defenc);
289 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290 }
291 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000292
Guido van Rossumd57fd912000-03-10 22:53:23 +0000293 return 0;
294}
295
296/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000297 Ux0000 terminated; some code (e.g. new_identifier)
298 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299
300 XXX This allocator could further be enhanced by assuring that the
301 free list never reduces its size below 1.
302
303*/
304
305static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000306PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000307{
308 register PyUnicodeObject *unicode;
309
Thomas Wouters477c8d52006-05-27 19:21:47 +0000310 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000311 if (length == 0 && unicode_empty != NULL) {
312 Py_INCREF(unicode_empty);
313 return unicode_empty;
314 }
315
316 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000317 if (free_list) {
318 unicode = free_list;
319 free_list = *(PyUnicodeObject **)unicode;
320 numfree--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000321 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000322 /* Keep-Alive optimization: we only upsize the buffer,
323 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000324 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000325 unicode_resize(unicode, length) < 0) {
Christian Heimesb186d002008-03-18 15:15:01 +0000326 PyObject_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000327 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000328 }
329 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000330 else {
Christian Heimesb186d002008-03-18 15:15:01 +0000331 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
332 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000333 }
334 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000335 }
336 else {
Christian Heimesb186d002008-03-18 15:15:01 +0000337 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000338 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000339 if (unicode == NULL)
340 return NULL;
Christian Heimesb186d002008-03-18 15:15:01 +0000341 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
342 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343 }
344
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000345 if (!unicode->str) {
346 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000347 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000348 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000349 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000350 * the caller fails before initializing str -- unicode_resize()
351 * reads str[0], and the Keep-Alive optimization can keep memory
352 * allocated for str alive across a call to unicode_dealloc(unicode).
353 * We don't want unicode_resize to read uninitialized memory in
354 * that case.
355 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000356 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000357 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000358 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000359 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000360 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000361 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000362 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000363
364 onError:
365 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000366 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000367 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000368}
369
370static
Guido van Rossum9475a232001-10-05 20:51:39 +0000371void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000372{
Walter Dörwald16807132007-05-25 13:52:07 +0000373 switch (PyUnicode_CHECK_INTERNED(unicode)) {
374 case SSTATE_NOT_INTERNED:
375 break;
376
377 case SSTATE_INTERNED_MORTAL:
378 /* revive dead object temporarily for DelItem */
Christian Heimes90aa7642007-12-19 02:45:37 +0000379 Py_REFCNT(unicode) = 3;
Walter Dörwald16807132007-05-25 13:52:07 +0000380 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
381 Py_FatalError(
382 "deletion of interned unicode string failed");
383 break;
384
385 case SSTATE_INTERNED_IMMORTAL:
386 Py_FatalError("Immortal interned unicode string died.");
387
388 default:
389 Py_FatalError("Inconsistent interned unicode string state.");
390 }
391
Guido van Rossum604ddf82001-12-06 20:03:56 +0000392 if (PyUnicode_CheckExact(unicode) &&
Christian Heimes2202f872008-02-06 14:31:34 +0000393 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000394 /* Keep-Alive optimization */
395 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Christian Heimesb186d002008-03-18 15:15:01 +0000396 PyObject_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000397 unicode->str = NULL;
398 unicode->length = 0;
399 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000400 if (unicode->defenc) {
401 Py_DECREF(unicode->defenc);
402 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000403 }
404 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000405 *(PyUnicodeObject **)unicode = free_list;
406 free_list = unicode;
407 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000408 }
409 else {
Christian Heimesb186d002008-03-18 15:15:01 +0000410 PyObject_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000411 Py_XDECREF(unicode->defenc);
Christian Heimes90aa7642007-12-19 02:45:37 +0000412 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000413 }
414}
415
Martin v. Löwis18e16552006-02-15 17:27:45 +0000416int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000417{
418 register PyUnicodeObject *v;
419
420 /* Argument checks */
421 if (unicode == NULL) {
422 PyErr_BadInternalCall();
423 return -1;
424 }
425 v = (PyUnicodeObject *)*unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000426 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000427 PyErr_BadInternalCall();
428 return -1;
429 }
430
431 /* Resizing unicode_empty and single character objects is not
432 possible since these are being shared. We simply return a fresh
433 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000434 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000435 (v == unicode_empty || v->length == 1)) {
436 PyUnicodeObject *w = _PyUnicode_New(length);
437 if (w == NULL)
438 return -1;
439 Py_UNICODE_COPY(w->str, v->str,
440 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000441 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000442 *unicode = (PyObject *)w;
443 return 0;
444 }
445
446 /* Note that we don't have to modify *unicode for unshared Unicode
447 objects, since we can modify them in-place. */
448 return unicode_resize(v, length);
449}
450
/* Internal API for use in unicodeobject.c only !
   Forwards to PyUnicode_Resize() after casting the address of the
   object variable to PyObject **. */
#define _PyUnicode_Resize(unicodevar, length)                   \
    PyUnicode_Resize(((PyObject **)(unicodevar)), length)
454
Guido van Rossumd57fd912000-03-10 22:53:23 +0000455PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000456 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000457{
458 PyUnicodeObject *unicode;
459
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000460 /* If the Unicode data is known at construction time, we can apply
461 some optimizations which share commonly used objects. */
462 if (u != NULL) {
463
464 /* Optimization for empty strings */
465 if (size == 0 && unicode_empty != NULL) {
466 Py_INCREF(unicode_empty);
467 return (PyObject *)unicode_empty;
468 }
469
470 /* Single character Unicode objects in the Latin-1 range are
471 shared when using this constructor */
472 if (size == 1 && *u < 256) {
473 unicode = unicode_latin1[*u];
474 if (!unicode) {
475 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000476 if (!unicode)
477 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000478 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000479 unicode_latin1[*u] = unicode;
480 }
481 Py_INCREF(unicode);
482 return (PyObject *)unicode;
483 }
484 }
Tim Petersced69f82003-09-16 20:30:58 +0000485
Guido van Rossumd57fd912000-03-10 22:53:23 +0000486 unicode = _PyUnicode_New(size);
487 if (!unicode)
488 return NULL;
489
490 /* Copy the Unicode data into the new object */
491 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000492 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000493
494 return (PyObject *)unicode;
495}
496
Walter Dörwaldd2034312007-05-18 16:29:38 +0000497PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000498{
499 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000500
501 if (size < 0) {
502 PyErr_SetString(PyExc_SystemError,
503 "Negative size passed to PyUnicode_FromStringAndSize");
504 return NULL;
505 }
506
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000507 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000508 some optimizations which share commonly used objects.
509 Also, this means the input must be UTF-8, so fall back to the
510 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000511 if (u != NULL) {
512
513 /* Optimization for empty strings */
514 if (size == 0 && unicode_empty != NULL) {
515 Py_INCREF(unicode_empty);
516 return (PyObject *)unicode_empty;
517 }
518
Martin v. Löwis9c121062007-08-05 20:26:11 +0000519 /* Single characters are shared when using this constructor.
520 Restrict to ASCII, since the input must be UTF-8. */
521 if (size == 1 && Py_CHARMASK(*u) < 128) {
Christian Heimesbbe741d2008-03-28 10:53:29 +0000522 unicode = unicode_latin1[Py_CHARMASK(*u)];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000523 if (!unicode) {
524 unicode = _PyUnicode_New(1);
525 if (!unicode)
526 return NULL;
Guido van Rossum00058aa2007-07-19 18:21:28 +0000527 unicode->str[0] = Py_CHARMASK(*u);
Christian Heimesbbe741d2008-03-28 10:53:29 +0000528 unicode_latin1[Py_CHARMASK(*u)] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000529 }
530 Py_INCREF(unicode);
531 return (PyObject *)unicode;
532 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000533
534 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000535 }
536
Walter Dörwald55507312007-05-18 13:12:10 +0000537 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000538 if (!unicode)
539 return NULL;
540
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000541 return (PyObject *)unicode;
542}
543
Walter Dörwaldd2034312007-05-18 16:29:38 +0000544PyObject *PyUnicode_FromString(const char *u)
545{
546 size_t size = strlen(u);
547 if (size > PY_SSIZE_T_MAX) {
548 PyErr_SetString(PyExc_OverflowError, "input too long");
549 return NULL;
550 }
551
552 return PyUnicode_FromStringAndSize(u, size);
553}
554
Guido van Rossumd57fd912000-03-10 22:53:23 +0000555#ifdef HAVE_WCHAR_H
556
557PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000558 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000559{
560 PyUnicodeObject *unicode;
561
562 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000563 if (size == 0)
564 return PyUnicode_FromStringAndSize(NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000565 PyErr_BadInternalCall();
566 return NULL;
567 }
568
Martin v. Löwis790465f2008-04-05 20:41:37 +0000569 if (size == -1) {
570 size = wcslen(w);
571 }
572
Guido van Rossumd57fd912000-03-10 22:53:23 +0000573 unicode = _PyUnicode_New(size);
574 if (!unicode)
575 return NULL;
576
577 /* Copy the wchar_t data into the new object */
578#ifdef HAVE_USABLE_WCHAR_T
579 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000580#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000581 {
582 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000583 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000584 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000585 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000586 *u++ = *w++;
587 }
588#endif
589
590 return (PyObject *)unicode;
591}
592
Walter Dörwald346737f2007-05-31 10:44:43 +0000593static void
594makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
595{
596 *fmt++ = '%';
597 if (width) {
598 if (zeropad)
599 *fmt++ = '0';
600 fmt += sprintf(fmt, "%d", width);
601 }
602 if (precision)
603 fmt += sprintf(fmt, ".%d", precision);
604 if (longflag)
605 *fmt++ = 'l';
606 else if (size_tflag) {
607 char *f = PY_FORMAT_SIZE_T;
608 while (*f)
609 *fmt++ = *f++;
610 }
611 *fmt++ = c;
612 *fmt = '\0';
613}
614
/* Append the NUL-terminated string `string` to the output position `s`,
   advancing `s`.  Expects variables named `copy` and `s` to be in scope
   at the point of use. */
#define appendstring(string) \
    {for (copy = (string); *copy;) *s++ = *copy++;}
616
617PyObject *
618PyUnicode_FromFormatV(const char *format, va_list vargs)
619{
620 va_list count;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000621 Py_ssize_t callcount = 0;
622 PyObject **callresults = NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +0000623 PyObject **callresult = NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000624 Py_ssize_t n = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000625 int width = 0;
626 int precision = 0;
627 int zeropad;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000628 const char* f;
629 Py_UNICODE *s;
630 PyObject *string;
631 /* used by sprintf */
632 char buffer[21];
Walter Dörwald346737f2007-05-31 10:44:43 +0000633 /* use abuffer instead of buffer, if we need more space
634 * (which can happen if there's a format specifier with width). */
635 char *abuffer = NULL;
636 char *realbuffer;
637 Py_ssize_t abuffersize = 0;
638 char fmt[60]; /* should be enough for %0width.precisionld */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000639 const char *copy;
640
641#ifdef VA_LIST_IS_ARRAY
642 Py_MEMCPY(count, vargs, sizeof(va_list));
643#else
644#ifdef __va_copy
645 __va_copy(count, vargs);
646#else
647 count = vargs;
648#endif
649#endif
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000650 /* step 1: count the number of %S/%R format specifications
Thomas Heller519a0422007-11-15 20:48:54 +0000651 * (we call PyObject_Str()/PyObject_Repr() for these objects
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000652 * once during step 3 and put the result in an array) */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000653 for (f = format; *f; f++) {
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000654 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000655 ++callcount;
656 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000657 /* step 2: allocate memory for the results of
Thomas Heller519a0422007-11-15 20:48:54 +0000658 * PyObject_Str()/PyObject_Repr() calls */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000659 if (callcount) {
Christian Heimesb186d002008-03-18 15:15:01 +0000660 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000661 if (!callresults) {
662 PyErr_NoMemory();
663 return NULL;
664 }
665 callresult = callresults;
666 }
667 /* step 3: figure out how large a buffer we need */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000668 for (f = format; *f; f++) {
669 if (*f == '%') {
670 const char* p = f;
Walter Dörwald346737f2007-05-31 10:44:43 +0000671 width = 0;
Christian Heimesfe337bf2008-03-23 21:54:12 +0000672 while (ISDIGIT((unsigned)*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000673 width = (width*10) + *f++ - '0';
Christian Heimesfe337bf2008-03-23 21:54:12 +0000674 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
Walter Dörwaldd2034312007-05-18 16:29:38 +0000675 ;
676
677 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
678 * they don't affect the amount of space we reserve.
679 */
680 if ((*f == 'l' || *f == 'z') &&
681 (f[1] == 'd' || f[1] == 'u'))
Eric Smithddd25822007-08-27 11:33:42 +0000682 ++f;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000683
684 switch (*f) {
685 case 'c':
686 (void)va_arg(count, int);
687 /* fall through... */
688 case '%':
689 n++;
690 break;
691 case 'd': case 'u': case 'i': case 'x':
692 (void) va_arg(count, int);
693 /* 20 bytes is enough to hold a 64-bit
694 integer. Decimal takes the most space.
Walter Dörwald346737f2007-05-31 10:44:43 +0000695 This isn't enough for octal.
696 If a width is specified we need more
697 (which we allocate later). */
698 if (width < 20)
699 width = 20;
700 n += width;
701 if (abuffersize < width)
702 abuffersize = width;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000703 break;
704 case 's':
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000705 {
706 /* UTF-8 */
707 unsigned char*s;
708 s = va_arg(count, unsigned char*);
709 while (*s) {
710 if (*s < 128) {
711 n++; s++;
712 } else if (*s < 0xc0) {
713 /* invalid UTF-8 */
714 n++; s++;
715 } else if (*s < 0xc0) {
716 n++;
717 s++; if(!*s)break;
718 s++;
719 } else if (*s < 0xe0) {
720 n++;
721 s++; if(!*s)break;
722 s++; if(!*s)break;
723 s++;
724 } else {
725 #ifdef Py_UNICODE_WIDE
726 n++;
727 #else
728 n+=2;
729 #endif
730 s++; if(!*s)break;
731 s++; if(!*s)break;
732 s++; if(!*s)break;
733 s++;
734 }
735 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000736 break;
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000737 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000738 case 'U':
739 {
740 PyObject *obj = va_arg(count, PyObject *);
741 assert(obj && PyUnicode_Check(obj));
742 n += PyUnicode_GET_SIZE(obj);
743 break;
744 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000745 case 'V':
746 {
747 PyObject *obj = va_arg(count, PyObject *);
748 const char *str = va_arg(count, const char *);
749 assert(obj || str);
750 assert(!obj || PyUnicode_Check(obj));
751 if (obj)
752 n += PyUnicode_GET_SIZE(obj);
753 else
754 n += strlen(str);
755 break;
756 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000757 case 'S':
758 {
759 PyObject *obj = va_arg(count, PyObject *);
760 PyObject *str;
761 assert(obj);
Thomas Heller519a0422007-11-15 20:48:54 +0000762 str = PyObject_Str(obj);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000763 if (!str)
764 goto fail;
765 n += PyUnicode_GET_SIZE(str);
766 /* Remember the str and switch to the next slot */
767 *callresult++ = str;
768 break;
769 }
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000770 case 'R':
771 {
772 PyObject *obj = va_arg(count, PyObject *);
773 PyObject *repr;
774 assert(obj);
775 repr = PyObject_Repr(obj);
776 if (!repr)
777 goto fail;
778 n += PyUnicode_GET_SIZE(repr);
779 /* Remember the repr and switch to the next slot */
780 *callresult++ = repr;
781 break;
782 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000783 case 'p':
784 (void) va_arg(count, int);
785 /* maximum 64-bit pointer representation:
786 * 0xffffffffffffffff
787 * so 19 characters is enough.
788 * XXX I count 18 -- what's the extra for?
789 */
790 n += 19;
791 break;
792 default:
793 /* if we stumble upon an unknown
794 formatting code, copy the rest of
795 the format string to the output
796 string. (we cannot just skip the
797 code, since there's no way to know
798 what's in the argument list) */
799 n += strlen(p);
800 goto expand;
801 }
802 } else
803 n++;
804 }
805 expand:
Walter Dörwald346737f2007-05-31 10:44:43 +0000806 if (abuffersize > 20) {
Christian Heimesb186d002008-03-18 15:15:01 +0000807 abuffer = PyObject_Malloc(abuffersize);
Walter Dörwald346737f2007-05-31 10:44:43 +0000808 if (!abuffer) {
809 PyErr_NoMemory();
810 goto fail;
811 }
812 realbuffer = abuffer;
813 }
814 else
815 realbuffer = buffer;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000816 /* step 4: fill the buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000817 /* Since we've analyzed how much space we need for the worst case,
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000818 we don't have to resize the string.
819 There can be no errors beyond this point. */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000820 string = PyUnicode_FromUnicode(NULL, n);
821 if (!string)
Walter Dörwald346737f2007-05-31 10:44:43 +0000822 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000823
824 s = PyUnicode_AS_UNICODE(string);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000825 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000826
827 for (f = format; *f; f++) {
828 if (*f == '%') {
829 const char* p = f++;
830 int longflag = 0;
831 int size_tflag = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000832 zeropad = (*f == '0');
833 /* parse the width.precision part */
834 width = 0;
Christian Heimesfe337bf2008-03-23 21:54:12 +0000835 while (ISDIGIT((unsigned)*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000836 width = (width*10) + *f++ - '0';
837 precision = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000838 if (*f == '.') {
839 f++;
Christian Heimesfe337bf2008-03-23 21:54:12 +0000840 while (ISDIGIT((unsigned)*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000841 precision = (precision*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000842 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000843 /* handle the long flag, but only for %ld and %lu.
844 others can be added when necessary. */
845 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
846 longflag = 1;
847 ++f;
848 }
849 /* handle the size_t flag. */
850 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
851 size_tflag = 1;
852 ++f;
853 }
854
855 switch (*f) {
856 case 'c':
857 *s++ = va_arg(vargs, int);
858 break;
859 case 'd':
Walter Dörwald346737f2007-05-31 10:44:43 +0000860 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000861 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000862 sprintf(realbuffer, fmt, va_arg(vargs, long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000863 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000864 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000865 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000866 sprintf(realbuffer, fmt, va_arg(vargs, int));
867 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000868 break;
869 case 'u':
Walter Dörwald346737f2007-05-31 10:44:43 +0000870 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000871 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000872 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000873 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000874 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000875 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000876 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
877 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000878 break;
879 case 'i':
Walter Dörwald346737f2007-05-31 10:44:43 +0000880 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
881 sprintf(realbuffer, fmt, va_arg(vargs, int));
882 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000883 break;
884 case 'x':
Walter Dörwald346737f2007-05-31 10:44:43 +0000885 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
886 sprintf(realbuffer, fmt, va_arg(vargs, int));
887 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000888 break;
889 case 's':
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000890 {
891 /* Parameter must be UTF-8 encoded.
892 In case of encoding errors, use
893 the replacement character. */
894 PyObject *u;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000895 p = va_arg(vargs, char*);
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000896 u = PyUnicode_DecodeUTF8(p, strlen(p),
897 "replace");
898 if (!u)
899 goto fail;
900 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
901 PyUnicode_GET_SIZE(u));
902 s += PyUnicode_GET_SIZE(u);
903 Py_DECREF(u);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000904 break;
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000905 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000906 case 'U':
907 {
908 PyObject *obj = va_arg(vargs, PyObject *);
Walter Dörwald5c2fab62007-05-24 19:51:02 +0000909 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
910 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
911 s += size;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000912 break;
913 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000914 case 'V':
915 {
916 PyObject *obj = va_arg(vargs, PyObject *);
917 const char *str = va_arg(vargs, const char *);
918 if (obj) {
919 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
920 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
921 s += size;
922 } else {
923 appendstring(str);
924 }
925 break;
926 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000927 case 'S':
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000928 case 'R':
929 {
Guido van Rossum755114a2007-06-13 01:04:27 +0000930 Py_UNICODE *ucopy;
931 Py_ssize_t usize;
932 Py_ssize_t upos;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000933 /* unused, since we already have the result */
934 (void) va_arg(vargs, PyObject *);
Guido van Rossum755114a2007-06-13 01:04:27 +0000935 ucopy = PyUnicode_AS_UNICODE(*callresult);
936 usize = PyUnicode_GET_SIZE(*callresult);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000937 for (upos = 0; upos<usize;)
938 *s++ = ucopy[upos++];
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000939 /* We're done with the unicode()/repr() => forget it */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000940 Py_DECREF(*callresult);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000941 /* switch to next unicode()/repr() result */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000942 ++callresult;
943 break;
944 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000945 case 'p':
946 sprintf(buffer, "%p", va_arg(vargs, void*));
947 /* %p is ill-defined: ensure leading 0x. */
948 if (buffer[1] == 'X')
949 buffer[1] = 'x';
950 else if (buffer[1] != 'x') {
951 memmove(buffer+2, buffer, strlen(buffer)+1);
952 buffer[0] = '0';
953 buffer[1] = 'x';
954 }
955 appendstring(buffer);
956 break;
957 case '%':
958 *s++ = '%';
959 break;
960 default:
961 appendstring(p);
962 goto end;
963 }
964 } else
965 *s++ = *f;
966 }
967
968 end:
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000969 if (callresults)
Christian Heimesb186d002008-03-18 15:15:01 +0000970 PyObject_Free(callresults);
Walter Dörwald346737f2007-05-31 10:44:43 +0000971 if (abuffer)
Christian Heimesb186d002008-03-18 15:15:01 +0000972 PyObject_Free(abuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000973 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
974 return string;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000975 fail:
976 if (callresults) {
977 PyObject **callresult2 = callresults;
Guido van Rossum307fa8c2007-07-16 20:46:27 +0000978 while (callresult2 < callresult) {
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000979 Py_DECREF(*callresult2);
980 ++callresult2;
981 }
Christian Heimesb186d002008-03-18 15:15:01 +0000982 PyObject_Free(callresults);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000983 }
Walter Dörwald346737f2007-05-31 10:44:43 +0000984 if (abuffer)
Christian Heimesb186d002008-03-18 15:15:01 +0000985 PyObject_Free(abuffer);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000986 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000987}
988
989#undef appendstring
990
991PyObject *
992PyUnicode_FromFormat(const char *format, ...)
993{
994 PyObject* ret;
995 va_list vargs;
996
997#ifdef HAVE_STDARG_PROTOTYPES
998 va_start(vargs, format);
999#else
1000 va_start(vargs);
1001#endif
1002 ret = PyUnicode_FromFormatV(format, vargs);
1003 va_end(vargs);
1004 return ret;
1005}
1006
Martin v. Löwis18e16552006-02-15 17:27:45 +00001007Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
1008 wchar_t *w,
1009 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001010{
1011 if (unicode == NULL) {
1012 PyErr_BadInternalCall();
1013 return -1;
1014 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001015
1016 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001017 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001018 size = PyUnicode_GET_SIZE(unicode) + 1;
1019
Guido van Rossumd57fd912000-03-10 22:53:23 +00001020#ifdef HAVE_USABLE_WCHAR_T
1021 memcpy(w, unicode->str, size * sizeof(wchar_t));
1022#else
1023 {
1024 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001025 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001026 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +00001027 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001028 *w++ = *u++;
1029 }
1030#endif
1031
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001032 if (size > PyUnicode_GET_SIZE(unicode))
1033 return PyUnicode_GET_SIZE(unicode);
1034 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00001035 return size;
1036}
1037
1038#endif
1039
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001040PyObject *PyUnicode_FromOrdinal(int ordinal)
1041{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001042 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001043
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001044 if (ordinal < 0 || ordinal > 0x10ffff) {
1045 PyErr_SetString(PyExc_ValueError,
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001046 "chr() arg not in range(0x110000)");
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001047 return NULL;
1048 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001049
1050#ifndef Py_UNICODE_WIDE
1051 if (ordinal > 0xffff) {
1052 ordinal -= 0x10000;
1053 s[0] = 0xD800 | (ordinal >> 10);
1054 s[1] = 0xDC00 | (ordinal & 0x3FF);
1055 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001056 }
1057#endif
1058
Hye-Shik Chang40574832004-04-06 07:24:51 +00001059 s[0] = (Py_UNICODE)ordinal;
1060 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001061}
1062
Guido van Rossumd57fd912000-03-10 22:53:23 +00001063PyObject *PyUnicode_FromObject(register PyObject *obj)
1064{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001065 /* XXX Perhaps we should make this API an alias of
Thomas Heller519a0422007-11-15 20:48:54 +00001066 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001067 if (PyUnicode_CheckExact(obj)) {
1068 Py_INCREF(obj);
1069 return obj;
1070 }
1071 if (PyUnicode_Check(obj)) {
1072 /* For a Unicode subtype that's not a Unicode object,
1073 return a true Unicode object with the same data. */
1074 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1075 PyUnicode_GET_SIZE(obj));
1076 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001077 PyErr_Format(PyExc_TypeError,
1078 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001079 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001080 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001081}
1082
1083PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1084 const char *encoding,
1085 const char *errors)
1086{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001087 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001088 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001089 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001090
Guido van Rossumd57fd912000-03-10 22:53:23 +00001091 if (obj == NULL) {
1092 PyErr_BadInternalCall();
1093 return NULL;
1094 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001095
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001096 if (PyUnicode_Check(obj)) {
1097 PyErr_SetString(PyExc_TypeError,
1098 "decoding Unicode is not supported");
1099 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001100 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001101
1102 /* Coerce object */
1103 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001104 s = PyString_AS_STRING(obj);
1105 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001106 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001107 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1108 /* Overwrite the error message with something more useful in
1109 case of a TypeError. */
1110 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001111 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001112 "coercing to Unicode: need string or buffer, "
1113 "%.80s found",
Christian Heimes90aa7642007-12-19 02:45:37 +00001114 Py_TYPE(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001115 goto onError;
1116 }
Tim Petersced69f82003-09-16 20:30:58 +00001117
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001118 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001119 if (len == 0) {
1120 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001121 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001122 }
Tim Petersced69f82003-09-16 20:30:58 +00001123 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001124 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001125
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001126 return v;
1127
1128 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001129 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001130}
1131
1132PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001133 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001134 const char *encoding,
1135 const char *errors)
1136{
1137 PyObject *buffer = NULL, *unicode;
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001138 Py_buffer info;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001139 char lower[20]; /* Enough for any encoding name we recognize */
1140 char *l;
1141 const char *e;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001142
1143 if (encoding == NULL)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001144 encoding = PyUnicode_GetDefaultEncoding();
1145
1146 /* Convert encoding to lower case and replace '_' with '-' in order to
1147 catch e.g. UTF_8 */
1148 e = encoding;
1149 l = lower;
1150 while (*e && l < &lower[(sizeof lower) - 2]) {
1151 if (ISUPPER(*e)) {
1152 *l++ = TOLOWER(*e++);
1153 }
1154 else if (*e == '_') {
1155 *l++ = '-';
1156 e++;
1157 }
1158 else {
1159 *l++ = *e++;
1160 }
1161 }
1162 *l = '\0';
Fred Drakee4315f52000-05-09 19:53:39 +00001163
1164 /* Shortcuts for common default encodings */
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001165 if (strcmp(lower, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001166 return PyUnicode_DecodeUTF8(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001167 else if ((strcmp(lower, "latin-1") == 0) ||
1168 (strcmp(lower, "iso-8859-1") == 0))
Fred Drakee4315f52000-05-09 19:53:39 +00001169 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001170#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001171 else if (strcmp(lower, "mbcs") == 0)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001172 return PyUnicode_DecodeMBCS(s, size, errors);
1173#endif
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001174 else if (strcmp(lower, "ascii") == 0)
Fred Drakee4315f52000-05-09 19:53:39 +00001175 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001176 else if (strcmp(lower, "utf-16") == 0)
1177 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1178 else if (strcmp(lower, "utf-32") == 0)
1179 return PyUnicode_DecodeUTF32(s, size, errors, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001180
1181 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001182 buffer = NULL;
1183 if (PyBuffer_FillInfo(&info, (void *)s, size, 1, PyBUF_SIMPLE) < 0)
1184 goto onError;
1185 buffer = PyMemoryView_FromMemory(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001186 if (buffer == NULL)
1187 goto onError;
1188 unicode = PyCodec_Decode(buffer, encoding, errors);
1189 if (unicode == NULL)
1190 goto onError;
1191 if (!PyUnicode_Check(unicode)) {
1192 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001193 "decoder did not return an unicode object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001194 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001195 Py_DECREF(unicode);
1196 goto onError;
1197 }
1198 Py_DECREF(buffer);
1199 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001200
Guido van Rossumd57fd912000-03-10 22:53:23 +00001201 onError:
1202 Py_XDECREF(buffer);
1203 return NULL;
1204}
1205
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001206PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1207 const char *encoding,
1208 const char *errors)
1209{
1210 PyObject *v;
1211
1212 if (!PyUnicode_Check(unicode)) {
1213 PyErr_BadArgument();
1214 goto onError;
1215 }
1216
1217 if (encoding == NULL)
1218 encoding = PyUnicode_GetDefaultEncoding();
1219
1220 /* Decode via the codec registry */
1221 v = PyCodec_Decode(unicode, encoding, errors);
1222 if (v == NULL)
1223 goto onError;
1224 return v;
1225
1226 onError:
1227 return NULL;
1228}
1229
Guido van Rossumd57fd912000-03-10 22:53:23 +00001230PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001231 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001232 const char *encoding,
1233 const char *errors)
1234{
1235 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001236
Guido van Rossumd57fd912000-03-10 22:53:23 +00001237 unicode = PyUnicode_FromUnicode(s, size);
1238 if (unicode == NULL)
1239 return NULL;
1240 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1241 Py_DECREF(unicode);
1242 return v;
1243}
1244
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001245PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1246 const char *encoding,
1247 const char *errors)
1248{
1249 PyObject *v;
1250
1251 if (!PyUnicode_Check(unicode)) {
1252 PyErr_BadArgument();
1253 goto onError;
1254 }
1255
1256 if (encoding == NULL)
1257 encoding = PyUnicode_GetDefaultEncoding();
1258
1259 /* Encode via the codec registry */
1260 v = PyCodec_Encode(unicode, encoding, errors);
1261 if (v == NULL)
1262 goto onError;
1263 return v;
1264
1265 onError:
1266 return NULL;
1267}
1268
Guido van Rossumd57fd912000-03-10 22:53:23 +00001269PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1270 const char *encoding,
1271 const char *errors)
1272{
1273 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001274
Guido van Rossumd57fd912000-03-10 22:53:23 +00001275 if (!PyUnicode_Check(unicode)) {
1276 PyErr_BadArgument();
1277 goto onError;
1278 }
Fred Drakee4315f52000-05-09 19:53:39 +00001279
Tim Petersced69f82003-09-16 20:30:58 +00001280 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001281 encoding = PyUnicode_GetDefaultEncoding();
1282
1283 /* Shortcuts for common default encodings */
1284 if (errors == NULL) {
1285 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001286 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001287 else if (strcmp(encoding, "latin-1") == 0)
1288 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001289#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1290 else if (strcmp(encoding, "mbcs") == 0)
1291 return PyUnicode_AsMBCSString(unicode);
1292#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001293 else if (strcmp(encoding, "ascii") == 0)
1294 return PyUnicode_AsASCIIString(unicode);
1295 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001296
1297 /* Encode via the codec registry */
1298 v = PyCodec_Encode(unicode, encoding, errors);
1299 if (v == NULL)
1300 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00001301 assert(PyString_Check(v));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001302 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001303
Guido van Rossumd57fd912000-03-10 22:53:23 +00001304 onError:
1305 return NULL;
1306}
1307
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001308PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1309 const char *errors)
1310{
1311 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001312 if (v)
1313 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001314 if (errors != NULL)
1315 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001316 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001317 PyUnicode_GET_SIZE(unicode),
1318 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001319 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001320 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001321 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001322 return v;
1323}
1324
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001325PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001326PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001327 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001328 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1329}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001330
Christian Heimes5894ba72007-11-04 11:43:14 +00001331PyObject*
1332PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1333{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001334 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1335 can be undefined. If it is case, decode using UTF-8. The following assumes
1336 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1337 bootstrapping process where the codecs aren't ready yet.
1338 */
1339 if (Py_FileSystemDefaultEncoding) {
1340#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001341 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001342 return PyUnicode_DecodeMBCS(s, size, "replace");
1343 }
1344#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001345 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001346 return PyUnicode_DecodeUTF8(s, size, "replace");
1347 }
1348#endif
1349 return PyUnicode_Decode(s, size,
1350 Py_FileSystemDefaultEncoding,
1351 "replace");
1352 }
1353 else {
1354 return PyUnicode_DecodeUTF8(s, size, "replace");
1355 }
1356}
1357
Martin v. Löwis5b222132007-06-10 09:51:05 +00001358char*
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001359PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001360{
Christian Heimesf3863112007-11-22 07:46:41 +00001361 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001362 if (!PyUnicode_Check(unicode)) {
1363 PyErr_BadArgument();
1364 return NULL;
1365 }
Christian Heimesf3863112007-11-22 07:46:41 +00001366 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1367 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001368 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001369 if (psize != NULL)
Christian Heimesf3863112007-11-22 07:46:41 +00001370 *psize = PyString_GET_SIZE(bytes);
1371 return PyString_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001372}
1373
1374char*
1375PyUnicode_AsString(PyObject *unicode)
1376{
1377 return PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001378}
1379
Guido van Rossumd57fd912000-03-10 22:53:23 +00001380Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1381{
1382 if (!PyUnicode_Check(unicode)) {
1383 PyErr_BadArgument();
1384 goto onError;
1385 }
1386 return PyUnicode_AS_UNICODE(unicode);
1387
1388 onError:
1389 return NULL;
1390}
1391
Martin v. Löwis18e16552006-02-15 17:27:45 +00001392Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001393{
1394 if (!PyUnicode_Check(unicode)) {
1395 PyErr_BadArgument();
1396 goto onError;
1397 }
1398 return PyUnicode_GET_SIZE(unicode);
1399
1400 onError:
1401 return -1;
1402}
1403
Thomas Wouters78890102000-07-22 19:25:51 +00001404const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001405{
1406 return unicode_default_encoding;
1407}
1408
1409int PyUnicode_SetDefaultEncoding(const char *encoding)
1410{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001411 if (strcmp(encoding, unicode_default_encoding) != 0) {
1412 PyErr_Format(PyExc_ValueError,
1413 "Can only set default encoding to %s",
1414 unicode_default_encoding);
1415 return -1;
1416 }
Fred Drakee4315f52000-05-09 19:53:39 +00001417 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001418}
1419
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001420/* error handling callback helper:
1421 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001422 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001423 and adjust various state variables.
1424 return 0 on success, -1 on error
1425*/
1426
1427static
1428int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1429 const char *encoding, const char *reason,
Walter Dörwalda651d3d2007-08-30 15:29:21 +00001430 const char **input, const char **inend, Py_ssize_t *startinpos,
1431 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001432 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001433{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001434 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001435
1436 PyObject *restuple = NULL;
1437 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001438 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001439 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001440 Py_ssize_t requiredsize;
1441 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001442 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001443 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001444 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001445 int res = -1;
1446
1447 if (*errorHandler == NULL) {
1448 *errorHandler = PyCodec_LookupError(errors);
1449 if (*errorHandler == NULL)
1450 goto onError;
1451 }
1452
1453 if (*exceptionObject == NULL) {
1454 *exceptionObject = PyUnicodeDecodeError_Create(
Walter Dörwalde78178e2007-07-30 13:31:40 +00001455 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001456 if (*exceptionObject == NULL)
1457 goto onError;
1458 }
1459 else {
1460 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1461 goto onError;
1462 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1463 goto onError;
1464 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1465 goto onError;
1466 }
1467
1468 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1469 if (restuple == NULL)
1470 goto onError;
1471 if (!PyTuple_Check(restuple)) {
1472 PyErr_Format(PyExc_TypeError, &argparse[4]);
1473 goto onError;
1474 }
1475 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1476 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001477
1478 /* Copy back the bytes variables, which might have been modified by the
1479 callback */
1480 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1481 if (!inputobj)
1482 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00001483 if (!PyString_Check(inputobj)) {
Walter Dörwalde78178e2007-07-30 13:31:40 +00001484 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
1485 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001486 *input = PyString_AS_STRING(inputobj);
1487 insize = PyString_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001488 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001489 /* we can DECREF safely, as the exception has another reference,
1490 so the object won't go away. */
1491 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001492
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001493 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001494 newpos = insize+newpos;
1495 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001496 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001497 goto onError;
1498 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001499
1500 /* need more space? (at least enough for what we
1501 have+the replacement+the rest of the string (starting
1502 at the new input position), so we won't have to check space
1503 when there are no errors in the rest of the string) */
1504 repptr = PyUnicode_AS_UNICODE(repunicode);
1505 repsize = PyUnicode_GET_SIZE(repunicode);
1506 requiredsize = *outpos + repsize + insize-newpos;
1507 if (requiredsize > outsize) {
1508 if (requiredsize<2*outsize)
1509 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001510 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001511 goto onError;
1512 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1513 }
1514 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001515 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001516 Py_UNICODE_COPY(*outptr, repptr, repsize);
1517 *outptr += repsize;
1518 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001519
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001520 /* we made it! */
1521 res = 0;
1522
1523 onError:
1524 Py_XDECREF(restuple);
1525 return res;
1526}
1527
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001528/* --- UTF-7 Codec -------------------------------------------------------- */
1529
1530/* see RFC2152 for details */
1531
/* Classification of the 128 ASCII code points for the UTF-7 codec,
   indexed by character value.  The table is only ever read, so it is
   declared const.

   Each entry indicates whether a UTF-7 character is special, i.e. cannot
   be directly encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional)

   Entry 0 must stay 1: the SPECIAL() macro relies on it. */
static
const char utf7_special[128] = {
    /* 0x00 - 0x0f: control characters; TAB, LF and CR are whitespace */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f: control characters */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f: space ! " # $ % & ' ( ) * + , - . / */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f: 0-9 : ; < = > ? */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f: @ A-O */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f: P-Z [ \ ] ^ _ */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f: ` a-o */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f: p-z { | } ~ DEL */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
1550
/* Helper macros shared by the UTF-7 decoder and encoder below.  All
   macro arguments are parenthesized and the multi-statement macros are
   wrapped in do { ... } while (0) so that each one expands to exactly
   one statement. */

/* SPECIAL(c, encodeO, encodeWS) is true when character c cannot be
   written directly: it lies outside 1..127, is classified 1 in
   utf7_special, is whitespace (class 2) while encodeWS is set, or is
   in Set O (class 3) while encodeO is set.

   Note: The comparison (c) <= 0 is a trick to work around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */
#define SPECIAL(c, encodeO, encodeWS)                           \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 ||         \
     ((encodeWS) && (utf7_special[(c)] == 2)) ||                \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* B64(n): the base64 alphabet character for the low 6 bits of n. */
#define B64(n)                                                  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true when c belongs to the base64 alphabet.  ISALNUM is
   defined elsewhere in this file. */
#define B64CHAR(c)                                              \
    (ISALNUM(c) || (c) == '+' || (c) == '/')

/* UB64(c): the 6-bit value of base64 character c ('A'-'Z' -> 0..25,
   'a'-'z' -> 26..51, '0'-'9' -> 52..61, '+' -> 62, '/' -> 63).  The
   result is only meaningful when B64CHAR(c) holds. */
#define UB64(c)                                                 \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?           \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE(out, ch, bits): while the bit accumulator ch still holds at
   least 6 of its `bits` pending bits, append the base64 character for
   the topmost 6 pending bits to *out and reduce bits by 6. */
#define ENCODE(out, ch, bits)                                   \
    do {                                                        \
        while ((bits) >= 6) {                                   \
            *(out)++ = B64((ch) >> ((bits) - 6));               \
            (bits) -= 6;                                        \
        }                                                       \
    } while (0)

/* DECODE(out, ch, bits, surrogate): while the bit accumulator ch holds
   at least 16 pending bits, take the topmost 16 as one code unit.

   - If `surrogate` is set, the unit is dropped and the flag cleared:
     an error was already generated for the preceding unit, so we do
     not bother checking whether this one is correct.
   - Otherwise a unit in 0xDC00..0xDFFF is treated as part of a
     surrogate pair, which cannot be represented in a 16-bit character:
     `surrogate` is set, errmsg is assigned and control jumps to
     utf7Error.
   - Any other unit is appended to *out.

   The expansion assigns to a variable named `errmsg` and jumps to a
   label named `utf7Error`; both must exist in the calling function. */
#define DECODE(out, ch, bits, surrogate)                        \
    do {                                                        \
        while ((bits) >= 16) {                                  \
            Py_UNICODE outCh =                                  \
                (Py_UNICODE) (((ch) >> ((bits) - 16)) & 0xffff); \
            (bits) -= 16;                                       \
            if (surrogate) {                                    \
                (surrogate) = 0;                                \
            } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {    \
                (surrogate) = 1;                                \
                errmsg = "code pairs are not supported";        \
                goto utf7Error;                                 \
            } else {                                            \
                *(out)++ = outCh;                               \
            }                                                   \
        }                                                       \
    } while (0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001593
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001594PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001595 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001596 const char *errors)
1597{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001598 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1599}
1600
1601PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
1602 Py_ssize_t size,
1603 const char *errors,
1604 Py_ssize_t *consumed)
1605{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001606 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001607 Py_ssize_t startinpos;
1608 Py_ssize_t endinpos;
1609 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001610 const char *e;
1611 PyUnicodeObject *unicode;
1612 Py_UNICODE *p;
1613 const char *errmsg = "";
1614 int inShift = 0;
1615 unsigned int bitsleft = 0;
1616 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001617 int surrogate = 0;
1618 PyObject *errorHandler = NULL;
1619 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001620
1621 unicode = _PyUnicode_New(size);
1622 if (!unicode)
1623 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001624 if (size == 0) {
1625 if (consumed)
1626 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001627 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001628 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001629
1630 p = unicode->str;
1631 e = s + size;
1632
1633 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001634 Py_UNICODE ch;
1635 restart:
1636 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001637
1638 if (inShift) {
1639 if ((ch == '-') || !B64CHAR(ch)) {
1640 inShift = 0;
1641 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001642
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001643 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1644 if (bitsleft >= 6) {
1645 /* The shift sequence has a partial character in it. If
1646 bitsleft < 6 then we could just classify it as padding
1647 but that is not the case here */
1648
1649 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001650 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001651 }
1652 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001653 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001654 here so indicate the potential of a misencoded character. */
1655
1656 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1657 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1658 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001659 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001660 }
1661
1662 if (ch == '-') {
1663 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001664 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001665 inShift = 1;
1666 }
1667 } else if (SPECIAL(ch,0,0)) {
1668 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001669 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001670 } else {
1671 *p++ = ch;
1672 }
1673 } else {
1674 charsleft = (charsleft << 6) | UB64(ch);
1675 bitsleft += 6;
1676 s++;
1677 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1678 }
1679 }
1680 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001681 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001682 s++;
1683 if (s < e && *s == '-') {
1684 s++;
1685 *p++ = '+';
1686 } else
1687 {
1688 inShift = 1;
1689 bitsleft = 0;
1690 }
1691 }
1692 else if (SPECIAL(ch,0,0)) {
Walter Dörwald2b65c752007-08-30 15:35:26 +00001693 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001694 errmsg = "unexpected special character";
1695 s++;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001696 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001697 }
1698 else {
1699 *p++ = ch;
1700 s++;
1701 }
1702 continue;
1703 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001704 outpos = p-PyUnicode_AS_UNICODE(unicode);
1705 endinpos = s-starts;
1706 if (unicode_decode_call_errorhandler(
1707 errors, &errorHandler,
1708 "utf7", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001709 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001710 (PyObject **)&unicode, &outpos, &p))
1711 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001712 }
1713
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001714 if (inShift && !consumed) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001715 outpos = p-PyUnicode_AS_UNICODE(unicode);
1716 endinpos = size;
1717 if (unicode_decode_call_errorhandler(
1718 errors, &errorHandler,
1719 "utf7", "unterminated shift sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00001720 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001721 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001722 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001723 if (s < e)
1724 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001725 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001726 if (consumed) {
1727 if(inShift)
1728 *consumed = startinpos;
1729 else
1730 *consumed = s-starts;
1731 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001732
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001733 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001734 goto onError;
1735
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001736 Py_XDECREF(errorHandler);
1737 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001738 return (PyObject *)unicode;
1739
1740onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001741 Py_XDECREF(errorHandler);
1742 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001743 Py_DECREF(unicode);
1744 return NULL;
1745}
1746
1747
1748PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001749 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001750 int encodeSetO,
1751 int encodeWhiteSpace,
1752 const char *errors)
1753{
Guido van Rossum98297ee2007-11-06 21:34:58 +00001754 PyObject *v, *result;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001755 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001756 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001757 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001758 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001759 unsigned int bitsleft = 0;
1760 unsigned long charsleft = 0;
1761 char * out;
1762 char * start;
1763
1764 if (size == 0)
Christian Heimesf3863112007-11-22 07:46:41 +00001765 return PyString_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001766
Walter Dörwald51ab4142007-05-05 14:43:36 +00001767 v = PyBytes_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001768 if (v == NULL)
1769 return NULL;
1770
Walter Dörwald51ab4142007-05-05 14:43:36 +00001771 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001772 for (;i < size; ++i) {
1773 Py_UNICODE ch = s[i];
1774
1775 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001776 if (ch == '+') {
1777 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001778 *out++ = '-';
1779 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1780 charsleft = ch;
1781 bitsleft = 16;
1782 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001783 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001784 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001785 } else {
1786 *out++ = (char) ch;
1787 }
1788 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001789 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1790 *out++ = B64(charsleft << (6-bitsleft));
1791 charsleft = 0;
1792 bitsleft = 0;
1793 /* Characters not in the BASE64 set implicitly unshift the sequence
1794 so no '-' is required, except if the character is itself a '-' */
1795 if (B64CHAR(ch) || ch == '-') {
1796 *out++ = '-';
1797 }
1798 inShift = 0;
1799 *out++ = (char) ch;
1800 } else {
1801 bitsleft += 16;
1802 charsleft = (charsleft << 16) | ch;
1803 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1804
1805 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001806 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001807 or '-' then the shift sequence will be terminated implicitly and we
1808 don't have to insert a '-'. */
1809
1810 if (bitsleft == 0) {
1811 if (i + 1 < size) {
1812 Py_UNICODE ch2 = s[i+1];
1813
1814 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001815
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001816 } else if (B64CHAR(ch2) || ch2 == '-') {
1817 *out++ = '-';
1818 inShift = 0;
1819 } else {
1820 inShift = 0;
1821 }
1822
1823 }
1824 else {
1825 *out++ = '-';
1826 inShift = 0;
1827 }
1828 }
Tim Petersced69f82003-09-16 20:30:58 +00001829 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001830 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001831 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001832 if (bitsleft) {
1833 *out++= B64(charsleft << (6-bitsleft) );
1834 *out++ = '-';
1835 }
1836
Guido van Rossum98297ee2007-11-06 21:34:58 +00001837 result = PyString_FromStringAndSize(PyBytes_AS_STRING(v), out - start);
1838 Py_DECREF(v);
1839 return result;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001840}
1841
1842#undef SPECIAL
1843#undef B64
1844#undef B64CHAR
1845#undef UB64
1846#undef ENCODE
1847#undef DECODE
1848
Guido van Rossumd57fd912000-03-10 22:53:23 +00001849/* --- UTF-8 Codec -------------------------------------------------------- */
1850
/* Map UTF-8 encoded prefix byte to sequence length.  Zero means
   illegal prefix.  See RFC 2279 for details.  The table is indexed by
   the first byte of a sequence and is only ever read, so it is
   declared const. */
static
const char utf8_code_length[256] = {
    /* 0x00 - 0x7f: single-byte (ASCII) sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xbf: continuation bytes, illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xc0 - 0xdf: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xe0 - 0xef: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xf0 - 0xf7: four bytes; 0xf8 - 0xfb: five; 0xfc - 0xfd: six;
       0xfe - 0xff: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1872
Guido van Rossumd57fd912000-03-10 22:53:23 +00001873PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001874 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001875 const char *errors)
1876{
Walter Dörwald69652032004-09-07 20:24:22 +00001877 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1878}
1879
1880PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001881 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001882 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001883 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001884{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001885 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001886 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001887 Py_ssize_t startinpos;
1888 Py_ssize_t endinpos;
1889 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001890 const char *e;
1891 PyUnicodeObject *unicode;
1892 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001893 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001894 PyObject *errorHandler = NULL;
1895 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001896
1897 /* Note: size will always be longer than the resulting Unicode
1898 character count */
1899 unicode = _PyUnicode_New(size);
1900 if (!unicode)
1901 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001902 if (size == 0) {
1903 if (consumed)
1904 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001905 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001906 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001907
1908 /* Unpack UTF-8 encoded data */
1909 p = unicode->str;
1910 e = s + size;
1911
1912 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001913 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001914
1915 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001916 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001917 s++;
1918 continue;
1919 }
1920
1921 n = utf8_code_length[ch];
1922
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001923 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001924 if (consumed)
1925 break;
1926 else {
1927 errmsg = "unexpected end of data";
1928 startinpos = s-starts;
1929 endinpos = size;
1930 goto utf8Error;
1931 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001932 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001933
1934 switch (n) {
1935
1936 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001937 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001938 startinpos = s-starts;
1939 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001940 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001941
1942 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001943 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001944 startinpos = s-starts;
1945 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001946 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001947
1948 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001949 if ((s[1] & 0xc0) != 0x80) {
1950 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001951 startinpos = s-starts;
1952 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001953 goto utf8Error;
1954 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001955 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001956 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001957 startinpos = s-starts;
1958 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001959 errmsg = "illegal encoding";
1960 goto utf8Error;
1961 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001962 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001963 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001964 break;
1965
1966 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001967 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001968 (s[2] & 0xc0) != 0x80) {
1969 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001970 startinpos = s-starts;
1971 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001972 goto utf8Error;
1973 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001974 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001975 if (ch < 0x0800) {
1976 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001977 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001978
1979 XXX For wide builds (UCS-4) we should probably try
1980 to recombine the surrogates into a single code
1981 unit.
1982 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001983 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001984 startinpos = s-starts;
1985 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001986 goto utf8Error;
1987 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001988 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001989 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001990 break;
1991
1992 case 4:
1993 if ((s[1] & 0xc0) != 0x80 ||
1994 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001995 (s[3] & 0xc0) != 0x80) {
1996 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001997 startinpos = s-starts;
1998 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001999 goto utf8Error;
2000 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002001 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2002 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2003 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002004 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002005 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002006 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002007 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002008 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002009 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002010 startinpos = s-starts;
2011 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002012 goto utf8Error;
2013 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00002014#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002015 *p++ = (Py_UNICODE)ch;
2016#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002017 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002018
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002019 /* translate from 10000..10FFFF to 0..FFFF */
2020 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002021
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002022 /* high surrogate = top 10 bits added to D800 */
2023 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002024
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002025 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002026 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002027#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002028 break;
2029
2030 default:
2031 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002032 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002033 startinpos = s-starts;
2034 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002035 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002036 }
2037 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002038 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002039
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002040 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002041 outpos = p-PyUnicode_AS_UNICODE(unicode);
2042 if (unicode_decode_call_errorhandler(
2043 errors, &errorHandler,
2044 "utf8", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002045 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002046 (PyObject **)&unicode, &outpos, &p))
2047 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002048 }
Walter Dörwald69652032004-09-07 20:24:22 +00002049 if (consumed)
2050 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002051
2052 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002053 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002054 goto onError;
2055
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002056 Py_XDECREF(errorHandler);
2057 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002058 return (PyObject *)unicode;
2059
2060onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002061 Py_XDECREF(errorHandler);
2062 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002063 Py_DECREF(unicode);
2064 return NULL;
2065}
2066
Tim Peters602f7402002-04-27 18:03:26 +00002067/* Allocation strategy: if the string is short, convert into a stack buffer
2068 and allocate exactly as much space needed at the end. Else allocate the
2069 maximum possible needed (4 result bytes per Unicode character), and return
2070 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002071*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002072PyObject *
2073PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002074 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00002075 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002076{
Tim Peters602f7402002-04-27 18:03:26 +00002077#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002078
Guido van Rossum98297ee2007-11-06 21:34:58 +00002079 Py_ssize_t i; /* index into s of next input byte */
2080 PyObject *result; /* result string object */
2081 char *p; /* next free byte in output buffer */
2082 Py_ssize_t nallocated; /* number of result bytes allocated */
2083 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002084 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002085
Tim Peters602f7402002-04-27 18:03:26 +00002086 assert(s != NULL);
2087 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002088
Tim Peters602f7402002-04-27 18:03:26 +00002089 if (size <= MAX_SHORT_UNICHARS) {
2090 /* Write into the stack buffer; nallocated can't overflow.
2091 * At the end, we'll allocate exactly as much heap space as it
2092 * turns out we need.
2093 */
2094 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002095 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002096 p = stackbuf;
2097 }
2098 else {
2099 /* Overallocate on the heap, and give the excess back at the end. */
2100 nallocated = size * 4;
2101 if (nallocated / 4 != size) /* overflow! */
2102 return PyErr_NoMemory();
Guido van Rossum98297ee2007-11-06 21:34:58 +00002103 result = PyString_FromStringAndSize(NULL, nallocated);
2104 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002105 return NULL;
Guido van Rossum98297ee2007-11-06 21:34:58 +00002106 p = PyString_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002107 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002108
Tim Peters602f7402002-04-27 18:03:26 +00002109 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002110 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002111
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002112 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002113 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002114 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002115
Guido van Rossumd57fd912000-03-10 22:53:23 +00002116 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002117 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002118 *p++ = (char)(0xc0 | (ch >> 6));
2119 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002120 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002121 else {
Tim Peters602f7402002-04-27 18:03:26 +00002122 /* Encode UCS2 Unicode ordinals */
2123 if (ch < 0x10000) {
2124 /* Special case: check for high surrogate */
2125 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2126 Py_UCS4 ch2 = s[i];
2127 /* Check for low surrogate and combine the two to
2128 form a UCS4 value */
2129 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002130 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002131 i++;
2132 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002133 }
Tim Peters602f7402002-04-27 18:03:26 +00002134 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002135 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002136 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002137 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2138 *p++ = (char)(0x80 | (ch & 0x3f));
2139 continue;
2140 }
2141encodeUCS4:
2142 /* Encode UCS4 Unicode ordinals */
2143 *p++ = (char)(0xf0 | (ch >> 18));
2144 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2145 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2146 *p++ = (char)(0x80 | (ch & 0x3f));
2147 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002148 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002149
Guido van Rossum98297ee2007-11-06 21:34:58 +00002150 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002151 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002152 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002153 assert(nneeded <= nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002154 result = PyString_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002155 }
2156 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002157 /* Cut back to size actually needed. */
Guido van Rossum98297ee2007-11-06 21:34:58 +00002158 nneeded = p - PyString_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002159 assert(nneeded <= nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002160 _PyString_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002161 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002162 return result;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002163
Tim Peters602f7402002-04-27 18:03:26 +00002164#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002165}
2166
Guido van Rossumd57fd912000-03-10 22:53:23 +00002167PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2168{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002169 if (!PyUnicode_Check(unicode)) {
2170 PyErr_BadArgument();
2171 return NULL;
2172 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002173 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2174 PyUnicode_GET_SIZE(unicode),
2175 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002176}
2177
Walter Dörwald41980ca2007-08-16 21:55:45 +00002178/* --- UTF-32 Codec ------------------------------------------------------- */
2179
2180PyObject *
2181PyUnicode_DecodeUTF32(const char *s,
2182 Py_ssize_t size,
2183 const char *errors,
2184 int *byteorder)
2185{
2186 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2187}
2188
2189PyObject *
2190PyUnicode_DecodeUTF32Stateful(const char *s,
2191 Py_ssize_t size,
2192 const char *errors,
2193 int *byteorder,
2194 Py_ssize_t *consumed)
2195{
2196 const char *starts = s;
2197 Py_ssize_t startinpos;
2198 Py_ssize_t endinpos;
2199 Py_ssize_t outpos;
2200 PyUnicodeObject *unicode;
2201 Py_UNICODE *p;
2202#ifndef Py_UNICODE_WIDE
2203 int i, pairs;
2204#else
2205 const int pairs = 0;
2206#endif
2207 const unsigned char *q, *e;
2208 int bo = 0; /* assume native ordering by default */
2209 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002210 /* Offsets from q for retrieving bytes in the right order. */
2211#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2212 int iorder[] = {0, 1, 2, 3};
2213#else
2214 int iorder[] = {3, 2, 1, 0};
2215#endif
2216 PyObject *errorHandler = NULL;
2217 PyObject *exc = NULL;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002218 /* On narrow builds we split characters outside the BMP into two
2219 codepoints => count how much extra space we need. */
2220#ifndef Py_UNICODE_WIDE
2221 for (i = pairs = 0; i < size/4; i++)
2222 if (((Py_UCS4 *)s)[i] >= 0x10000)
2223 pairs++;
2224#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00002225
2226 /* This might be one to much, because of a BOM */
2227 unicode = _PyUnicode_New((size+3)/4+pairs);
2228 if (!unicode)
2229 return NULL;
2230 if (size == 0)
2231 return (PyObject *)unicode;
2232
2233 /* Unpack UTF-32 encoded data */
2234 p = unicode->str;
2235 q = (unsigned char *)s;
2236 e = q + size;
2237
2238 if (byteorder)
2239 bo = *byteorder;
2240
2241 /* Check for BOM marks (U+FEFF) in the input and adjust current
2242 byte order setting accordingly. In native mode, the leading BOM
2243 mark is skipped, in all other modes, it is copied to the output
2244 stream as-is (giving a ZWNBSP character). */
2245 if (bo == 0) {
2246 if (size >= 4) {
2247 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2248 (q[iorder[1]] << 8) | q[iorder[0]];
2249#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2250 if (bom == 0x0000FEFF) {
2251 q += 4;
2252 bo = -1;
2253 }
2254 else if (bom == 0xFFFE0000) {
2255 q += 4;
2256 bo = 1;
2257 }
2258#else
2259 if (bom == 0x0000FEFF) {
2260 q += 4;
2261 bo = 1;
2262 }
2263 else if (bom == 0xFFFE0000) {
2264 q += 4;
2265 bo = -1;
2266 }
2267#endif
2268 }
2269 }
2270
2271 if (bo == -1) {
2272 /* force LE */
2273 iorder[0] = 0;
2274 iorder[1] = 1;
2275 iorder[2] = 2;
2276 iorder[3] = 3;
2277 }
2278 else if (bo == 1) {
2279 /* force BE */
2280 iorder[0] = 3;
2281 iorder[1] = 2;
2282 iorder[2] = 1;
2283 iorder[3] = 0;
2284 }
2285
2286 while (q < e) {
2287 Py_UCS4 ch;
2288 /* remaining bytes at the end? (size should be divisible by 4) */
2289 if (e-q<4) {
2290 if (consumed)
2291 break;
2292 errmsg = "truncated data";
2293 startinpos = ((const char *)q)-starts;
2294 endinpos = ((const char *)e)-starts;
2295 goto utf32Error;
2296 /* The remaining input chars are ignored if the callback
2297 chooses to skip the input */
2298 }
2299 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2300 (q[iorder[1]] << 8) | q[iorder[0]];
2301
2302 if (ch >= 0x110000)
2303 {
2304 errmsg = "codepoint not in range(0x110000)";
2305 startinpos = ((const char *)q)-starts;
2306 endinpos = startinpos+4;
2307 goto utf32Error;
2308 }
2309#ifndef Py_UNICODE_WIDE
2310 if (ch >= 0x10000)
2311 {
2312 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2313 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2314 }
2315 else
2316#endif
2317 *p++ = ch;
2318 q += 4;
2319 continue;
2320 utf32Error:
2321 outpos = p-PyUnicode_AS_UNICODE(unicode);
2322 if (unicode_decode_call_errorhandler(
2323 errors, &errorHandler,
2324 "utf32", errmsg,
2325 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2326 (PyObject **)&unicode, &outpos, &p))
2327 goto onError;
2328 }
2329
2330 if (byteorder)
2331 *byteorder = bo;
2332
2333 if (consumed)
2334 *consumed = (const char *)q-starts;
2335
2336 /* Adjust length */
2337 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2338 goto onError;
2339
2340 Py_XDECREF(errorHandler);
2341 Py_XDECREF(exc);
2342 return (PyObject *)unicode;
2343
2344onError:
2345 Py_DECREF(unicode);
2346 Py_XDECREF(errorHandler);
2347 Py_XDECREF(exc);
2348 return NULL;
2349}
2350
2351PyObject *
2352PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2353 Py_ssize_t size,
2354 const char *errors,
2355 int byteorder)
2356{
Guido van Rossum98297ee2007-11-06 21:34:58 +00002357 PyObject *v, *result;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002358 unsigned char *p;
2359#ifndef Py_UNICODE_WIDE
2360 int i, pairs;
2361#else
2362 const int pairs = 0;
2363#endif
2364 /* Offsets from p for storing byte pairs in the right order. */
2365#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2366 int iorder[] = {0, 1, 2, 3};
2367#else
2368 int iorder[] = {3, 2, 1, 0};
2369#endif
2370
2371#define STORECHAR(CH) \
2372 do { \
2373 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2374 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2375 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2376 p[iorder[0]] = (CH) & 0xff; \
2377 p += 4; \
2378 } while(0)
2379
2380 /* In narrow builds we can output surrogate pairs as one codepoint,
2381 so we need less space. */
2382#ifndef Py_UNICODE_WIDE
2383 for (i = pairs = 0; i < size-1; i++)
2384 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2385 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2386 pairs++;
2387#endif
2388 v = PyBytes_FromStringAndSize(NULL,
2389 4 * (size - pairs + (byteorder == 0)));
2390 if (v == NULL)
2391 return NULL;
2392
2393 p = (unsigned char *)PyBytes_AS_STRING(v);
2394 if (byteorder == 0)
2395 STORECHAR(0xFEFF);
2396 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002397 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002398
2399 if (byteorder == -1) {
2400 /* force LE */
2401 iorder[0] = 0;
2402 iorder[1] = 1;
2403 iorder[2] = 2;
2404 iorder[3] = 3;
2405 }
2406 else if (byteorder == 1) {
2407 /* force BE */
2408 iorder[0] = 3;
2409 iorder[1] = 2;
2410 iorder[2] = 1;
2411 iorder[3] = 0;
2412 }
2413
2414 while (size-- > 0) {
2415 Py_UCS4 ch = *s++;
2416#ifndef Py_UNICODE_WIDE
2417 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2418 Py_UCS4 ch2 = *s;
2419 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2420 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2421 s++;
2422 size--;
2423 }
2424 }
2425#endif
2426 STORECHAR(ch);
2427 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002428
2429 done:
Christian Heimes90aa7642007-12-19 02:45:37 +00002430 result = PyString_FromStringAndSize(PyBytes_AS_STRING(v), Py_SIZE(v));
Guido van Rossum98297ee2007-11-06 21:34:58 +00002431 Py_DECREF(v);
2432 return result;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002433#undef STORECHAR
2434}
2435
2436PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2437{
2438 if (!PyUnicode_Check(unicode)) {
2439 PyErr_BadArgument();
2440 return NULL;
2441 }
2442 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2443 PyUnicode_GET_SIZE(unicode),
2444 NULL,
2445 0);
2446}
2447
Guido van Rossumd57fd912000-03-10 22:53:23 +00002448/* --- UTF-16 Codec ------------------------------------------------------- */
2449
Tim Peters772747b2001-08-09 22:21:55 +00002450PyObject *
2451PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002452 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002453 const char *errors,
2454 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002455{
Walter Dörwald69652032004-09-07 20:24:22 +00002456 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2457}
2458
2459PyObject *
2460PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002461 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002462 const char *errors,
2463 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002464 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002465{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002466 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002467 Py_ssize_t startinpos;
2468 Py_ssize_t endinpos;
2469 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002470 PyUnicodeObject *unicode;
2471 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002472 const unsigned char *q, *e;
2473 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002474 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002475 /* Offsets from q for retrieving byte pairs in the right order. */
2476#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2477 int ihi = 1, ilo = 0;
2478#else
2479 int ihi = 0, ilo = 1;
2480#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002481 PyObject *errorHandler = NULL;
2482 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002483
2484 /* Note: size will always be longer than the resulting Unicode
2485 character count */
2486 unicode = _PyUnicode_New(size);
2487 if (!unicode)
2488 return NULL;
2489 if (size == 0)
2490 return (PyObject *)unicode;
2491
2492 /* Unpack UTF-16 encoded data */
2493 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002494 q = (unsigned char *)s;
2495 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002496
2497 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002498 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002499
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002500 /* Check for BOM marks (U+FEFF) in the input and adjust current
2501 byte order setting accordingly. In native mode, the leading BOM
2502 mark is skipped, in all other modes, it is copied to the output
2503 stream as-is (giving a ZWNBSP character). */
2504 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002505 if (size >= 2) {
2506 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002507#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002508 if (bom == 0xFEFF) {
2509 q += 2;
2510 bo = -1;
2511 }
2512 else if (bom == 0xFFFE) {
2513 q += 2;
2514 bo = 1;
2515 }
Tim Petersced69f82003-09-16 20:30:58 +00002516#else
Walter Dörwald69652032004-09-07 20:24:22 +00002517 if (bom == 0xFEFF) {
2518 q += 2;
2519 bo = 1;
2520 }
2521 else if (bom == 0xFFFE) {
2522 q += 2;
2523 bo = -1;
2524 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002525#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002526 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002527 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002528
Tim Peters772747b2001-08-09 22:21:55 +00002529 if (bo == -1) {
2530 /* force LE */
2531 ihi = 1;
2532 ilo = 0;
2533 }
2534 else if (bo == 1) {
2535 /* force BE */
2536 ihi = 0;
2537 ilo = 1;
2538 }
2539
2540 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002541 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002542 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002543 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002544 if (consumed)
2545 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002546 errmsg = "truncated data";
2547 startinpos = ((const char *)q)-starts;
2548 endinpos = ((const char *)e)-starts;
2549 goto utf16Error;
2550 /* The remaining input chars are ignored if the callback
2551 chooses to skip the input */
2552 }
2553 ch = (q[ihi] << 8) | q[ilo];
2554
Tim Peters772747b2001-08-09 22:21:55 +00002555 q += 2;
2556
Guido van Rossumd57fd912000-03-10 22:53:23 +00002557 if (ch < 0xD800 || ch > 0xDFFF) {
2558 *p++ = ch;
2559 continue;
2560 }
2561
2562 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002563 if (q >= e) {
2564 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002565 startinpos = (((const char *)q)-2)-starts;
2566 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002567 goto utf16Error;
2568 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002569 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002570 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2571 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002572 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002573#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002574 *p++ = ch;
2575 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002576#else
2577 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002578#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002579 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002580 }
2581 else {
2582 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002583 startinpos = (((const char *)q)-4)-starts;
2584 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002585 goto utf16Error;
2586 }
2587
Guido van Rossumd57fd912000-03-10 22:53:23 +00002588 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002589 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002590 startinpos = (((const char *)q)-2)-starts;
2591 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002592 /* Fall through to report the error */
2593
2594 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002595 outpos = p-PyUnicode_AS_UNICODE(unicode);
2596 if (unicode_decode_call_errorhandler(
2597 errors, &errorHandler,
2598 "utf16", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002599 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002600 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002601 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002602 }
2603
2604 if (byteorder)
2605 *byteorder = bo;
2606
Walter Dörwald69652032004-09-07 20:24:22 +00002607 if (consumed)
2608 *consumed = (const char *)q-starts;
2609
Guido van Rossumd57fd912000-03-10 22:53:23 +00002610 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002611 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002612 goto onError;
2613
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002614 Py_XDECREF(errorHandler);
2615 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002616 return (PyObject *)unicode;
2617
2618onError:
2619 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002620 Py_XDECREF(errorHandler);
2621 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002622 return NULL;
2623}
2624
Tim Peters772747b2001-08-09 22:21:55 +00002625PyObject *
2626PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002627 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002628 const char *errors,
2629 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002630{
Guido van Rossum98297ee2007-11-06 21:34:58 +00002631 PyObject *v, *result;
Tim Peters772747b2001-08-09 22:21:55 +00002632 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002633#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00002634 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002635#else
2636 const int pairs = 0;
2637#endif
Tim Peters772747b2001-08-09 22:21:55 +00002638 /* Offsets from p for storing byte pairs in the right order. */
2639#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2640 int ihi = 1, ilo = 0;
2641#else
2642 int ihi = 0, ilo = 1;
2643#endif
2644
2645#define STORECHAR(CH) \
2646 do { \
2647 p[ihi] = ((CH) >> 8) & 0xff; \
2648 p[ilo] = (CH) & 0xff; \
2649 p += 2; \
2650 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002651
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002652#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002653 for (i = pairs = 0; i < size; i++)
2654 if (s[i] >= 0x10000)
2655 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002656#endif
Walter Dörwald3cc34522007-05-04 10:48:27 +00002657 v = PyBytes_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002658 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002659 if (v == NULL)
2660 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002661
Walter Dörwald3cc34522007-05-04 10:48:27 +00002662 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002663 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002664 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002665 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002666 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00002667
2668 if (byteorder == -1) {
2669 /* force LE */
2670 ihi = 1;
2671 ilo = 0;
2672 }
2673 else if (byteorder == 1) {
2674 /* force BE */
2675 ihi = 0;
2676 ilo = 1;
2677 }
2678
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002679 while (size-- > 0) {
2680 Py_UNICODE ch = *s++;
2681 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002682#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002683 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002684 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2685 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002686 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002687#endif
Tim Peters772747b2001-08-09 22:21:55 +00002688 STORECHAR(ch);
2689 if (ch2)
2690 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002691 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002692
2693 done:
Christian Heimes90aa7642007-12-19 02:45:37 +00002694 result = PyString_FromStringAndSize(PyBytes_AS_STRING(v), Py_SIZE(v));
Guido van Rossum98297ee2007-11-06 21:34:58 +00002695 Py_DECREF(v);
2696 return result;
Tim Peters772747b2001-08-09 22:21:55 +00002697#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002698}
2699
2700PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2701{
2702 if (!PyUnicode_Check(unicode)) {
2703 PyErr_BadArgument();
2704 return NULL;
2705 }
2706 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2707 PyUnicode_GET_SIZE(unicode),
2708 NULL,
2709 0);
2710}
2711
2712/* --- Unicode Escape Codec ----------------------------------------------- */
2713
Fredrik Lundh06d12682001-01-24 07:59:11 +00002714static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002715
Guido van Rossumd57fd912000-03-10 22:53:23 +00002716PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002717 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002718 const char *errors)
2719{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002720 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002721 Py_ssize_t startinpos;
2722 Py_ssize_t endinpos;
2723 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002724 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002725 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002726 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002727 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002728 char* message;
2729 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002730 PyObject *errorHandler = NULL;
2731 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002732
Guido van Rossumd57fd912000-03-10 22:53:23 +00002733 /* Escaped strings will always be longer than the resulting
2734 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002735 length after conversion to the true value.
2736 (but if the error callback returns a long replacement string
2737 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002738 v = _PyUnicode_New(size);
2739 if (v == NULL)
2740 goto onError;
2741 if (size == 0)
2742 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002743
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002744 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002745 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002746
Guido van Rossumd57fd912000-03-10 22:53:23 +00002747 while (s < end) {
2748 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002749 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002750 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002751
2752 /* Non-escape characters are interpreted as Unicode ordinals */
2753 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002754 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002755 continue;
2756 }
2757
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002758 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002759 /* \ - Escapes */
2760 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002761 c = *s++;
2762 if (s > end)
2763 c = '\0'; /* Invalid after \ */
2764 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002765
2766 /* \x escapes */
2767 case '\n': break;
2768 case '\\': *p++ = '\\'; break;
2769 case '\'': *p++ = '\''; break;
2770 case '\"': *p++ = '\"'; break;
2771 case 'b': *p++ = '\b'; break;
2772 case 'f': *p++ = '\014'; break; /* FF */
2773 case 't': *p++ = '\t'; break;
2774 case 'n': *p++ = '\n'; break;
2775 case 'r': *p++ = '\r'; break;
2776 case 'v': *p++ = '\013'; break; /* VT */
2777 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2778
2779 /* \OOO (octal) escapes */
2780 case '0': case '1': case '2': case '3':
2781 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002782 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002783 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002784 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002785 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002786 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002787 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002788 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002789 break;
2790
Fredrik Lundhccc74732001-02-18 22:13:49 +00002791 /* hex escapes */
2792 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002793 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002794 digits = 2;
2795 message = "truncated \\xXX escape";
2796 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002797
Fredrik Lundhccc74732001-02-18 22:13:49 +00002798 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002799 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002800 digits = 4;
2801 message = "truncated \\uXXXX escape";
2802 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002803
Fredrik Lundhccc74732001-02-18 22:13:49 +00002804 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002805 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002806 digits = 8;
2807 message = "truncated \\UXXXXXXXX escape";
2808 hexescape:
2809 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002810 outpos = p-PyUnicode_AS_UNICODE(v);
2811 if (s+digits>end) {
2812 endinpos = size;
2813 if (unicode_decode_call_errorhandler(
2814 errors, &errorHandler,
2815 "unicodeescape", "end of string in escape sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002816 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002817 (PyObject **)&v, &outpos, &p))
2818 goto onError;
2819 goto nextByte;
2820 }
2821 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002822 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00002823 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002824 endinpos = (s+i+1)-starts;
2825 if (unicode_decode_call_errorhandler(
2826 errors, &errorHandler,
2827 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002828 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002829 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002830 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002831 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002832 }
2833 chr = (chr<<4) & ~0xF;
2834 if (c >= '0' && c <= '9')
2835 chr += c - '0';
2836 else if (c >= 'a' && c <= 'f')
2837 chr += 10 + c - 'a';
2838 else
2839 chr += 10 + c - 'A';
2840 }
2841 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002842 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002843 /* _decoding_error will have already written into the
2844 target buffer. */
2845 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002846 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002847 /* when we get here, chr is a 32-bit unicode character */
2848 if (chr <= 0xffff)
2849 /* UCS-2 character */
2850 *p++ = (Py_UNICODE) chr;
2851 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002852 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002853 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002854#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002855 *p++ = chr;
2856#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002857 chr -= 0x10000L;
2858 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002859 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002860#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002861 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002862 endinpos = s-starts;
2863 outpos = p-PyUnicode_AS_UNICODE(v);
2864 if (unicode_decode_call_errorhandler(
2865 errors, &errorHandler,
2866 "unicodeescape", "illegal Unicode character",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002867 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002868 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002869 goto onError;
2870 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002871 break;
2872
2873 /* \N{name} */
2874 case 'N':
2875 message = "malformed \\N character escape";
2876 if (ucnhash_CAPI == NULL) {
2877 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002878 PyObject *m, *api;
Christian Heimes072c0f12008-01-03 23:01:04 +00002879 m = PyImport_ImportModuleNoBlock("unicodedata");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002880 if (m == NULL)
2881 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002882 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002883 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002884 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002885 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002886 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002887 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002888 if (ucnhash_CAPI == NULL)
2889 goto ucnhashError;
2890 }
2891 if (*s == '{') {
2892 const char *start = s+1;
2893 /* look for the closing brace */
2894 while (*s != '}' && s < end)
2895 s++;
2896 if (s > start && s < end && *s == '}') {
2897 /* found a name. look it up in the unicode database */
2898 message = "unknown Unicode character name";
2899 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002900 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002901 goto store;
2902 }
2903 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002904 endinpos = s-starts;
2905 outpos = p-PyUnicode_AS_UNICODE(v);
2906 if (unicode_decode_call_errorhandler(
2907 errors, &errorHandler,
2908 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002909 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002910 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002911 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002912 break;
2913
2914 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002915 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002916 message = "\\ at end of string";
2917 s--;
2918 endinpos = s-starts;
2919 outpos = p-PyUnicode_AS_UNICODE(v);
2920 if (unicode_decode_call_errorhandler(
2921 errors, &errorHandler,
2922 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002923 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002924 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002925 goto onError;
2926 }
2927 else {
2928 *p++ = '\\';
2929 *p++ = (unsigned char)s[-1];
2930 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002931 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002932 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002933 nextByte:
2934 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002935 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002936 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002937 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002938 Py_XDECREF(errorHandler);
2939 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002940 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002941
Fredrik Lundhccc74732001-02-18 22:13:49 +00002942ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002943 PyErr_SetString(
2944 PyExc_UnicodeError,
2945 "\\N escapes not supported (can't load unicodedata module)"
2946 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002947 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002948 Py_XDECREF(errorHandler);
2949 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002950 return NULL;
2951
Fredrik Lundhccc74732001-02-18 22:13:49 +00002952onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002953 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002954 Py_XDECREF(errorHandler);
2955 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002956 return NULL;
2957}
2958
2959/* Return a Unicode-Escape string version of the Unicode object.
2960
2961 If quotes is true, the string is enclosed in u"" or u'' quotes as
2962 appropriate.
2963
2964*/
2965
Thomas Wouters477c8d52006-05-27 19:21:47 +00002966Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2967 Py_ssize_t size,
2968 Py_UNICODE ch)
2969{
2970 /* like wcschr, but doesn't stop at NULL characters */
2971
2972 while (size-- > 0) {
2973 if (*s == ch)
2974 return s;
2975 s++;
2976 }
2977
2978 return NULL;
2979}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002980
/* Lowercase hexadecimal digit characters, indexed by nibble value (0-15). */
Walter Dörwald79e913e2007-05-12 11:08:06 +00002981static const char *hexdigits = "0123456789abcdef";
2982
2983PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2984 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002985{
Guido van Rossum98297ee2007-11-06 21:34:58 +00002986 PyObject *repr, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002987 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002988
Thomas Wouters89f507f2006-12-13 04:49:30 +00002989 /* XXX(nnorwitz): rather than over-allocating, it would be
2990 better to choose a different scheme. Perhaps scan the
2991 first N-chars of the string and allocate based on that size.
2992 */
2993 /* Initial allocation is based on the longest-possible unichr
2994 escape.
2995
2996 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2997 unichr, so in this case it's the longest unichr escape. In
2998 narrow (UTF-16) builds this is five chars per source unichr
2999 since there are two unichrs in the surrogate pair, so in narrow
3000 (UTF-16) builds it's not the longest unichr escape.
3001
3002 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3003 so in the narrow (UTF-16) build case it's the longest unichr
3004 escape.
3005 */
3006
Walter Dörwald79e913e2007-05-12 11:08:06 +00003007 repr = PyBytes_FromStringAndSize(NULL,
Thomas Wouters89f507f2006-12-13 04:49:30 +00003008#ifdef Py_UNICODE_WIDE
3009 + 10*size
3010#else
3011 + 6*size
3012#endif
3013 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003014 if (repr == NULL)
3015 return NULL;
3016
Walter Dörwald79e913e2007-05-12 11:08:06 +00003017 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003018
Guido van Rossumd57fd912000-03-10 22:53:23 +00003019 while (size-- > 0) {
3020 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003021
Walter Dörwald79e913e2007-05-12 11:08:06 +00003022 /* Escape backslashes */
3023 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003024 *p++ = '\\';
3025 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003026 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003027 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003028
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003029#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003030 /* Map 21-bit characters to '\U00xxxxxx' */
3031 else if (ch >= 0x10000) {
3032 *p++ = '\\';
3033 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003034 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3035 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3036 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3037 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3038 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3039 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3040 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3041 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003042 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003043 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003044#else
3045 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003046 else if (ch >= 0xD800 && ch < 0xDC00) {
3047 Py_UNICODE ch2;
3048 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003049
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003050 ch2 = *s++;
3051 size--;
3052 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3053 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3054 *p++ = '\\';
3055 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003056 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3057 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3058 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3059 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3060 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3061 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3062 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3063 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003064 continue;
3065 }
3066 /* Fall through: isolated surrogates are copied as-is */
3067 s--;
3068 size++;
3069 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003070#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003071
Guido van Rossumd57fd912000-03-10 22:53:23 +00003072 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003073 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003074 *p++ = '\\';
3075 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003076 *p++ = hexdigits[(ch >> 12) & 0x000F];
3077 *p++ = hexdigits[(ch >> 8) & 0x000F];
3078 *p++ = hexdigits[(ch >> 4) & 0x000F];
3079 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003080 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003081
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003082 /* Map special whitespace to '\t', \n', '\r' */
3083 else if (ch == '\t') {
3084 *p++ = '\\';
3085 *p++ = 't';
3086 }
3087 else if (ch == '\n') {
3088 *p++ = '\\';
3089 *p++ = 'n';
3090 }
3091 else if (ch == '\r') {
3092 *p++ = '\\';
3093 *p++ = 'r';
3094 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003095
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003096 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003097 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003098 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003099 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003100 *p++ = hexdigits[(ch >> 4) & 0x000F];
3101 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003102 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003103
Guido van Rossumd57fd912000-03-10 22:53:23 +00003104 /* Copy everything else as-is */
3105 else
3106 *p++ = (char) ch;
3107 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003108
Guido van Rossum98297ee2007-11-06 21:34:58 +00003109 result = PyString_FromStringAndSize(PyBytes_AS_STRING(repr),
3110 p - PyBytes_AS_STRING(repr));
3111 Py_DECREF(repr);
3112 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003113}
3114
Guido van Rossumd57fd912000-03-10 22:53:23 +00003115PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3116{
Walter Dörwald79e913e2007-05-12 11:08:06 +00003117 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003118 if (!PyUnicode_Check(unicode)) {
3119 PyErr_BadArgument();
3120 return NULL;
3121 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003122 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3123 PyUnicode_GET_SIZE(unicode));
3124
3125 if (!s)
3126 return NULL;
3127 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
3128 PyBytes_GET_SIZE(s));
3129 Py_DECREF(s);
3130 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003131}
3132
3133/* --- Raw Unicode Escape Codec ------------------------------------------- */
3134
3135PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003136 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003137 const char *errors)
3138{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003139 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003140 Py_ssize_t startinpos;
3141 Py_ssize_t endinpos;
3142 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003143 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003144 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003145 const char *end;
3146 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003147 PyObject *errorHandler = NULL;
3148 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003149
Guido van Rossumd57fd912000-03-10 22:53:23 +00003150 /* Escaped strings will always be longer than the resulting
3151 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003152 length after conversion to the true value. (But decoding error
3153 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003154 v = _PyUnicode_New(size);
3155 if (v == NULL)
3156 goto onError;
3157 if (size == 0)
3158 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003159 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003160 end = s + size;
3161 while (s < end) {
3162 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00003163 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003164 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003165 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003166
3167 /* Non-escape characters are interpreted as Unicode ordinals */
3168 if (*s != '\\') {
3169 *p++ = (unsigned char)*s++;
3170 continue;
3171 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003172 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003173
3174 /* \u-escapes are only interpreted iff the number of leading
3175 backslashes if odd */
3176 bs = s;
3177 for (;s < end;) {
3178 if (*s != '\\')
3179 break;
3180 *p++ = (unsigned char)*s++;
3181 }
3182 if (((s - bs) & 1) == 0 ||
3183 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003184 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003185 continue;
3186 }
3187 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003188 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003189 s++;
3190
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003191 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003192 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003193 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003194 c = (unsigned char)*s;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003195 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003196 endinpos = s-starts;
3197 if (unicode_decode_call_errorhandler(
3198 errors, &errorHandler,
3199 "rawunicodeescape", "truncated \\uXXXX",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003200 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003201 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003202 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003203 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003204 }
3205 x = (x<<4) & ~0xF;
3206 if (c >= '0' && c <= '9')
3207 x += c - '0';
3208 else if (c >= 'a' && c <= 'f')
3209 x += 10 + c - 'a';
3210 else
3211 x += 10 + c - 'A';
3212 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00003213 if (x <= 0xffff)
3214 /* UCS-2 character */
3215 *p++ = (Py_UNICODE) x;
3216 else if (x <= 0x10ffff) {
3217 /* UCS-4 character. Either store directly, or as
3218 surrogate pair. */
3219#ifdef Py_UNICODE_WIDE
Christian Heimescc47b052008-03-25 14:56:36 +00003220 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003221#else
3222 x -= 0x10000L;
3223 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3224 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3225#endif
3226 } else {
3227 endinpos = s-starts;
3228 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003229 if (unicode_decode_call_errorhandler(
3230 errors, &errorHandler,
3231 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003232 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003233 (PyObject **)&v, &outpos, &p))
3234 goto onError;
3235 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003236 nextByte:
3237 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003238 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003239 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003240 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003241 Py_XDECREF(errorHandler);
3242 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003243 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003244
Guido van Rossumd57fd912000-03-10 22:53:23 +00003245 onError:
3246 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003247 Py_XDECREF(errorHandler);
3248 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003249 return NULL;
3250}
3251
3252PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003253 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003254{
Guido van Rossum98297ee2007-11-06 21:34:58 +00003255 PyObject *repr, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003256 char *p;
3257 char *q;
3258
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003259#ifdef Py_UNICODE_WIDE
Walter Dörwald711005d2007-05-12 12:03:26 +00003260 repr = PyBytes_FromStringAndSize(NULL, 10 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003261#else
Walter Dörwald711005d2007-05-12 12:03:26 +00003262 repr = PyBytes_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003263#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003264 if (repr == NULL)
3265 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003266 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003267 goto done;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003268
Walter Dörwald711005d2007-05-12 12:03:26 +00003269 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003270 while (size-- > 0) {
3271 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003272#ifdef Py_UNICODE_WIDE
3273 /* Map 32-bit characters to '\Uxxxxxxxx' */
3274 if (ch >= 0x10000) {
3275 *p++ = '\\';
3276 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003277 *p++ = hexdigits[(ch >> 28) & 0xf];
3278 *p++ = hexdigits[(ch >> 24) & 0xf];
3279 *p++ = hexdigits[(ch >> 20) & 0xf];
3280 *p++ = hexdigits[(ch >> 16) & 0xf];
3281 *p++ = hexdigits[(ch >> 12) & 0xf];
3282 *p++ = hexdigits[(ch >> 8) & 0xf];
3283 *p++ = hexdigits[(ch >> 4) & 0xf];
3284 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003285 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003286 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00003287#else
3288 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3289 if (ch >= 0xD800 && ch < 0xDC00) {
3290 Py_UNICODE ch2;
3291 Py_UCS4 ucs;
3292
3293 ch2 = *s++;
3294 size--;
3295 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3296 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3297 *p++ = '\\';
3298 *p++ = 'U';
3299 *p++ = hexdigits[(ucs >> 28) & 0xf];
3300 *p++ = hexdigits[(ucs >> 24) & 0xf];
3301 *p++ = hexdigits[(ucs >> 20) & 0xf];
3302 *p++ = hexdigits[(ucs >> 16) & 0xf];
3303 *p++ = hexdigits[(ucs >> 12) & 0xf];
3304 *p++ = hexdigits[(ucs >> 8) & 0xf];
3305 *p++ = hexdigits[(ucs >> 4) & 0xf];
3306 *p++ = hexdigits[ucs & 0xf];
3307 continue;
3308 }
3309 /* Fall through: isolated surrogates are copied as-is */
3310 s--;
3311 size++;
3312 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003313#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003314 /* Map 16-bit characters to '\uxxxx' */
3315 if (ch >= 256) {
3316 *p++ = '\\';
3317 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003318 *p++ = hexdigits[(ch >> 12) & 0xf];
3319 *p++ = hexdigits[(ch >> 8) & 0xf];
3320 *p++ = hexdigits[(ch >> 4) & 0xf];
3321 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003322 }
3323 /* Copy everything else as-is */
3324 else
3325 *p++ = (char) ch;
3326 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003327 size = p - q;
3328
3329 done:
3330 result = PyString_FromStringAndSize(PyBytes_AS_STRING(repr), size);
3331 Py_DECREF(repr);
3332 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003333}
3334
3335PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3336{
Walter Dörwald711005d2007-05-12 12:03:26 +00003337 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003338 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003339 PyErr_BadArgument();
3340 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003341 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003342 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3343 PyUnicode_GET_SIZE(unicode));
3344
3345 if (!s)
3346 return NULL;
3347 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
3348 PyBytes_GET_SIZE(s));
3349 Py_DECREF(s);
3350 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003351}
3352
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003353/* --- Unicode Internal Codec ------------------------------------------- */
3354
3355PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003356 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003357 const char *errors)
3358{
3359 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003360 Py_ssize_t startinpos;
3361 Py_ssize_t endinpos;
3362 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003363 PyUnicodeObject *v;
3364 Py_UNICODE *p;
3365 const char *end;
3366 const char *reason;
3367 PyObject *errorHandler = NULL;
3368 PyObject *exc = NULL;
3369
Neal Norwitzd43069c2006-01-08 01:12:10 +00003370#ifdef Py_UNICODE_WIDE
3371 Py_UNICODE unimax = PyUnicode_GetMax();
3372#endif
3373
Thomas Wouters89f507f2006-12-13 04:49:30 +00003374 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003375 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3376 if (v == NULL)
3377 goto onError;
3378 if (PyUnicode_GetSize((PyObject *)v) == 0)
3379 return (PyObject *)v;
3380 p = PyUnicode_AS_UNICODE(v);
3381 end = s + size;
3382
3383 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00003384 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003385 /* We have to sanity check the raw data, otherwise doom looms for
3386 some malformed UCS-4 data. */
3387 if (
3388 #ifdef Py_UNICODE_WIDE
3389 *p > unimax || *p < 0 ||
3390 #endif
3391 end-s < Py_UNICODE_SIZE
3392 )
3393 {
3394 startinpos = s - starts;
3395 if (end-s < Py_UNICODE_SIZE) {
3396 endinpos = end-starts;
3397 reason = "truncated input";
3398 }
3399 else {
3400 endinpos = s - starts + Py_UNICODE_SIZE;
3401 reason = "illegal code point (> 0x10FFFF)";
3402 }
3403 outpos = p - PyUnicode_AS_UNICODE(v);
3404 if (unicode_decode_call_errorhandler(
3405 errors, &errorHandler,
3406 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003407 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003408 (PyObject **)&v, &outpos, &p)) {
3409 goto onError;
3410 }
3411 }
3412 else {
3413 p++;
3414 s += Py_UNICODE_SIZE;
3415 }
3416 }
3417
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003418 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003419 goto onError;
3420 Py_XDECREF(errorHandler);
3421 Py_XDECREF(exc);
3422 return (PyObject *)v;
3423
3424 onError:
3425 Py_XDECREF(v);
3426 Py_XDECREF(errorHandler);
3427 Py_XDECREF(exc);
3428 return NULL;
3429}
3430
Guido van Rossumd57fd912000-03-10 22:53:23 +00003431/* --- Latin-1 Codec ------------------------------------------------------ */
3432
3433PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003434 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003435 const char *errors)
3436{
3437 PyUnicodeObject *v;
3438 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003439
Guido van Rossumd57fd912000-03-10 22:53:23 +00003440 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003441 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003442 Py_UNICODE r = *(unsigned char*)s;
3443 return PyUnicode_FromUnicode(&r, 1);
3444 }
3445
Guido van Rossumd57fd912000-03-10 22:53:23 +00003446 v = _PyUnicode_New(size);
3447 if (v == NULL)
3448 goto onError;
3449 if (size == 0)
3450 return (PyObject *)v;
3451 p = PyUnicode_AS_UNICODE(v);
3452 while (size-- > 0)
3453 *p++ = (unsigned char)*s++;
3454 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003455
Guido van Rossumd57fd912000-03-10 22:53:23 +00003456 onError:
3457 Py_XDECREF(v);
3458 return NULL;
3459}
3460
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003461/* create or adjust a UnicodeEncodeError */
3462static void make_encode_exception(PyObject **exceptionObject,
3463 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003464 const Py_UNICODE *unicode, Py_ssize_t size,
3465 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003466 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003467{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003468 if (*exceptionObject == NULL) {
3469 *exceptionObject = PyUnicodeEncodeError_Create(
3470 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003471 }
3472 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003473 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3474 goto onError;
3475 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3476 goto onError;
3477 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3478 goto onError;
3479 return;
3480 onError:
3481 Py_DECREF(*exceptionObject);
3482 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003483 }
3484}
3485
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003486/* raises a UnicodeEncodeError */
3487static void raise_encode_exception(PyObject **exceptionObject,
3488 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003489 const Py_UNICODE *unicode, Py_ssize_t size,
3490 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003491 const char *reason)
3492{
3493 make_encode_exception(exceptionObject,
3494 encoding, unicode, size, startpos, endpos, reason);
3495 if (*exceptionObject != NULL)
3496 PyCodec_StrictErrors(*exceptionObject);
3497}
3498
3499/* error handling callback helper:
3500 build arguments, call the callback and check the arguments,
3501 put the result into newpos and return the replacement string, which
3502 has to be freed by the caller */
3503static PyObject *unicode_encode_call_errorhandler(const char *errors,
3504 PyObject **errorHandler,
3505 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003506 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3507 Py_ssize_t startpos, Py_ssize_t endpos,
3508 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003509{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003510 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003511
3512 PyObject *restuple;
3513 PyObject *resunicode;
3514
3515 if (*errorHandler == NULL) {
3516 *errorHandler = PyCodec_LookupError(errors);
3517 if (*errorHandler == NULL)
3518 return NULL;
3519 }
3520
3521 make_encode_exception(exceptionObject,
3522 encoding, unicode, size, startpos, endpos, reason);
3523 if (*exceptionObject == NULL)
3524 return NULL;
3525
3526 restuple = PyObject_CallFunctionObjArgs(
3527 *errorHandler, *exceptionObject, NULL);
3528 if (restuple == NULL)
3529 return NULL;
3530 if (!PyTuple_Check(restuple)) {
3531 PyErr_Format(PyExc_TypeError, &argparse[4]);
3532 Py_DECREF(restuple);
3533 return NULL;
3534 }
3535 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3536 &resunicode, newpos)) {
3537 Py_DECREF(restuple);
3538 return NULL;
3539 }
3540 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003541 *newpos = size+*newpos;
3542 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003543 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003544 Py_DECREF(restuple);
3545 return NULL;
3546 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003547 Py_INCREF(resunicode);
3548 Py_DECREF(restuple);
3549 return resunicode;
3550}
3551
3552static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003553 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003554 const char *errors,
3555 int limit)
3556{
3557 /* output object */
3558 PyObject *res;
3559 /* pointers to the beginning and end+1 of input */
3560 const Py_UNICODE *startp = p;
3561 const Py_UNICODE *endp = p + size;
3562 /* pointer to the beginning of the unencodable characters */
3563 /* const Py_UNICODE *badp = NULL; */
3564 /* pointer into the output */
3565 char *str;
3566 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003567 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003568 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3569 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003570 PyObject *errorHandler = NULL;
3571 PyObject *exc = NULL;
Guido van Rossum98297ee2007-11-06 21:34:58 +00003572 PyObject *result = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003573 /* the following variable is used for caching string comparisons
3574 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3575 int known_errorHandler = -1;
3576
3577 /* allocate enough for a simple encoding without
3578 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00003579 if (size == 0)
3580 return PyString_FromStringAndSize(NULL, 0);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003581 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003582 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003583 return NULL;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003584 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003585 ressize = size;
3586
3587 while (p<endp) {
3588 Py_UNICODE c = *p;
3589
3590 /* can we encode this? */
3591 if (c<limit) {
3592 /* no overflow check, because we know that the space is enough */
3593 *str++ = (char)c;
3594 ++p;
3595 }
3596 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003597 Py_ssize_t unicodepos = p-startp;
3598 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003599 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003600 Py_ssize_t repsize;
3601 Py_ssize_t newpos;
3602 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003603 Py_UNICODE *uni2;
3604 /* startpos for collecting unencodable chars */
3605 const Py_UNICODE *collstart = p;
3606 const Py_UNICODE *collend = p;
3607 /* find all unecodable characters */
3608 while ((collend < endp) && ((*collend)>=limit))
3609 ++collend;
3610 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3611 if (known_errorHandler==-1) {
3612 if ((errors==NULL) || (!strcmp(errors, "strict")))
3613 known_errorHandler = 1;
3614 else if (!strcmp(errors, "replace"))
3615 known_errorHandler = 2;
3616 else if (!strcmp(errors, "ignore"))
3617 known_errorHandler = 3;
3618 else if (!strcmp(errors, "xmlcharrefreplace"))
3619 known_errorHandler = 4;
3620 else
3621 known_errorHandler = 0;
3622 }
3623 switch (known_errorHandler) {
3624 case 1: /* strict */
3625 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3626 goto onError;
3627 case 2: /* replace */
3628 while (collstart++<collend)
3629 *str++ = '?'; /* fall through */
3630 case 3: /* ignore */
3631 p = collend;
3632 break;
3633 case 4: /* xmlcharrefreplace */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003634 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003635 /* determine replacement size (temporarily (mis)uses p) */
3636 for (p = collstart, repsize = 0; p < collend; ++p) {
3637 if (*p<10)
3638 repsize += 2+1+1;
3639 else if (*p<100)
3640 repsize += 2+2+1;
3641 else if (*p<1000)
3642 repsize += 2+3+1;
3643 else if (*p<10000)
3644 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003645#ifndef Py_UNICODE_WIDE
3646 else
3647 repsize += 2+5+1;
3648#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003649 else if (*p<100000)
3650 repsize += 2+5+1;
3651 else if (*p<1000000)
3652 repsize += 2+6+1;
3653 else
3654 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003655#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003656 }
3657 requiredsize = respos+repsize+(endp-collend);
3658 if (requiredsize > ressize) {
3659 if (requiredsize<2*ressize)
3660 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003661 if (PyBytes_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003662 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003663 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003664 ressize = requiredsize;
3665 }
3666 /* generate replacement (temporarily (mis)uses p) */
3667 for (p = collstart; p < collend; ++p) {
3668 str += sprintf(str, "&#%d;", (int)*p);
3669 }
3670 p = collend;
3671 break;
3672 default:
3673 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3674 encoding, reason, startp, size, &exc,
3675 collstart-startp, collend-startp, &newpos);
3676 if (repunicode == NULL)
3677 goto onError;
3678 /* need more space? (at least enough for what we
3679 have+the replacement+the rest of the string, so
3680 we won't have to check space for encodable characters) */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003681 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003682 repsize = PyUnicode_GET_SIZE(repunicode);
3683 requiredsize = respos+repsize+(endp-collend);
3684 if (requiredsize > ressize) {
3685 if (requiredsize<2*ressize)
3686 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003687 if (PyBytes_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003688 Py_DECREF(repunicode);
3689 goto onError;
3690 }
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003691 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003692 ressize = requiredsize;
3693 }
3694 /* check if there is anything unencodable in the replacement
3695 and copy it to the output */
3696 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3697 c = *uni2;
3698 if (c >= limit) {
3699 raise_encode_exception(&exc, encoding, startp, size,
3700 unicodepos, unicodepos+1, reason);
3701 Py_DECREF(repunicode);
3702 goto onError;
3703 }
3704 *str = (char)c;
3705 }
3706 p = startp + newpos;
3707 Py_DECREF(repunicode);
3708 }
3709 }
3710 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003711 result = PyString_FromStringAndSize(PyBytes_AS_STRING(res),
3712 str - PyBytes_AS_STRING(res));
3713 onError:
3714 Py_DECREF(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003715 Py_XDECREF(errorHandler);
3716 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003717 return result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003718}
3719
Guido van Rossumd57fd912000-03-10 22:53:23 +00003720PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003721 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003722 const char *errors)
3723{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003724 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003725}
3726
3727PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3728{
3729 if (!PyUnicode_Check(unicode)) {
3730 PyErr_BadArgument();
3731 return NULL;
3732 }
3733 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3734 PyUnicode_GET_SIZE(unicode),
3735 NULL);
3736}
3737
3738/* --- 7-bit ASCII Codec -------------------------------------------------- */
3739
Guido van Rossumd57fd912000-03-10 22:53:23 +00003740PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003741 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003742 const char *errors)
3743{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003744 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003745 PyUnicodeObject *v;
3746 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003747 Py_ssize_t startinpos;
3748 Py_ssize_t endinpos;
3749 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003750 const char *e;
3751 PyObject *errorHandler = NULL;
3752 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003753
Guido van Rossumd57fd912000-03-10 22:53:23 +00003754 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003755 if (size == 1 && *(unsigned char*)s < 128) {
3756 Py_UNICODE r = *(unsigned char*)s;
3757 return PyUnicode_FromUnicode(&r, 1);
3758 }
Tim Petersced69f82003-09-16 20:30:58 +00003759
Guido van Rossumd57fd912000-03-10 22:53:23 +00003760 v = _PyUnicode_New(size);
3761 if (v == NULL)
3762 goto onError;
3763 if (size == 0)
3764 return (PyObject *)v;
3765 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003766 e = s + size;
3767 while (s < e) {
3768 register unsigned char c = (unsigned char)*s;
3769 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003770 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003771 ++s;
3772 }
3773 else {
3774 startinpos = s-starts;
3775 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003776 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003777 if (unicode_decode_call_errorhandler(
3778 errors, &errorHandler,
3779 "ascii", "ordinal not in range(128)",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003780 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003781 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003782 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003783 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003784 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00003785 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003786 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003787 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003788 Py_XDECREF(errorHandler);
3789 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003790 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003791
Guido van Rossumd57fd912000-03-10 22:53:23 +00003792 onError:
3793 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003794 Py_XDECREF(errorHandler);
3795 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003796 return NULL;
3797}
3798
/* Encode the Py_UNICODE buffer p (size code units) as ASCII.

   This is a thin wrapper: all work is done by unicode_encode_ucs1(), which
   is called with 128 as its last argument.  The errors string is passed
   through unchanged.  Returns whatever unicode_encode_ucs1() returns. */
PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
                                Py_ssize_t size,
                                const char *errors)
{
    return unicode_encode_ucs1(p, size, errors, 128);
}
3805
3806PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3807{
3808 if (!PyUnicode_Check(unicode)) {
3809 PyErr_BadArgument();
3810 return NULL;
3811 }
3812 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3813 PyUnicode_GET_SIZE(unicode),
3814 NULL);
3815}
3816
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003817#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003818
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003819/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003820
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003821#if SIZEOF_INT < SIZEOF_SSIZE_T
3822#define NEED_RETRY
3823#endif
3824
3825/* XXX This code is limited to "true" double-byte encodings, as
3826 a) it assumes an incomplete character consists of a single byte, and
3827 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3828 encodings, see IsDBCSLeadByteEx documentation. */
3829
/* Return non-zero if the byte at s[offset] should be treated as a DBCS lead
   byte, 0 otherwise.

   IsDBCSLeadByte() alone only looks at the byte value, so the previous
   character boundary (as reported by CharPrev()) is consulted as well: the
   byte counts as a lead byte when it is at the start of the string, when
   the previous character does not start with a lead byte, or when the
   previous character is exactly two bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
}
3840
3841/*
3842 * Decode MBCS string into unicode object. If 'final' is set, converts
3843 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3844 */
3845static int decode_mbcs(PyUnicodeObject **v,
3846 const char *s, /* MBCS string */
3847 int size, /* sizeof MBCS string */
3848 int final)
3849{
3850 Py_UNICODE *p;
3851 Py_ssize_t n = 0;
3852 int usize = 0;
3853
3854 assert(size >= 0);
3855
3856 /* Skip trailing lead-byte unless 'final' is set */
3857 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3858 --size;
3859
3860 /* First get the size of the result */
3861 if (size > 0) {
3862 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3863 if (usize == 0) {
3864 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3865 return -1;
3866 }
3867 }
3868
3869 if (*v == NULL) {
3870 /* Create unicode object */
3871 *v = _PyUnicode_New(usize);
3872 if (*v == NULL)
3873 return -1;
3874 }
3875 else {
3876 /* Extend unicode object */
3877 n = PyUnicode_GET_SIZE(*v);
3878 if (_PyUnicode_Resize(v, n + usize) < 0)
3879 return -1;
3880 }
3881
3882 /* Do the conversion */
3883 if (size > 0) {
3884 p = PyUnicode_AS_UNICODE(*v) + n;
3885 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3886 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3887 return -1;
3888 }
3889 }
3890
3891 return size;
3892}
3893
3894PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3895 Py_ssize_t size,
3896 const char *errors,
3897 Py_ssize_t *consumed)
3898{
3899 PyUnicodeObject *v = NULL;
3900 int done;
3901
3902 if (consumed)
3903 *consumed = 0;
3904
3905#ifdef NEED_RETRY
3906 retry:
3907 if (size > INT_MAX)
3908 done = decode_mbcs(&v, s, INT_MAX, 0);
3909 else
3910#endif
3911 done = decode_mbcs(&v, s, (int)size, !consumed);
3912
3913 if (done < 0) {
3914 Py_XDECREF(v);
3915 return NULL;
3916 }
3917
3918 if (consumed)
3919 *consumed += done;
3920
3921#ifdef NEED_RETRY
3922 if (size > INT_MAX) {
3923 s += done;
3924 size -= done;
3925 goto retry;
3926 }
3927#endif
3928
3929 return (PyObject *)v;
3930}
3931
/* Decode size bytes of MBCS data at s into a unicode object.

   Thin wrapper around PyUnicode_DecodeMBCSStateful() with consumed == NULL,
   so the complete input is decoded in one go. */
PyObject *PyUnicode_DecodeMBCS(const char *s,
                               Py_ssize_t size,
                               const char *errors)
{
    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
}
3938
3939/*
3940 * Convert unicode into string object (MBCS).
3941 * Returns 0 if succeed, -1 otherwise.
3942 */
3943static int encode_mbcs(PyObject **repr,
3944 const Py_UNICODE *p, /* unicode */
3945 int size) /* size of unicode */
3946{
3947 int mbcssize = 0;
3948 Py_ssize_t n = 0;
3949
3950 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003951
3952 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003953 if (size > 0) {
3954 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3955 if (mbcssize == 0) {
3956 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3957 return -1;
3958 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003959 }
3960
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003961 if (*repr == NULL) {
3962 /* Create string object */
Guido van Rossum98297ee2007-11-06 21:34:58 +00003963 *repr = PyString_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003964 if (*repr == NULL)
3965 return -1;
3966 }
3967 else {
3968 /* Extend string object */
Guido van Rossum98297ee2007-11-06 21:34:58 +00003969 n = PyString_Size(*repr);
3970 if (_PyString_Resize(repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003971 return -1;
3972 }
3973
3974 /* Do the conversion */
3975 if (size > 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00003976 char *s = PyString_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003977 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3978 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3979 return -1;
3980 }
3981 }
3982
3983 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003984}
3985
3986PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003987 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003988 const char *errors)
3989{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003990 PyObject *repr = NULL;
3991 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003992
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003993#ifdef NEED_RETRY
3994 retry:
3995 if (size > INT_MAX)
3996 ret = encode_mbcs(&repr, p, INT_MAX);
3997 else
3998#endif
3999 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004000
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004001 if (ret < 0) {
4002 Py_XDECREF(repr);
4003 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004004 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004005
4006#ifdef NEED_RETRY
4007 if (size > INT_MAX) {
4008 p += INT_MAX;
4009 size -= INT_MAX;
4010 goto retry;
4011 }
4012#endif
4013
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004014 return repr;
4015}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004016
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004017PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4018{
4019 if (!PyUnicode_Check(unicode)) {
4020 PyErr_BadArgument();
4021 return NULL;
4022 }
4023 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
4024 PyUnicode_GET_SIZE(unicode),
4025 NULL);
4026}
4027
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004028#undef NEED_RETRY
4029
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004030#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004031
Guido van Rossumd57fd912000-03-10 22:53:23 +00004032/* --- Character Mapping Codec -------------------------------------------- */
4033
Guido van Rossumd57fd912000-03-10 22:53:23 +00004034PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004035 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004036 PyObject *mapping,
4037 const char *errors)
4038{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004039 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004040 Py_ssize_t startinpos;
4041 Py_ssize_t endinpos;
4042 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004043 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004044 PyUnicodeObject *v;
4045 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004046 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004047 PyObject *errorHandler = NULL;
4048 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004049 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004050 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004051
Guido van Rossumd57fd912000-03-10 22:53:23 +00004052 /* Default to Latin-1 */
4053 if (mapping == NULL)
4054 return PyUnicode_DecodeLatin1(s, size, errors);
4055
4056 v = _PyUnicode_New(size);
4057 if (v == NULL)
4058 goto onError;
4059 if (size == 0)
4060 return (PyObject *)v;
4061 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004062 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004063 if (PyUnicode_CheckExact(mapping)) {
4064 mapstring = PyUnicode_AS_UNICODE(mapping);
4065 maplen = PyUnicode_GET_SIZE(mapping);
4066 while (s < e) {
4067 unsigned char ch = *s;
4068 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004069
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004070 if (ch < maplen)
4071 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004072
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004073 if (x == 0xfffe) {
4074 /* undefined mapping */
4075 outpos = p-PyUnicode_AS_UNICODE(v);
4076 startinpos = s-starts;
4077 endinpos = startinpos+1;
4078 if (unicode_decode_call_errorhandler(
4079 errors, &errorHandler,
4080 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00004081 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004082 (PyObject **)&v, &outpos, &p)) {
4083 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00004084 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004085 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00004086 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004087 *p++ = x;
4088 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004089 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004090 }
4091 else {
4092 while (s < e) {
4093 unsigned char ch = *s;
4094 PyObject *w, *x;
4095
4096 /* Get mapping (char ordinal -> integer, Unicode char or None) */
Christian Heimes217cfd12007-12-02 14:31:20 +00004097 w = PyLong_FromLong((long)ch);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004098 if (w == NULL)
4099 goto onError;
4100 x = PyObject_GetItem(mapping, w);
4101 Py_DECREF(w);
4102 if (x == NULL) {
4103 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4104 /* No mapping found means: mapping is undefined. */
4105 PyErr_Clear();
4106 x = Py_None;
4107 Py_INCREF(x);
4108 } else
4109 goto onError;
4110 }
4111
4112 /* Apply mapping */
Christian Heimes217cfd12007-12-02 14:31:20 +00004113 if (PyLong_Check(x)) {
4114 long value = PyLong_AS_LONG(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004115 if (value < 0 || value > 65535) {
4116 PyErr_SetString(PyExc_TypeError,
4117 "character mapping must be in range(65536)");
4118 Py_DECREF(x);
4119 goto onError;
4120 }
4121 *p++ = (Py_UNICODE)value;
4122 }
4123 else if (x == Py_None) {
4124 /* undefined mapping */
4125 outpos = p-PyUnicode_AS_UNICODE(v);
4126 startinpos = s-starts;
4127 endinpos = startinpos+1;
4128 if (unicode_decode_call_errorhandler(
4129 errors, &errorHandler,
4130 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00004131 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004132 (PyObject **)&v, &outpos, &p)) {
4133 Py_DECREF(x);
4134 goto onError;
4135 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00004136 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004137 continue;
4138 }
4139 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004140 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004141
4142 if (targetsize == 1)
4143 /* 1-1 mapping */
4144 *p++ = *PyUnicode_AS_UNICODE(x);
4145
4146 else if (targetsize > 1) {
4147 /* 1-n mapping */
4148 if (targetsize > extrachars) {
4149 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004150 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4151 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004152 (targetsize << 2);
4153 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00004154 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004155 if (_PyUnicode_Resize(&v,
4156 PyUnicode_GET_SIZE(v) + needed) < 0) {
4157 Py_DECREF(x);
4158 goto onError;
4159 }
4160 p = PyUnicode_AS_UNICODE(v) + oldpos;
4161 }
4162 Py_UNICODE_COPY(p,
4163 PyUnicode_AS_UNICODE(x),
4164 targetsize);
4165 p += targetsize;
4166 extrachars -= targetsize;
4167 }
4168 /* 1-0 mapping: skip the character */
4169 }
4170 else {
4171 /* wrong return value */
4172 PyErr_SetString(PyExc_TypeError,
4173 "character mapping must return integer, None or unicode");
4174 Py_DECREF(x);
4175 goto onError;
4176 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004177 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004178 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004179 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004180 }
4181 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004182 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004183 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004184 Py_XDECREF(errorHandler);
4185 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004186 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004187
Guido van Rossumd57fd912000-03-10 22:53:23 +00004188 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004189 Py_XDECREF(errorHandler);
4190 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004191 Py_XDECREF(v);
4192 return NULL;
4193}
4194
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004195/* Charmap encoding: the lookup table */
4196
/* Three-level lookup table used for charmap encoding.

   A BMP code point c is split into c>>11 (level 1), (c>>7)&0xF (level 2)
   and c&0x7F (level 3).  Objects of this struct are allocated with extra
   trailing space so that level23 can hold both lower levels. */
struct encoding_map{
    PyObject_HEAD
    /* indexed by c>>11; 0xFF means "no level-2 block" */
    unsigned char level1[32];
    /* number of 16-byte level-2 blocks and 128-byte level-3 blocks */
    int count2, count3;
    /* 16*count2 bytes of level-2 data (0xFF = no level-3 block) followed by
       128*count3 bytes of level-3 data (0 = unmapped); declared with one
       element, the real storage is over-allocated by the creator */
    unsigned char level23[1];
};
4203
4204static PyObject*
4205encoding_map_size(PyObject *obj, PyObject* args)
4206{
4207 struct encoding_map *map = (struct encoding_map*)obj;
Christian Heimes217cfd12007-12-02 14:31:20 +00004208 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004209 128*map->count3);
4210}
4211
/* Method table for EncodingMap objects: a single no-argument method
   "size", terminated by a zeroed sentinel entry. */
static PyMethodDef encoding_map_methods[] = {
    {"size", encoding_map_size, METH_NOARGS,
     PyDoc_STR("Return the size (in bytes) of this object") },
    { 0 }
};
4217
/* Deallocator for EncodingMap objects.  The object holds no references to
   other objects, so releasing its own memory block is all that is done. */
static void
encoding_map_dealloc(PyObject* o)
{
    PyObject_FREE(o);
}
4223
/* Type object for the charmap encoding lookup table (struct encoding_map).

   Only tp_dealloc and tp_methods are filled in; every other slot, including
   tp_new, is 0.  tp_itemsize is 0 even though instances carry a
   variable-length tail: PyUnicode_BuildEncodingMap() sizes and allocates
   them by hand. */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,   /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_compare*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};
4267
4268PyObject*
4269PyUnicode_BuildEncodingMap(PyObject* string)
4270{
4271 Py_UNICODE *decode;
4272 PyObject *result;
4273 struct encoding_map *mresult;
4274 int i;
4275 int need_dict = 0;
4276 unsigned char level1[32];
4277 unsigned char level2[512];
4278 unsigned char *mlevel1, *mlevel2, *mlevel3;
4279 int count2 = 0, count3 = 0;
4280
4281 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4282 PyErr_BadArgument();
4283 return NULL;
4284 }
4285 decode = PyUnicode_AS_UNICODE(string);
4286 memset(level1, 0xFF, sizeof level1);
4287 memset(level2, 0xFF, sizeof level2);
4288
4289 /* If there isn't a one-to-one mapping of NULL to \0,
4290 or if there are non-BMP characters, we need to use
4291 a mapping dictionary. */
4292 if (decode[0] != 0)
4293 need_dict = 1;
4294 for (i = 1; i < 256; i++) {
4295 int l1, l2;
4296 if (decode[i] == 0
4297 #ifdef Py_UNICODE_WIDE
4298 || decode[i] > 0xFFFF
4299 #endif
4300 ) {
4301 need_dict = 1;
4302 break;
4303 }
4304 if (decode[i] == 0xFFFE)
4305 /* unmapped character */
4306 continue;
4307 l1 = decode[i] >> 11;
4308 l2 = decode[i] >> 7;
4309 if (level1[l1] == 0xFF)
4310 level1[l1] = count2++;
4311 if (level2[l2] == 0xFF)
4312 level2[l2] = count3++;
4313 }
4314
4315 if (count2 >= 0xFF || count3 >= 0xFF)
4316 need_dict = 1;
4317
4318 if (need_dict) {
4319 PyObject *result = PyDict_New();
4320 PyObject *key, *value;
4321 if (!result)
4322 return NULL;
4323 for (i = 0; i < 256; i++) {
4324 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00004325 key = PyLong_FromLong(decode[i]);
4326 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004327 if (!key || !value)
4328 goto failed1;
4329 if (PyDict_SetItem(result, key, value) == -1)
4330 goto failed1;
4331 Py_DECREF(key);
4332 Py_DECREF(value);
4333 }
4334 return result;
4335 failed1:
4336 Py_XDECREF(key);
4337 Py_XDECREF(value);
4338 Py_DECREF(result);
4339 return NULL;
4340 }
4341
4342 /* Create a three-level trie */
4343 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4344 16*count2 + 128*count3 - 1);
4345 if (!result)
4346 return PyErr_NoMemory();
4347 PyObject_Init(result, &EncodingMapType);
4348 mresult = (struct encoding_map*)result;
4349 mresult->count2 = count2;
4350 mresult->count3 = count3;
4351 mlevel1 = mresult->level1;
4352 mlevel2 = mresult->level23;
4353 mlevel3 = mresult->level23 + 16*count2;
4354 memcpy(mlevel1, level1, 32);
4355 memset(mlevel2, 0xFF, 16*count2);
4356 memset(mlevel3, 0, 128*count3);
4357 count3 = 0;
4358 for (i = 1; i < 256; i++) {
4359 int o1, o2, o3, i2, i3;
4360 if (decode[i] == 0xFFFE)
4361 /* unmapped character */
4362 continue;
4363 o1 = decode[i]>>11;
4364 o2 = (decode[i]>>7) & 0xF;
4365 i2 = 16*mlevel1[o1] + o2;
4366 if (mlevel2[i2] == 0xFF)
4367 mlevel2[i2] = count3++;
4368 o3 = decode[i] & 0x7F;
4369 i3 = 128*mlevel2[i2] + o3;
4370 mlevel3[i3] = i;
4371 }
4372 return result;
4373}
4374
4375static int
4376encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4377{
4378 struct encoding_map *map = (struct encoding_map*)mapping;
4379 int l1 = c>>11;
4380 int l2 = (c>>7) & 0xF;
4381 int l3 = c & 0x7F;
4382 int i;
4383
4384#ifdef Py_UNICODE_WIDE
4385 if (c > 0xFFFF) {
4386 return -1;
4387 }
4388#endif
4389 if (c == 0)
4390 return 0;
4391 /* level 1*/
4392 i = map->level1[l1];
4393 if (i == 0xFF) {
4394 return -1;
4395 }
4396 /* level 2*/
4397 i = map->level23[16*i+l2];
4398 if (i == 0xFF) {
4399 return -1;
4400 }
4401 /* level 3 */
4402 i = map->level23[16*map->count2 + 128*i + l3];
4403 if (i == 0) {
4404 return -1;
4405 }
4406 return i;
4407}
4408
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004409/* Lookup the character ch in the mapping. If the character
4410 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004411 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004412static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004413{
Christian Heimes217cfd12007-12-02 14:31:20 +00004414 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004415 PyObject *x;
4416
4417 if (w == NULL)
4418 return NULL;
4419 x = PyObject_GetItem(mapping, w);
4420 Py_DECREF(w);
4421 if (x == NULL) {
4422 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4423 /* No mapping found means: mapping is undefined. */
4424 PyErr_Clear();
4425 x = Py_None;
4426 Py_INCREF(x);
4427 return x;
4428 } else
4429 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004430 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004431 else if (x == Py_None)
4432 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00004433 else if (PyLong_Check(x)) {
4434 long value = PyLong_AS_LONG(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004435 if (value < 0 || value > 255) {
4436 PyErr_SetString(PyExc_TypeError,
4437 "character mapping must be in range(256)");
4438 Py_DECREF(x);
4439 return NULL;
4440 }
4441 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004442 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004443 else if (PyString_Check(x))
4444 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004445 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004446 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00004447 PyErr_Format(PyExc_TypeError,
Christian Heimesf3863112007-11-22 07:46:41 +00004448 "character mapping must return integer, bytes or None, not %.400s",
Walter Dörwald580ceed2007-05-09 10:39:19 +00004449 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004450 Py_DECREF(x);
4451 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004452 }
4453}
4454
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004455static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00004456charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004457{
Guido van Rossum98297ee2007-11-06 21:34:58 +00004458 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004459 /* exponentially overallocate to minimize reallocations */
4460 if (requiredsize < 2*outsize)
4461 requiredsize = 2*outsize;
Guido van Rossum98297ee2007-11-06 21:34:58 +00004462 if (_PyString_Resize(outobj, requiredsize))
Walter Dörwald827b0552007-05-12 13:23:53 +00004463 return -1;
Walter Dörwald827b0552007-05-12 13:23:53 +00004464 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004465}
4466
/* Outcome of charmapencode_output():
     enc_SUCCESS    the character was encoded and written to the output;
     enc_FAILED     the mapping is undefined for the character, nothing was
                    written and no exception was set by the lookup;
     enc_EXCEPTION  the lookup or the output resize failed. */
typedef enum charmapencode_result {
    enc_SUCCESS, enc_FAILED, enc_EXCEPTION
}charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004470/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00004471 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004472 space is available. Return a new reference to the object that
4473 was put in the output buffer, or Py_None, if the mapping was undefined
4474 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004475 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004476static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004477charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004478 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004479{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004480 PyObject *rep;
4481 char *outstart;
Guido van Rossum98297ee2007-11-06 21:34:58 +00004482 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004483
Christian Heimes90aa7642007-12-19 02:45:37 +00004484 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004485 int res = encoding_map_lookup(c, mapping);
4486 Py_ssize_t requiredsize = *outpos+1;
4487 if (res == -1)
4488 return enc_FAILED;
Guido van Rossum98297ee2007-11-06 21:34:58 +00004489 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004490 if (charmapencode_resize(outobj, outpos, requiredsize))
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004491 return enc_EXCEPTION;
Guido van Rossum98297ee2007-11-06 21:34:58 +00004492 outstart = PyString_AS_STRING(*outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004493 outstart[(*outpos)++] = (char)res;
4494 return enc_SUCCESS;
4495 }
4496
4497 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004498 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004499 return enc_EXCEPTION;
4500 else if (rep==Py_None) {
4501 Py_DECREF(rep);
4502 return enc_FAILED;
4503 } else {
Christian Heimes217cfd12007-12-02 14:31:20 +00004504 if (PyLong_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004505 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004506 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004507 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004508 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004509 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004510 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004511 outstart = PyString_AS_STRING(*outobj);
Christian Heimes217cfd12007-12-02 14:31:20 +00004512 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004513 }
4514 else {
4515 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004516 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4517 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004518 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004519 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004520 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004521 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004522 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004523 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004524 memcpy(outstart + *outpos, repchars, repsize);
4525 *outpos += repsize;
4526 }
4527 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004528 Py_DECREF(rep);
4529 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004530}
4531
4532/* handle an error in PyUnicode_EncodeCharmap
4533 Return 0 on success, -1 on error */
4534static
4535int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004536 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004537 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004538 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004539 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004540{
4541 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004542 Py_ssize_t repsize;
4543 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004544 Py_UNICODE *uni2;
4545 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004546 Py_ssize_t collstartpos = *inpos;
4547 Py_ssize_t collendpos = *inpos+1;
4548 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004549 char *encoding = "charmap";
4550 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004551 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004552
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004553 /* find all unencodable characters */
4554 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004555 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00004556 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004557 int res = encoding_map_lookup(p[collendpos], mapping);
4558 if (res != -1)
4559 break;
4560 ++collendpos;
4561 continue;
4562 }
4563
4564 rep = charmapencode_lookup(p[collendpos], mapping);
4565 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004566 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004567 else if (rep!=Py_None) {
4568 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004569 break;
4570 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004571 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004572 ++collendpos;
4573 }
4574 /* cache callback name lookup
4575 * (if not done yet, i.e. it's the first error) */
4576 if (*known_errorHandler==-1) {
4577 if ((errors==NULL) || (!strcmp(errors, "strict")))
4578 *known_errorHandler = 1;
4579 else if (!strcmp(errors, "replace"))
4580 *known_errorHandler = 2;
4581 else if (!strcmp(errors, "ignore"))
4582 *known_errorHandler = 3;
4583 else if (!strcmp(errors, "xmlcharrefreplace"))
4584 *known_errorHandler = 4;
4585 else
4586 *known_errorHandler = 0;
4587 }
4588 switch (*known_errorHandler) {
4589 case 1: /* strict */
4590 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4591 return -1;
4592 case 2: /* replace */
4593 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4594 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004595 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004596 return -1;
4597 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004598 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004599 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4600 return -1;
4601 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004602 }
4603 /* fall through */
4604 case 3: /* ignore */
4605 *inpos = collendpos;
4606 break;
4607 case 4: /* xmlcharrefreplace */
4608 /* generate replacement (temporarily (mis)uses p) */
4609 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4610 char buffer[2+29+1+1];
4611 char *cp;
4612 sprintf(buffer, "&#%d;", (int)p[collpos]);
4613 for (cp = buffer; *cp; ++cp) {
4614 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004615 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004616 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004617 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004618 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4619 return -1;
4620 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004621 }
4622 }
4623 *inpos = collendpos;
4624 break;
4625 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004626 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004627 encoding, reason, p, size, exceptionObject,
4628 collstartpos, collendpos, &newpos);
4629 if (repunicode == NULL)
4630 return -1;
4631 /* generate replacement */
4632 repsize = PyUnicode_GET_SIZE(repunicode);
4633 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4634 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004635 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004636 return -1;
4637 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004638 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004639 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004640 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4641 return -1;
4642 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004643 }
4644 *inpos = newpos;
4645 Py_DECREF(repunicode);
4646 }
4647 return 0;
4648}
4649
Guido van Rossumd57fd912000-03-10 22:53:23 +00004650PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004651 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004652 PyObject *mapping,
4653 const char *errors)
4654{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004655 /* output object */
4656 PyObject *res = NULL;
4657 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004658 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004659 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004660 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004661 PyObject *errorHandler = NULL;
4662 PyObject *exc = NULL;
4663 /* the following variable is used for caching string comparisons
4664 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4665 * 3=ignore, 4=xmlcharrefreplace */
4666 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004667
4668 /* Default to Latin-1 */
4669 if (mapping == NULL)
4670 return PyUnicode_EncodeLatin1(p, size, errors);
4671
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004672 /* allocate enough for a simple encoding without
4673 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004674 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004675 if (res == NULL)
4676 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004677 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004678 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004679
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004680 while (inpos<size) {
4681 /* try to encode it */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004682 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004683 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004684 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004685 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004686 if (charmap_encoding_error(p, size, &inpos, mapping,
4687 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004688 &known_errorHandler, &errorHandler, errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004689 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004690 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004691 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004692 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004693 else
4694 /* done with this character => adjust input position */
4695 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004696 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004697
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004698 /* Resize if we allocated to much */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004699 if (respos<PyString_GET_SIZE(res))
4700 _PyString_Resize(&res, respos);
4701
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004702 Py_XDECREF(exc);
4703 Py_XDECREF(errorHandler);
4704 return res;
4705
4706 onError:
4707 Py_XDECREF(res);
4708 Py_XDECREF(exc);
4709 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004710 return NULL;
4711}
4712
4713PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4714 PyObject *mapping)
4715{
4716 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4717 PyErr_BadArgument();
4718 return NULL;
4719 }
4720 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4721 PyUnicode_GET_SIZE(unicode),
4722 mapping,
4723 NULL);
4724}
4725
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004726/* create or adjust a UnicodeTranslateError */
4727static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004728 const Py_UNICODE *unicode, Py_ssize_t size,
4729 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004730 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004731{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004732 if (*exceptionObject == NULL) {
4733 *exceptionObject = PyUnicodeTranslateError_Create(
4734 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004735 }
4736 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004737 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4738 goto onError;
4739 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4740 goto onError;
4741 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4742 goto onError;
4743 return;
4744 onError:
4745 Py_DECREF(*exceptionObject);
4746 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004747 }
4748}
4749
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004750/* raises a UnicodeTranslateError */
4751static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004752 const Py_UNICODE *unicode, Py_ssize_t size,
4753 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004754 const char *reason)
4755{
4756 make_translate_exception(exceptionObject,
4757 unicode, size, startpos, endpos, reason);
4758 if (*exceptionObject != NULL)
4759 PyCodec_StrictErrors(*exceptionObject);
4760}
4761
4762/* error handling callback helper:
4763 build arguments, call the callback and check the arguments,
4764 put the result into newpos and return the replacement string, which
4765 has to be freed by the caller */
4766static PyObject *unicode_translate_call_errorhandler(const char *errors,
4767 PyObject **errorHandler,
4768 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004769 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4770 Py_ssize_t startpos, Py_ssize_t endpos,
4771 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004772{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004773 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004774
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004775 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004776 PyObject *restuple;
4777 PyObject *resunicode;
4778
4779 if (*errorHandler == NULL) {
4780 *errorHandler = PyCodec_LookupError(errors);
4781 if (*errorHandler == NULL)
4782 return NULL;
4783 }
4784
4785 make_translate_exception(exceptionObject,
4786 unicode, size, startpos, endpos, reason);
4787 if (*exceptionObject == NULL)
4788 return NULL;
4789
4790 restuple = PyObject_CallFunctionObjArgs(
4791 *errorHandler, *exceptionObject, NULL);
4792 if (restuple == NULL)
4793 return NULL;
4794 if (!PyTuple_Check(restuple)) {
4795 PyErr_Format(PyExc_TypeError, &argparse[4]);
4796 Py_DECREF(restuple);
4797 return NULL;
4798 }
4799 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004800 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004801 Py_DECREF(restuple);
4802 return NULL;
4803 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004804 if (i_newpos<0)
4805 *newpos = size+i_newpos;
4806 else
4807 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004808 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004809 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004810 Py_DECREF(restuple);
4811 return NULL;
4812 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004813 Py_INCREF(resunicode);
4814 Py_DECREF(restuple);
4815 return resunicode;
4816}
4817
4818/* Lookup the character ch in the mapping and put the result in result,
4819 which must be decrefed by the caller.
4820 Return 0 on success, -1 on error */
4821static
4822int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4823{
Christian Heimes217cfd12007-12-02 14:31:20 +00004824 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004825 PyObject *x;
4826
4827 if (w == NULL)
4828 return -1;
4829 x = PyObject_GetItem(mapping, w);
4830 Py_DECREF(w);
4831 if (x == NULL) {
4832 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4833 /* No mapping found means: use 1:1 mapping. */
4834 PyErr_Clear();
4835 *result = NULL;
4836 return 0;
4837 } else
4838 return -1;
4839 }
4840 else if (x == Py_None) {
4841 *result = x;
4842 return 0;
4843 }
Christian Heimes217cfd12007-12-02 14:31:20 +00004844 else if (PyLong_Check(x)) {
4845 long value = PyLong_AS_LONG(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004846 long max = PyUnicode_GetMax();
4847 if (value < 0 || value > max) {
4848 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00004849 "character mapping must be in range(0x%x)", max+1);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004850 Py_DECREF(x);
4851 return -1;
4852 }
4853 *result = x;
4854 return 0;
4855 }
4856 else if (PyUnicode_Check(x)) {
4857 *result = x;
4858 return 0;
4859 }
4860 else {
4861 /* wrong return value */
4862 PyErr_SetString(PyExc_TypeError,
4863 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004864 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004865 return -1;
4866 }
4867}
4868/* ensure that *outobj is at least requiredsize characters long,
4869if not reallocate and adjust various state variables.
4870Return 0 on success, -1 on error */
4871static
Walter Dörwald4894c302003-10-24 14:25:28 +00004872int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004873 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004874{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004875 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004876 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004877 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004878 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004879 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004880 if (requiredsize < 2 * oldsize)
4881 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004882 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004883 return -1;
4884 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004885 }
4886 return 0;
4887}
4888/* lookup the character, put the result in the output string and adjust
4889 various state variables. Return a new reference to the object that
4890 was put in the output buffer in *result, or Py_None, if the mapping was
4891 undefined (in which case no character was written).
4892 The called must decref result.
4893 Return 0 on success, -1 on error. */
4894static
Walter Dörwald4894c302003-10-24 14:25:28 +00004895int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004896 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004897 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004898{
Walter Dörwald4894c302003-10-24 14:25:28 +00004899 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004900 return -1;
4901 if (*res==NULL) {
4902 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004903 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004904 }
4905 else if (*res==Py_None)
4906 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00004907 else if (PyLong_Check(*res)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004908 /* no overflow check, because we know that the space is enough */
Christian Heimes217cfd12007-12-02 14:31:20 +00004909 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004910 }
4911 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004912 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004913 if (repsize==1) {
4914 /* no overflow check, because we know that the space is enough */
4915 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4916 }
4917 else if (repsize!=0) {
4918 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004919 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004920 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004921 repsize - 1;
4922 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004923 return -1;
4924 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4925 *outp += repsize;
4926 }
4927 }
4928 else
4929 return -1;
4930 return 0;
4931}
4932
4933PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004934 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004935 PyObject *mapping,
4936 const char *errors)
4937{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004938 /* output object */
4939 PyObject *res = NULL;
4940 /* pointers to the beginning and end+1 of input */
4941 const Py_UNICODE *startp = p;
4942 const Py_UNICODE *endp = p + size;
4943 /* pointer into the output */
4944 Py_UNICODE *str;
4945 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004946 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004947 char *reason = "character maps to <undefined>";
4948 PyObject *errorHandler = NULL;
4949 PyObject *exc = NULL;
4950 /* the following variable is used for caching string comparisons
4951 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4952 * 3=ignore, 4=xmlcharrefreplace */
4953 int known_errorHandler = -1;
4954
Guido van Rossumd57fd912000-03-10 22:53:23 +00004955 if (mapping == NULL) {
4956 PyErr_BadArgument();
4957 return NULL;
4958 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004959
4960 /* allocate enough for a simple 1:1 translation without
4961 replacements, if we need more, we'll resize */
4962 res = PyUnicode_FromUnicode(NULL, size);
4963 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004964 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004965 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004966 return res;
4967 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004968
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004969 while (p<endp) {
4970 /* try to encode it */
4971 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004972 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004973 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004974 goto onError;
4975 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004976 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004977 if (x!=Py_None) /* it worked => adjust input pointer */
4978 ++p;
4979 else { /* untranslatable character */
4980 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004981 Py_ssize_t repsize;
4982 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004983 Py_UNICODE *uni2;
4984 /* startpos for collecting untranslatable chars */
4985 const Py_UNICODE *collstart = p;
4986 const Py_UNICODE *collend = p+1;
4987 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004988
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004989 /* find all untranslatable characters */
4990 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004991 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004992 goto onError;
4993 Py_XDECREF(x);
4994 if (x!=Py_None)
4995 break;
4996 ++collend;
4997 }
4998 /* cache callback name lookup
4999 * (if not done yet, i.e. it's the first error) */
5000 if (known_errorHandler==-1) {
5001 if ((errors==NULL) || (!strcmp(errors, "strict")))
5002 known_errorHandler = 1;
5003 else if (!strcmp(errors, "replace"))
5004 known_errorHandler = 2;
5005 else if (!strcmp(errors, "ignore"))
5006 known_errorHandler = 3;
5007 else if (!strcmp(errors, "xmlcharrefreplace"))
5008 known_errorHandler = 4;
5009 else
5010 known_errorHandler = 0;
5011 }
5012 switch (known_errorHandler) {
5013 case 1: /* strict */
5014 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
5015 goto onError;
5016 case 2: /* replace */
5017 /* No need to check for space, this is a 1:1 replacement */
5018 for (coll = collstart; coll<collend; ++coll)
5019 *str++ = '?';
5020 /* fall through */
5021 case 3: /* ignore */
5022 p = collend;
5023 break;
5024 case 4: /* xmlcharrefreplace */
5025 /* generate replacement (temporarily (mis)uses p) */
5026 for (p = collstart; p < collend; ++p) {
5027 char buffer[2+29+1+1];
5028 char *cp;
5029 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00005030 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005031 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5032 goto onError;
5033 for (cp = buffer; *cp; ++cp)
5034 *str++ = *cp;
5035 }
5036 p = collend;
5037 break;
5038 default:
5039 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5040 reason, startp, size, &exc,
5041 collstart-startp, collend-startp, &newpos);
5042 if (repunicode == NULL)
5043 goto onError;
5044 /* generate replacement */
5045 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00005046 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005047 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5048 Py_DECREF(repunicode);
5049 goto onError;
5050 }
5051 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5052 *str++ = *uni2;
5053 p = startp + newpos;
5054 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005055 }
5056 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005057 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005058 /* Resize if we allocated to much */
5059 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005060 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00005061 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005062 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005063 }
5064 Py_XDECREF(exc);
5065 Py_XDECREF(errorHandler);
5066 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005067
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005068 onError:
5069 Py_XDECREF(res);
5070 Py_XDECREF(exc);
5071 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005072 return NULL;
5073}
5074
5075PyObject *PyUnicode_Translate(PyObject *str,
5076 PyObject *mapping,
5077 const char *errors)
5078{
5079 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005080
Guido van Rossumd57fd912000-03-10 22:53:23 +00005081 str = PyUnicode_FromObject(str);
5082 if (str == NULL)
5083 goto onError;
5084 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
5085 PyUnicode_GET_SIZE(str),
5086 mapping,
5087 errors);
5088 Py_DECREF(str);
5089 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005090
Guido van Rossumd57fd912000-03-10 22:53:23 +00005091 onError:
5092 Py_XDECREF(str);
5093 return NULL;
5094}
Tim Petersced69f82003-09-16 20:30:58 +00005095
Guido van Rossum9e896b32000-04-05 20:11:21 +00005096/* --- Decimal Encoder ---------------------------------------------------- */
5097
5098int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005099 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00005100 char *output,
5101 const char *errors)
5102{
5103 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005104 PyObject *errorHandler = NULL;
5105 PyObject *exc = NULL;
5106 const char *encoding = "decimal";
5107 const char *reason = "invalid decimal Unicode string";
5108 /* the following variable is used for caching string comparisons
5109 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5110 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005111
5112 if (output == NULL) {
5113 PyErr_BadArgument();
5114 return -1;
5115 }
5116
5117 p = s;
5118 end = s + length;
5119 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005120 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005121 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005122 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005123 Py_ssize_t repsize;
5124 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005125 Py_UNICODE *uni2;
5126 Py_UNICODE *collstart;
5127 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005128
Guido van Rossum9e896b32000-04-05 20:11:21 +00005129 if (Py_UNICODE_ISSPACE(ch)) {
5130 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005131 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005132 continue;
5133 }
5134 decimal = Py_UNICODE_TODECIMAL(ch);
5135 if (decimal >= 0) {
5136 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005137 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005138 continue;
5139 }
Guido van Rossumba477042000-04-06 18:18:10 +00005140 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00005141 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005142 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005143 continue;
5144 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005145 /* All other characters are considered unencodable */
5146 collstart = p;
5147 collend = p+1;
5148 while (collend < end) {
5149 if ((0 < *collend && *collend < 256) ||
5150 !Py_UNICODE_ISSPACE(*collend) ||
5151 Py_UNICODE_TODECIMAL(*collend))
5152 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005153 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005154 /* cache callback name lookup
5155 * (if not done yet, i.e. it's the first error) */
5156 if (known_errorHandler==-1) {
5157 if ((errors==NULL) || (!strcmp(errors, "strict")))
5158 known_errorHandler = 1;
5159 else if (!strcmp(errors, "replace"))
5160 known_errorHandler = 2;
5161 else if (!strcmp(errors, "ignore"))
5162 known_errorHandler = 3;
5163 else if (!strcmp(errors, "xmlcharrefreplace"))
5164 known_errorHandler = 4;
5165 else
5166 known_errorHandler = 0;
5167 }
5168 switch (known_errorHandler) {
5169 case 1: /* strict */
5170 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5171 goto onError;
5172 case 2: /* replace */
5173 for (p = collstart; p < collend; ++p)
5174 *output++ = '?';
5175 /* fall through */
5176 case 3: /* ignore */
5177 p = collend;
5178 break;
5179 case 4: /* xmlcharrefreplace */
5180 /* generate replacement (temporarily (mis)uses p) */
5181 for (p = collstart; p < collend; ++p)
5182 output += sprintf(output, "&#%d;", (int)*p);
5183 p = collend;
5184 break;
5185 default:
5186 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5187 encoding, reason, s, length, &exc,
5188 collstart-s, collend-s, &newpos);
5189 if (repunicode == NULL)
5190 goto onError;
5191 /* generate replacement */
5192 repsize = PyUnicode_GET_SIZE(repunicode);
5193 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5194 Py_UNICODE ch = *uni2;
5195 if (Py_UNICODE_ISSPACE(ch))
5196 *output++ = ' ';
5197 else {
5198 decimal = Py_UNICODE_TODECIMAL(ch);
5199 if (decimal >= 0)
5200 *output++ = '0' + decimal;
5201 else if (0 < ch && ch < 256)
5202 *output++ = (char)ch;
5203 else {
5204 Py_DECREF(repunicode);
5205 raise_encode_exception(&exc, encoding,
5206 s, length, collstart-s, collend-s, reason);
5207 goto onError;
5208 }
5209 }
5210 }
5211 p = s + newpos;
5212 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005213 }
5214 }
5215 /* 0-terminate the output string */
5216 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005217 Py_XDECREF(exc);
5218 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005219 return 0;
5220
5221 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005222 Py_XDECREF(exc);
5223 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005224 return -1;
5225}
5226
Guido van Rossumd57fd912000-03-10 22:53:23 +00005227/* --- Helpers ------------------------------------------------------------ */
5228
Eric Smith8c663262007-08-25 02:26:07 +00005229#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005230#include "stringlib/fastsearch.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005231#include "stringlib/count.h"
Christian Heimes9cd17752007-11-18 19:35:23 +00005232/* Include _ParseTupleFinds from find.h */
5233#define FROM_UNICODE
Thomas Wouters477c8d52006-05-27 19:21:47 +00005234#include "stringlib/find.h"
5235#include "stringlib/partition.h"
5236
Eric Smith5807c412008-05-11 21:00:57 +00005237#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
5238#include "stringlib/localeutil.h"
5239
/* Helper macro to fix up the slice bounds held in the local variables
   `start` and `end` against (obj)->length: a negative bound has the length
   added to it once and is then floored at 0, and `end` is capped at the
   length.

   The body is wrapped in do { } while (0) so that the macro expands to a
   single statement and is safe inside unbraced if/else bodies; invoke it
   with a trailing semicolon. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
5252
Martin v. Löwis18e16552006-02-15 17:27:45 +00005253Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005254 PyObject *substr,
5255 Py_ssize_t start,
5256 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005257{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005258 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005259 PyUnicodeObject* str_obj;
5260 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005261
Thomas Wouters477c8d52006-05-27 19:21:47 +00005262 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5263 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005264 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005265 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5266 if (!sub_obj) {
5267 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005268 return -1;
5269 }
Tim Petersced69f82003-09-16 20:30:58 +00005270
Thomas Wouters477c8d52006-05-27 19:21:47 +00005271 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005272
Thomas Wouters477c8d52006-05-27 19:21:47 +00005273 result = stringlib_count(
5274 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5275 );
5276
5277 Py_DECREF(sub_obj);
5278 Py_DECREF(str_obj);
5279
Guido van Rossumd57fd912000-03-10 22:53:23 +00005280 return result;
5281}
5282
Martin v. Löwis18e16552006-02-15 17:27:45 +00005283Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005284 PyObject *sub,
5285 Py_ssize_t start,
5286 Py_ssize_t end,
5287 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005288{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005289 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005290
Guido van Rossumd57fd912000-03-10 22:53:23 +00005291 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005292 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005293 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005294 sub = PyUnicode_FromObject(sub);
5295 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00005296 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005297 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005298 }
Tim Petersced69f82003-09-16 20:30:58 +00005299
Thomas Wouters477c8d52006-05-27 19:21:47 +00005300 if (direction > 0)
5301 result = stringlib_find_slice(
5302 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5303 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5304 start, end
5305 );
5306 else
5307 result = stringlib_rfind_slice(
5308 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5309 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5310 start, end
5311 );
5312
Guido van Rossumd57fd912000-03-10 22:53:23 +00005313 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005314 Py_DECREF(sub);
5315
Guido van Rossumd57fd912000-03-10 22:53:23 +00005316 return result;
5317}
5318
Tim Petersced69f82003-09-16 20:30:58 +00005319static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005320int tailmatch(PyUnicodeObject *self,
5321 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005322 Py_ssize_t start,
5323 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005324 int direction)
5325{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005326 if (substring->length == 0)
5327 return 1;
5328
Thomas Wouters477c8d52006-05-27 19:21:47 +00005329 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005330
5331 end -= substring->length;
5332 if (end < start)
5333 return 0;
5334
5335 if (direction > 0) {
5336 if (Py_UNICODE_MATCH(self, end, substring))
5337 return 1;
5338 } else {
5339 if (Py_UNICODE_MATCH(self, start, substring))
5340 return 1;
5341 }
5342
5343 return 0;
5344}
5345
Martin v. Löwis18e16552006-02-15 17:27:45 +00005346Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005347 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005348 Py_ssize_t start,
5349 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005350 int direction)
5351{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005352 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005353
Guido van Rossumd57fd912000-03-10 22:53:23 +00005354 str = PyUnicode_FromObject(str);
5355 if (str == NULL)
5356 return -1;
5357 substr = PyUnicode_FromObject(substr);
5358 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005359 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005360 return -1;
5361 }
Tim Petersced69f82003-09-16 20:30:58 +00005362
Guido van Rossumd57fd912000-03-10 22:53:23 +00005363 result = tailmatch((PyUnicodeObject *)str,
5364 (PyUnicodeObject *)substr,
5365 start, end, direction);
5366 Py_DECREF(str);
5367 Py_DECREF(substr);
5368 return result;
5369}
5370
Guido van Rossumd57fd912000-03-10 22:53:23 +00005371/* Apply fixfct filter to the Unicode object self and return a
5372 reference to the modified object */
5373
Tim Petersced69f82003-09-16 20:30:58 +00005374static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005375PyObject *fixup(PyUnicodeObject *self,
5376 int (*fixfct)(PyUnicodeObject *s))
5377{
5378
5379 PyUnicodeObject *u;
5380
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005381 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005382 if (u == NULL)
5383 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005384
5385 Py_UNICODE_COPY(u->str, self->str, self->length);
5386
Tim Peters7a29bd52001-09-12 03:03:31 +00005387 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005388 /* fixfct should return TRUE if it modified the buffer. If
5389 FALSE, return a reference to the original buffer instead
5390 (to save space, not time) */
5391 Py_INCREF(self);
5392 Py_DECREF(u);
5393 return (PyObject*) self;
5394 }
5395 return (PyObject*) u;
5396}
5397
Tim Petersced69f82003-09-16 20:30:58 +00005398static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005399int fixupper(PyUnicodeObject *self)
5400{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005401 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005402 Py_UNICODE *s = self->str;
5403 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005404
Guido van Rossumd57fd912000-03-10 22:53:23 +00005405 while (len-- > 0) {
5406 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005407
Guido van Rossumd57fd912000-03-10 22:53:23 +00005408 ch = Py_UNICODE_TOUPPER(*s);
5409 if (ch != *s) {
5410 status = 1;
5411 *s = ch;
5412 }
5413 s++;
5414 }
5415
5416 return status;
5417}
5418
Tim Petersced69f82003-09-16 20:30:58 +00005419static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005420int fixlower(PyUnicodeObject *self)
5421{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005422 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005423 Py_UNICODE *s = self->str;
5424 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005425
Guido van Rossumd57fd912000-03-10 22:53:23 +00005426 while (len-- > 0) {
5427 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005428
Guido van Rossumd57fd912000-03-10 22:53:23 +00005429 ch = Py_UNICODE_TOLOWER(*s);
5430 if (ch != *s) {
5431 status = 1;
5432 *s = ch;
5433 }
5434 s++;
5435 }
5436
5437 return status;
5438}
5439
Tim Petersced69f82003-09-16 20:30:58 +00005440static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005441int fixswapcase(PyUnicodeObject *self)
5442{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005443 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005444 Py_UNICODE *s = self->str;
5445 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005446
Guido van Rossumd57fd912000-03-10 22:53:23 +00005447 while (len-- > 0) {
5448 if (Py_UNICODE_ISUPPER(*s)) {
5449 *s = Py_UNICODE_TOLOWER(*s);
5450 status = 1;
5451 } else if (Py_UNICODE_ISLOWER(*s)) {
5452 *s = Py_UNICODE_TOUPPER(*s);
5453 status = 1;
5454 }
5455 s++;
5456 }
5457
5458 return status;
5459}
5460
Tim Petersced69f82003-09-16 20:30:58 +00005461static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005462int fixcapitalize(PyUnicodeObject *self)
5463{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005464 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005465 Py_UNICODE *s = self->str;
5466 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005467
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005468 if (len == 0)
5469 return 0;
5470 if (Py_UNICODE_ISLOWER(*s)) {
5471 *s = Py_UNICODE_TOUPPER(*s);
5472 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005473 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005474 s++;
5475 while (--len > 0) {
5476 if (Py_UNICODE_ISUPPER(*s)) {
5477 *s = Py_UNICODE_TOLOWER(*s);
5478 status = 1;
5479 }
5480 s++;
5481 }
5482 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005483}
5484
5485static
5486int fixtitle(PyUnicodeObject *self)
5487{
5488 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5489 register Py_UNICODE *e;
5490 int previous_is_cased;
5491
5492 /* Shortcut for single character strings */
5493 if (PyUnicode_GET_SIZE(self) == 1) {
5494 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5495 if (*p != ch) {
5496 *p = ch;
5497 return 1;
5498 }
5499 else
5500 return 0;
5501 }
Tim Petersced69f82003-09-16 20:30:58 +00005502
Guido van Rossumd57fd912000-03-10 22:53:23 +00005503 e = p + PyUnicode_GET_SIZE(self);
5504 previous_is_cased = 0;
5505 for (; p < e; p++) {
5506 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005507
Guido van Rossumd57fd912000-03-10 22:53:23 +00005508 if (previous_is_cased)
5509 *p = Py_UNICODE_TOLOWER(ch);
5510 else
5511 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005512
5513 if (Py_UNICODE_ISLOWER(ch) ||
5514 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005515 Py_UNICODE_ISTITLE(ch))
5516 previous_is_cased = 1;
5517 else
5518 previous_is_cased = 0;
5519 }
5520 return 1;
5521}
5522
Tim Peters8ce9f162004-08-27 01:49:32 +00005523PyObject *
5524PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005525{
Tim Peters8ce9f162004-08-27 01:49:32 +00005526 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005527 const Py_UNICODE blank = ' ';
5528 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005529 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005530 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005531 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5532 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005533 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5534 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005535 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005536 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005537 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005538
Tim Peters05eba1f2004-08-27 21:32:02 +00005539 fseq = PySequence_Fast(seq, "");
5540 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005541 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005542 }
5543
Tim Peters91879ab2004-08-27 22:35:44 +00005544 /* Grrrr. A codec may be invoked to convert str objects to
5545 * Unicode, and so it's possible to call back into Python code
5546 * during PyUnicode_FromObject(), and so it's possible for a sick
5547 * codec to change the size of fseq (if seq is a list). Therefore
5548 * we have to keep refetching the size -- can't assume seqlen
5549 * is invariant.
5550 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005551 seqlen = PySequence_Fast_GET_SIZE(fseq);
5552 /* If empty sequence, return u"". */
5553 if (seqlen == 0) {
5554 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5555 goto Done;
5556 }
5557 /* If singleton sequence with an exact Unicode, return that. */
5558 if (seqlen == 1) {
5559 item = PySequence_Fast_GET_ITEM(fseq, 0);
5560 if (PyUnicode_CheckExact(item)) {
5561 Py_INCREF(item);
5562 res = (PyUnicodeObject *)item;
5563 goto Done;
5564 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005565 }
5566
Tim Peters05eba1f2004-08-27 21:32:02 +00005567 /* At least two items to join, or one that isn't exact Unicode. */
5568 if (seqlen > 1) {
5569 /* Set up sep and seplen -- they're needed. */
5570 if (separator == NULL) {
5571 sep = &blank;
5572 seplen = 1;
5573 }
5574 else {
5575 internal_separator = PyUnicode_FromObject(separator);
5576 if (internal_separator == NULL)
5577 goto onError;
5578 sep = PyUnicode_AS_UNICODE(internal_separator);
5579 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00005580 /* In case PyUnicode_FromObject() mutated seq. */
5581 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005582 }
5583 }
5584
5585 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005586 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005587 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005588 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005589 res_p = PyUnicode_AS_UNICODE(res);
5590 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005591
Tim Peters05eba1f2004-08-27 21:32:02 +00005592 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005593 Py_ssize_t itemlen;
5594 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005595
5596 item = PySequence_Fast_GET_ITEM(fseq, i);
5597 /* Convert item to Unicode. */
Guido van Rossum98297ee2007-11-06 21:34:58 +00005598 if (!PyUnicode_Check(item)) {
5599 PyErr_Format(PyExc_TypeError,
5600 "sequence item %zd: expected str instance,"
5601 " %.80s found",
Christian Heimes90aa7642007-12-19 02:45:37 +00005602 i, Py_TYPE(item)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005603 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005604 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005605 item = PyUnicode_FromObject(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005606 if (item == NULL)
5607 goto onError;
5608 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005609
Tim Peters91879ab2004-08-27 22:35:44 +00005610 /* In case PyUnicode_FromObject() mutated seq. */
5611 seqlen = PySequence_Fast_GET_SIZE(fseq);
5612
Tim Peters8ce9f162004-08-27 01:49:32 +00005613 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005614 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005615 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005616 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005617 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005618 if (i < seqlen - 1) {
5619 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005620 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00005621 goto Overflow;
5622 }
5623 if (new_res_used > res_alloc) {
5624 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00005625 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00005626 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005627 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005628 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005629 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005630 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005631 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005632 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005633 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005634 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005635 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005636
5637 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005638 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005639 res_p += itemlen;
5640 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005641 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005642 res_p += seplen;
5643 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005644 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005645 res_used = new_res_used;
5646 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005647
Tim Peters05eba1f2004-08-27 21:32:02 +00005648 /* Shrink res to match the used area; this probably can't fail,
5649 * but it's cheap to check.
5650 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005651 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005652 goto onError;
5653
5654 Done:
5655 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005656 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005657 return (PyObject *)res;
5658
Tim Peters8ce9f162004-08-27 01:49:32 +00005659 Overflow:
5660 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005661 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005662 Py_DECREF(item);
5663 /* fall through */
5664
Guido van Rossumd57fd912000-03-10 22:53:23 +00005665 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005666 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005667 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005668 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005669 return NULL;
5670}
5671
Tim Petersced69f82003-09-16 20:30:58 +00005672static
5673PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005674 Py_ssize_t left,
5675 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005676 Py_UNICODE fill)
5677{
5678 PyUnicodeObject *u;
5679
5680 if (left < 0)
5681 left = 0;
5682 if (right < 0)
5683 right = 0;
5684
Tim Peters7a29bd52001-09-12 03:03:31 +00005685 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005686 Py_INCREF(self);
5687 return self;
5688 }
5689
5690 u = _PyUnicode_New(left + self->length + right);
5691 if (u) {
5692 if (left)
5693 Py_UNICODE_FILL(u->str, fill, left);
5694 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5695 if (right)
5696 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5697 }
5698
5699 return u;
5700}
5701
/* Append the slice data[left:right] to `list` as a new unicode object.

   Relies on the enclosing function providing a `PyObject *str` scratch
   variable, the `list` being filled, and an `onError` label, which is
   jumped to when creating the slice or appending it fails.

   The body is wrapped in do { } while (0) so that the macro expands to
   exactly one statement and cannot capture a following `else`. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
5712
5713static
5714PyObject *split_whitespace(PyUnicodeObject *self,
5715 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005716 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005717{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005718 register Py_ssize_t i;
5719 register Py_ssize_t j;
5720 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005721 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005722 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005723
5724 for (i = j = 0; i < len; ) {
5725 /* find a token */
Christian Heimes190d79e2008-01-30 11:58:22 +00005726 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005727 i++;
5728 j = i;
Christian Heimes190d79e2008-01-30 11:58:22 +00005729 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005730 i++;
5731 if (j < i) {
5732 if (maxcount-- <= 0)
5733 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005734 SPLIT_APPEND(buf, j, i);
5735 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005736 i++;
5737 j = i;
5738 }
5739 }
5740 if (j < len) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005741 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005742 }
5743 return list;
5744
5745 onError:
5746 Py_DECREF(list);
5747 return NULL;
5748}
5749
5750PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005751 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005752{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005753 register Py_ssize_t i;
5754 register Py_ssize_t j;
5755 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005756 PyObject *list;
5757 PyObject *str;
5758 Py_UNICODE *data;
5759
5760 string = PyUnicode_FromObject(string);
5761 if (string == NULL)
5762 return NULL;
5763 data = PyUnicode_AS_UNICODE(string);
5764 len = PyUnicode_GET_SIZE(string);
5765
Guido van Rossumd57fd912000-03-10 22:53:23 +00005766 list = PyList_New(0);
5767 if (!list)
5768 goto onError;
5769
5770 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005771 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005772
Guido van Rossumd57fd912000-03-10 22:53:23 +00005773 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005774 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005775 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005776
5777 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005778 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005779 if (i < len) {
5780 if (data[i] == '\r' && i + 1 < len &&
5781 data[i+1] == '\n')
5782 i += 2;
5783 else
5784 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005785 if (keepends)
5786 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005787 }
Guido van Rossum86662912000-04-11 15:38:46 +00005788 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005789 j = i;
5790 }
5791 if (j < len) {
5792 SPLIT_APPEND(data, j, len);
5793 }
5794
5795 Py_DECREF(string);
5796 return list;
5797
5798 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005799 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005800 Py_DECREF(string);
5801 return NULL;
5802}
5803
Tim Petersced69f82003-09-16 20:30:58 +00005804static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005805PyObject *split_char(PyUnicodeObject *self,
5806 PyObject *list,
5807 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005808 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005809{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005810 register Py_ssize_t i;
5811 register Py_ssize_t j;
5812 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005813 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005814 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005815
5816 for (i = j = 0; i < len; ) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005817 if (buf[i] == ch) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005818 if (maxcount-- <= 0)
5819 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005820 SPLIT_APPEND(buf, j, i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005821 i = j = i + 1;
5822 } else
5823 i++;
5824 }
5825 if (j <= len) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005826 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005827 }
5828 return list;
5829
5830 onError:
5831 Py_DECREF(list);
5832 return NULL;
5833}
5834
Tim Petersced69f82003-09-16 20:30:58 +00005835static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005836PyObject *split_substring(PyUnicodeObject *self,
5837 PyObject *list,
5838 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005839 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005840{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005841 register Py_ssize_t i;
5842 register Py_ssize_t j;
5843 Py_ssize_t len = self->length;
5844 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005845 PyObject *str;
5846
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005847 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005848 if (Py_UNICODE_MATCH(self, i, substring)) {
5849 if (maxcount-- <= 0)
5850 break;
5851 SPLIT_APPEND(self->str, j, i);
5852 i = j = i + sublen;
5853 } else
5854 i++;
5855 }
5856 if (j <= len) {
5857 SPLIT_APPEND(self->str, j, len);
5858 }
5859 return list;
5860
5861 onError:
5862 Py_DECREF(list);
5863 return NULL;
5864}
5865
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005866static
5867PyObject *rsplit_whitespace(PyUnicodeObject *self,
5868 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005869 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005870{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005871 register Py_ssize_t i;
5872 register Py_ssize_t j;
5873 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005874 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005875 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005876
5877 for (i = j = len - 1; i >= 0; ) {
5878 /* find a token */
Christian Heimes190d79e2008-01-30 11:58:22 +00005879 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005880 i--;
5881 j = i;
Christian Heimes190d79e2008-01-30 11:58:22 +00005882 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005883 i--;
5884 if (j > i) {
5885 if (maxcount-- <= 0)
5886 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005887 SPLIT_APPEND(buf, i + 1, j + 1);
5888 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005889 i--;
5890 j = i;
5891 }
5892 }
5893 if (j >= 0) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005894 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005895 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005896 if (PyList_Reverse(list) < 0)
5897 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005898 return list;
5899
5900 onError:
5901 Py_DECREF(list);
5902 return NULL;
5903}
5904
5905static
5906PyObject *rsplit_char(PyUnicodeObject *self,
5907 PyObject *list,
5908 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005909 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005910{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005911 register Py_ssize_t i;
5912 register Py_ssize_t j;
5913 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005914 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005915 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005916
5917 for (i = j = len - 1; i >= 0; ) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005918 if (buf[i] == ch) {
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005919 if (maxcount-- <= 0)
5920 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005921 SPLIT_APPEND(buf, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005922 j = i = i - 1;
5923 } else
5924 i--;
5925 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005926 if (j >= -1) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005927 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005928 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005929 if (PyList_Reverse(list) < 0)
5930 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005931 return list;
5932
5933 onError:
5934 Py_DECREF(list);
5935 return NULL;
5936}
5937
5938static
5939PyObject *rsplit_substring(PyUnicodeObject *self,
5940 PyObject *list,
5941 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005942 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005943{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005944 register Py_ssize_t i;
5945 register Py_ssize_t j;
5946 Py_ssize_t len = self->length;
5947 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005948 PyObject *str;
5949
5950 for (i = len - sublen, j = len; i >= 0; ) {
5951 if (Py_UNICODE_MATCH(self, i, substring)) {
5952 if (maxcount-- <= 0)
5953 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005954 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005955 j = i;
5956 i -= sublen;
5957 } else
5958 i--;
5959 }
5960 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005961 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005962 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005963 if (PyList_Reverse(list) < 0)
5964 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005965 return list;
5966
5967 onError:
5968 Py_DECREF(list);
5969 return NULL;
5970}
5971
Guido van Rossumd57fd912000-03-10 22:53:23 +00005972#undef SPLIT_APPEND
5973
5974static
5975PyObject *split(PyUnicodeObject *self,
5976 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005977 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005978{
5979 PyObject *list;
5980
5981 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005982 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005983
5984 list = PyList_New(0);
5985 if (!list)
5986 return NULL;
5987
5988 if (substring == NULL)
5989 return split_whitespace(self,list,maxcount);
5990
5991 else if (substring->length == 1)
5992 return split_char(self,list,substring->str[0],maxcount);
5993
5994 else if (substring->length == 0) {
5995 Py_DECREF(list);
5996 PyErr_SetString(PyExc_ValueError, "empty separator");
5997 return NULL;
5998 }
5999 else
6000 return split_substring(self,list,substring,maxcount);
6001}
6002
Tim Petersced69f82003-09-16 20:30:58 +00006003static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006004PyObject *rsplit(PyUnicodeObject *self,
6005 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006006 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006007{
6008 PyObject *list;
6009
6010 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006011 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006012
6013 list = PyList_New(0);
6014 if (!list)
6015 return NULL;
6016
6017 if (substring == NULL)
6018 return rsplit_whitespace(self,list,maxcount);
6019
6020 else if (substring->length == 1)
6021 return rsplit_char(self,list,substring->str[0],maxcount);
6022
6023 else if (substring->length == 0) {
6024 Py_DECREF(list);
6025 PyErr_SetString(PyExc_ValueError, "empty separator");
6026 return NULL;
6027 }
6028 else
6029 return rsplit_substring(self,list,substring,maxcount);
6030}
6031
6032static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006033PyObject *replace(PyUnicodeObject *self,
6034 PyUnicodeObject *str1,
6035 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006036 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006037{
6038 PyUnicodeObject *u;
6039
6040 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006041 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006042
Thomas Wouters477c8d52006-05-27 19:21:47 +00006043 if (str1->length == str2->length) {
6044 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006045 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006046 if (str1->length == 1) {
6047 /* replace characters */
6048 Py_UNICODE u1, u2;
6049 if (!findchar(self->str, self->length, str1->str[0]))
6050 goto nothing;
6051 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6052 if (!u)
6053 return NULL;
6054 Py_UNICODE_COPY(u->str, self->str, self->length);
6055 u1 = str1->str[0];
6056 u2 = str2->str[0];
6057 for (i = 0; i < u->length; i++)
6058 if (u->str[i] == u1) {
6059 if (--maxcount < 0)
6060 break;
6061 u->str[i] = u2;
6062 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006063 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006064 i = fastsearch(
6065 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00006066 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006067 if (i < 0)
6068 goto nothing;
6069 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6070 if (!u)
6071 return NULL;
6072 Py_UNICODE_COPY(u->str, self->str, self->length);
6073 while (i <= self->length - str1->length)
6074 if (Py_UNICODE_MATCH(self, i, str1)) {
6075 if (--maxcount < 0)
6076 break;
6077 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6078 i += str1->length;
6079 } else
6080 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006081 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006082 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006083
6084 Py_ssize_t n, i, j, e;
6085 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006086 Py_UNICODE *p;
6087
6088 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006089 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006090 if (n > maxcount)
6091 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006092 if (n == 0)
6093 goto nothing;
6094 /* new_size = self->length + n * (str2->length - str1->length)); */
6095 delta = (str2->length - str1->length);
6096 if (delta == 0) {
6097 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006098 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006099 product = n * (str2->length - str1->length);
6100 if ((product / (str2->length - str1->length)) != n) {
6101 PyErr_SetString(PyExc_OverflowError,
6102 "replace string is too long");
6103 return NULL;
6104 }
6105 new_size = self->length + product;
6106 if (new_size < 0) {
6107 PyErr_SetString(PyExc_OverflowError,
6108 "replace string is too long");
6109 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006110 }
6111 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006112 u = _PyUnicode_New(new_size);
6113 if (!u)
6114 return NULL;
6115 i = 0;
6116 p = u->str;
6117 e = self->length - str1->length;
6118 if (str1->length > 0) {
6119 while (n-- > 0) {
6120 /* look for next match */
6121 j = i;
6122 while (j <= e) {
6123 if (Py_UNICODE_MATCH(self, j, str1))
6124 break;
6125 j++;
6126 }
6127 if (j > i) {
6128 if (j > e)
6129 break;
6130 /* copy unchanged part [i:j] */
6131 Py_UNICODE_COPY(p, self->str+i, j-i);
6132 p += j - i;
6133 }
6134 /* copy substitution string */
6135 if (str2->length > 0) {
6136 Py_UNICODE_COPY(p, str2->str, str2->length);
6137 p += str2->length;
6138 }
6139 i = j + str1->length;
6140 }
6141 if (i < self->length)
6142 /* copy tail [i:] */
6143 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6144 } else {
6145 /* interleave */
6146 while (n > 0) {
6147 Py_UNICODE_COPY(p, str2->str, str2->length);
6148 p += str2->length;
6149 if (--n <= 0)
6150 break;
6151 *p++ = self->str[i++];
6152 }
6153 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6154 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006155 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006156 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006157
6158nothing:
6159 /* nothing to replace; return original string (when possible) */
6160 if (PyUnicode_CheckExact(self)) {
6161 Py_INCREF(self);
6162 return (PyObject *) self;
6163 }
6164 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006165}
6166
6167/* --- Unicode Object Methods --------------------------------------------- */
6168
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006169PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006170"S.title() -> unicode\n\
6171\n\
6172Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006173characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006174
6175static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006176unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006177{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006178 return fixup(self, fixtitle);
6179}
6180
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006181PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006182"S.capitalize() -> unicode\n\
6183\n\
6184Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006185have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006186
6187static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006188unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006189{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006190 return fixup(self, fixcapitalize);
6191}
6192
6193#if 0
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006194PyDoc_STRVAR(capwords__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006195"S.capwords() -> unicode\n\
6196\n\
6197Apply .capitalize() to all words in S and return the result with\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006198normalized whitespace (all whitespace strings are replaced by ' ').");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006199
6200static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006201unicode_capwords(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006202{
6203 PyObject *list;
6204 PyObject *item;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006205 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006206
Guido van Rossumd57fd912000-03-10 22:53:23 +00006207 /* Split into words */
6208 list = split(self, NULL, -1);
6209 if (!list)
6210 return NULL;
6211
6212 /* Capitalize each word */
6213 for (i = 0; i < PyList_GET_SIZE(list); i++) {
6214 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
6215 fixcapitalize);
6216 if (item == NULL)
6217 goto onError;
6218 Py_DECREF(PyList_GET_ITEM(list, i));
6219 PyList_SET_ITEM(list, i, item);
6220 }
6221
6222 /* Join the words to form a new string */
6223 item = PyUnicode_Join(NULL, list);
6224
6225onError:
6226 Py_DECREF(list);
6227 return (PyObject *)item;
6228}
6229#endif
6230
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006231/* Argument converter. Coerces to a single unicode character */
6232
6233static int
6234convert_uc(PyObject *obj, void *addr)
6235{
6236 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6237 PyObject *uniobj;
6238 Py_UNICODE *unistr;
6239
6240 uniobj = PyUnicode_FromObject(obj);
6241 if (uniobj == NULL) {
6242 PyErr_SetString(PyExc_TypeError,
6243 "The fill character cannot be converted to Unicode");
6244 return 0;
6245 }
6246 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6247 PyErr_SetString(PyExc_TypeError,
6248 "The fill character must be exactly one character long");
6249 Py_DECREF(uniobj);
6250 return 0;
6251 }
6252 unistr = PyUnicode_AS_UNICODE(uniobj);
6253 *fillcharloc = unistr[0];
6254 Py_DECREF(uniobj);
6255 return 1;
6256}
6257
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006258PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006259"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006260\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006261Return S centered in a Unicode string of length width. Padding is\n\
6262done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006263
6264static PyObject *
6265unicode_center(PyUnicodeObject *self, PyObject *args)
6266{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006267 Py_ssize_t marg, left;
6268 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006269 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006270
Thomas Woutersde017742006-02-16 19:34:37 +00006271 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006272 return NULL;
6273
Tim Peters7a29bd52001-09-12 03:03:31 +00006274 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006275 Py_INCREF(self);
6276 return (PyObject*) self;
6277 }
6278
6279 marg = width - self->length;
6280 left = marg / 2 + (marg & width & 1);
6281
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006282 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006283}
6284
Marc-André Lemburge5034372000-08-08 08:04:29 +00006285#if 0
6286
6287/* This code should go into some future Unicode collation support
6288 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006289 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006290
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006291/* speedy UTF-16 code point order comparison */
6292/* gleaned from: */
6293/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6294
/* Per-block adjustment table, indexed by (code unit >> 11), i.e. one entry
   per 0x800-wide block of 16-bit code units.  Only the last five entries
   are non-zero. */
static short utf16Fixup[32] =
{
    /* [0..7]   0x0000-0x3FFF */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* [8..15]  0x4000-0x7FFF */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* [16..23] 0x8000-0xBFFF */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* [24..26] 0xC000-0xD7FF */
    0, 0, 0,
    /* [27]     0xD800-0xDFFF */
    0x2000,
    /* [28..31] 0xE000-0xFFFF */
    -0x800, -0x800, -0x800, -0x800
};
6302
Guido van Rossumd57fd912000-03-10 22:53:23 +00006303static int
6304unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6305{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006306 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006307
Guido van Rossumd57fd912000-03-10 22:53:23 +00006308 Py_UNICODE *s1 = str1->str;
6309 Py_UNICODE *s2 = str2->str;
6310
6311 len1 = str1->length;
6312 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006313
Guido van Rossumd57fd912000-03-10 22:53:23 +00006314 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006315 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006316
6317 c1 = *s1++;
6318 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006319
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006320 if (c1 > (1<<11) * 26)
6321 c1 += utf16Fixup[c1>>11];
6322 if (c2 > (1<<11) * 26)
6323 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006324 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006325
6326 if (c1 != c2)
6327 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006328
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006329 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006330 }
6331
6332 return (len1 < len2) ? -1 : (len1 != len2);
6333}
6334
Marc-André Lemburge5034372000-08-08 08:04:29 +00006335#else
6336
6337static int
6338unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6339{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006340 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006341
6342 Py_UNICODE *s1 = str1->str;
6343 Py_UNICODE *s2 = str2->str;
6344
6345 len1 = str1->length;
6346 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006347
Marc-André Lemburge5034372000-08-08 08:04:29 +00006348 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006349 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006350
Fredrik Lundh45714e92001-06-26 16:39:36 +00006351 c1 = *s1++;
6352 c2 = *s2++;
6353
6354 if (c1 != c2)
6355 return (c1 < c2) ? -1 : 1;
6356
Marc-André Lemburge5034372000-08-08 08:04:29 +00006357 len1--; len2--;
6358 }
6359
6360 return (len1 < len2) ? -1 : (len1 != len2);
6361}
6362
6363#endif
6364
Guido van Rossumd57fd912000-03-10 22:53:23 +00006365int PyUnicode_Compare(PyObject *left,
6366 PyObject *right)
6367{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006368 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6369 return unicode_compare((PyUnicodeObject *)left,
6370 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006371 PyErr_Format(PyExc_TypeError,
6372 "Can't compare %.100s and %.100s",
6373 left->ob_type->tp_name,
6374 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006375 return -1;
6376}
6377
Martin v. Löwis5b222132007-06-10 09:51:05 +00006378int
6379PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6380{
6381 int i;
6382 Py_UNICODE *id;
6383 assert(PyUnicode_Check(uni));
6384 id = PyUnicode_AS_UNICODE(uni);
6385 /* Compare Unicode string and source character set string */
6386 for (i = 0; id[i] && str[i]; i++)
6387 if (id[i] != str[i])
6388 return ((int)id[i] < (int)str[i]) ? -1 : 1;
6389 if (id[i])
6390 return 1; /* uni is longer */
6391 if (str[i])
6392 return -1; /* str is longer */
6393 return 0;
6394}
6395
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006396PyObject *PyUnicode_RichCompare(PyObject *left,
6397 PyObject *right,
6398 int op)
6399{
6400 int result;
6401
6402 result = PyUnicode_Compare(left, right);
6403 if (result == -1 && PyErr_Occurred())
6404 goto onError;
6405
6406 /* Convert the return value to a Boolean */
6407 switch (op) {
6408 case Py_EQ:
6409 result = (result == 0);
6410 break;
6411 case Py_NE:
6412 result = (result != 0);
6413 break;
6414 case Py_LE:
6415 result = (result <= 0);
6416 break;
6417 case Py_GE:
6418 result = (result >= 0);
6419 break;
6420 case Py_LT:
6421 result = (result == -1);
6422 break;
6423 case Py_GT:
6424 result = (result == 1);
6425 break;
6426 }
6427 return PyBool_FromLong(result);
6428
6429 onError:
6430
6431 /* Standard case
6432
6433 Type errors mean that PyUnicode_FromObject() could not convert
6434 one of the arguments (usually the right hand side) to Unicode,
6435 ie. we can't handle the comparison request. However, it is
6436 possible that the other object knows a comparison method, which
6437 is why we return Py_NotImplemented to give the other object a
6438 chance.
6439
6440 */
6441 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6442 PyErr_Clear();
6443 Py_INCREF(Py_NotImplemented);
6444 return Py_NotImplemented;
6445 }
6446 if (op != Py_EQ && op != Py_NE)
6447 return NULL;
6448
6449 /* Equality comparison.
6450
6451 This is a special case: we silence any PyExc_UnicodeDecodeError
6452 and instead turn it into a PyErr_UnicodeWarning.
6453
6454 */
6455 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6456 return NULL;
6457 PyErr_Clear();
Skip Montanaro46fc3372007-08-12 11:44:53 +00006458 if (PyErr_WarnEx(PyExc_UnicodeWarning,
6459 (op == Py_EQ) ?
6460 "Unicode equal comparison "
6461 "failed to convert both arguments to Unicode - "
6462 "interpreting them as being unequal"
6463 :
6464 "Unicode unequal comparison "
6465 "failed to convert both arguments to Unicode - "
6466 "interpreting them as being unequal",
6467 1) < 0)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006468 return NULL;
6469 result = (op == Py_NE);
6470 return PyBool_FromLong(result);
6471}
6472
Guido van Rossum403d68b2000-03-13 15:55:09 +00006473int PyUnicode_Contains(PyObject *container,
6474 PyObject *element)
6475{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006476 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006477 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006478
6479 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006480 sub = PyUnicode_FromObject(element);
6481 if (!sub) {
Walter Dörwald26e0f512007-06-12 16:51:31 +00006482 PyErr_Format(PyExc_TypeError,
6483 "'in <string>' requires string as left operand, not %s",
6484 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006485 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006486 }
6487
Thomas Wouters477c8d52006-05-27 19:21:47 +00006488 str = PyUnicode_FromObject(container);
6489 if (!str) {
6490 Py_DECREF(sub);
6491 return -1;
6492 }
6493
6494 result = stringlib_contains_obj(str, sub);
6495
6496 Py_DECREF(str);
6497 Py_DECREF(sub);
6498
Guido van Rossum403d68b2000-03-13 15:55:09 +00006499 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006500}
6501
Guido van Rossumd57fd912000-03-10 22:53:23 +00006502/* Concat to string or Unicode object giving a new Unicode object. */
6503
6504PyObject *PyUnicode_Concat(PyObject *left,
6505 PyObject *right)
6506{
6507 PyUnicodeObject *u = NULL, *v = NULL, *w;
6508
6509 /* Coerce the two arguments */
6510 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6511 if (u == NULL)
6512 goto onError;
6513 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6514 if (v == NULL)
6515 goto onError;
6516
6517 /* Shortcuts */
6518 if (v == unicode_empty) {
6519 Py_DECREF(v);
6520 return (PyObject *)u;
6521 }
6522 if (u == unicode_empty) {
6523 Py_DECREF(u);
6524 return (PyObject *)v;
6525 }
6526
6527 /* Concat the two Unicode strings */
6528 w = _PyUnicode_New(u->length + v->length);
6529 if (w == NULL)
6530 goto onError;
6531 Py_UNICODE_COPY(w->str, u->str, u->length);
6532 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6533
6534 Py_DECREF(u);
6535 Py_DECREF(v);
6536 return (PyObject *)w;
6537
6538onError:
6539 Py_XDECREF(u);
6540 Py_XDECREF(v);
6541 return NULL;
6542}
6543
Walter Dörwald1ab83302007-05-18 17:15:44 +00006544void
6545PyUnicode_Append(PyObject **pleft, PyObject *right)
6546{
6547 PyObject *new;
6548 if (*pleft == NULL)
6549 return;
6550 if (right == NULL || !PyUnicode_Check(*pleft)) {
6551 Py_DECREF(*pleft);
6552 *pleft = NULL;
6553 return;
6554 }
6555 new = PyUnicode_Concat(*pleft, right);
6556 Py_DECREF(*pleft);
6557 *pleft = new;
6558}
6559
6560void
6561PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6562{
6563 PyUnicode_Append(pleft, right);
6564 Py_XDECREF(right);
6565}
6566
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006567PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006568"S.count(sub[, start[, end]]) -> int\n\
6569\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006570Return the number of non-overlapping occurrences of substring sub in\n\
6571Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006572interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006573
6574static PyObject *
6575unicode_count(PyUnicodeObject *self, PyObject *args)
6576{
6577 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006578 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006579 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006580 PyObject *result;
6581
Guido van Rossumb8872e62000-05-09 14:14:27 +00006582 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6583 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006584 return NULL;
6585
6586 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006587 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006588 if (substring == NULL)
6589 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006590
Thomas Wouters477c8d52006-05-27 19:21:47 +00006591 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006592
Christian Heimes217cfd12007-12-02 14:31:20 +00006593 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006594 stringlib_count(self->str + start, end - start,
6595 substring->str, substring->length)
6596 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006597
6598 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006599
Guido van Rossumd57fd912000-03-10 22:53:23 +00006600 return result;
6601}
6602
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006603PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006604"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006605\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006606Encodes S using the codec registered for encoding. encoding defaults\n\
6607to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006608handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006609a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6610'xmlcharrefreplace' as well as any other name registered with\n\
6611codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006612
6613static PyObject *
6614unicode_encode(PyUnicodeObject *self, PyObject *args)
6615{
6616 char *encoding = NULL;
6617 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006618 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00006619
Guido van Rossumd57fd912000-03-10 22:53:23 +00006620 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6621 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006622 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006623 if (v == NULL)
6624 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00006625 if (!PyString_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006626 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006627 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006628 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00006629 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006630 Py_DECREF(v);
6631 return NULL;
6632 }
6633 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006634
6635 onError:
6636 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006637}
6638
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006639PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006640"S.expandtabs([tabsize]) -> unicode\n\
6641\n\
6642Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006643If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006644
6645static PyObject*
6646unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6647{
6648 Py_UNICODE *e;
6649 Py_UNICODE *p;
6650 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006651 Py_UNICODE *qe;
6652 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006653 PyUnicodeObject *u;
6654 int tabsize = 8;
6655
6656 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6657 return NULL;
6658
Thomas Wouters7e474022000-07-16 12:04:32 +00006659 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006660 i = 0; /* chars up to and including most recent \n or \r */
6661 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6662 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006663 for (p = self->str; p < e; p++)
6664 if (*p == '\t') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006665 if (tabsize > 0) {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006666 incr = tabsize - (j % tabsize); /* cannot overflow */
6667 if (j > PY_SSIZE_T_MAX - incr)
6668 goto overflow1;
6669 j += incr;
6670 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006671 }
6672 else {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006673 if (j > PY_SSIZE_T_MAX - 1)
6674 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006675 j++;
6676 if (*p == '\n' || *p == '\r') {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006677 if (i > PY_SSIZE_T_MAX - j)
6678 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006679 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006680 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006681 }
6682 }
6683
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006684 if (i > PY_SSIZE_T_MAX - j)
6685 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006686
Guido van Rossumd57fd912000-03-10 22:53:23 +00006687 /* Second pass: create output string and fill it */
6688 u = _PyUnicode_New(i + j);
6689 if (!u)
6690 return NULL;
6691
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006692 j = 0; /* same as in first pass */
6693 q = u->str; /* next output char */
6694 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006695
6696 for (p = self->str; p < e; p++)
6697 if (*p == '\t') {
6698 if (tabsize > 0) {
6699 i = tabsize - (j % tabsize);
6700 j += i;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006701 while (i--) {
6702 if (q >= qe)
6703 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006704 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006705 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006706 }
6707 }
6708 else {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006709 if (q >= qe)
6710 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006711 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006712 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006713 if (*p == '\n' || *p == '\r')
6714 j = 0;
6715 }
6716
6717 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006718
6719 overflow2:
6720 Py_DECREF(u);
6721 overflow1:
6722 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6723 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006724}
6725
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006726PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006727"S.find(sub [,start [,end]]) -> int\n\
6728\n\
6729Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00006730such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006731arguments start and end are interpreted as in slice notation.\n\
6732\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006733Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006734
6735static PyObject *
6736unicode_find(PyUnicodeObject *self, PyObject *args)
6737{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006738 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00006739 Py_ssize_t start;
6740 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006741 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006742
Christian Heimes9cd17752007-11-18 19:35:23 +00006743 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006744 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006745
Thomas Wouters477c8d52006-05-27 19:21:47 +00006746 result = stringlib_find_slice(
6747 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6748 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6749 start, end
6750 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006751
6752 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006753
Christian Heimes217cfd12007-12-02 14:31:20 +00006754 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006755}
6756
6757static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006758unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006759{
6760 if (index < 0 || index >= self->length) {
6761 PyErr_SetString(PyExc_IndexError, "string index out of range");
6762 return NULL;
6763 }
6764
6765 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6766}
6767
Guido van Rossumc2504932007-09-18 19:42:40 +00006768/* Believe it or not, this produces the same value for ASCII strings
6769 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006770static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00006771unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006772{
Guido van Rossumc2504932007-09-18 19:42:40 +00006773 Py_ssize_t len;
6774 Py_UNICODE *p;
6775 long x;
6776
6777 if (self->hash != -1)
6778 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00006779 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00006780 p = self->str;
6781 x = *p << 7;
6782 while (--len >= 0)
6783 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00006784 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00006785 if (x == -1)
6786 x = -2;
6787 self->hash = x;
6788 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006789}
6790
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006791PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006792"S.index(sub [,start [,end]]) -> int\n\
6793\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006794Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006795
6796static PyObject *
6797unicode_index(PyUnicodeObject *self, PyObject *args)
6798{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006799 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006800 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00006801 Py_ssize_t start;
6802 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006803
Christian Heimes9cd17752007-11-18 19:35:23 +00006804 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006805 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006806
Thomas Wouters477c8d52006-05-27 19:21:47 +00006807 result = stringlib_find_slice(
6808 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6809 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6810 start, end
6811 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006812
6813 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006814
Guido van Rossumd57fd912000-03-10 22:53:23 +00006815 if (result < 0) {
6816 PyErr_SetString(PyExc_ValueError, "substring not found");
6817 return NULL;
6818 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006819
Christian Heimes217cfd12007-12-02 14:31:20 +00006820 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006821}
6822
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006823PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006824"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006825\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006826Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006827at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006828
6829static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006830unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006831{
6832 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6833 register const Py_UNICODE *e;
6834 int cased;
6835
Guido van Rossumd57fd912000-03-10 22:53:23 +00006836 /* Shortcut for single character strings */
6837 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006838 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006839
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006840 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006841 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006842 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006843
Guido van Rossumd57fd912000-03-10 22:53:23 +00006844 e = p + PyUnicode_GET_SIZE(self);
6845 cased = 0;
6846 for (; p < e; p++) {
6847 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006848
Guido van Rossumd57fd912000-03-10 22:53:23 +00006849 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006850 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006851 else if (!cased && Py_UNICODE_ISLOWER(ch))
6852 cased = 1;
6853 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006854 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006855}
6856
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006857PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006858"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006859\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006860Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006861at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006862
6863static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006864unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006865{
6866 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6867 register const Py_UNICODE *e;
6868 int cased;
6869
Guido van Rossumd57fd912000-03-10 22:53:23 +00006870 /* Shortcut for single character strings */
6871 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006872 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006873
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006874 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006875 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006876 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006877
Guido van Rossumd57fd912000-03-10 22:53:23 +00006878 e = p + PyUnicode_GET_SIZE(self);
6879 cased = 0;
6880 for (; p < e; p++) {
6881 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006882
Guido van Rossumd57fd912000-03-10 22:53:23 +00006883 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006884 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006885 else if (!cased && Py_UNICODE_ISUPPER(ch))
6886 cased = 1;
6887 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006888 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006889}
6890
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006891PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006892"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006893\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006894Return True if S is a titlecased string and there is at least one\n\
6895character in S, i.e. upper- and titlecase characters may only\n\
6896follow uncased characters and lowercase characters only cased ones.\n\
6897Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006898
6899static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006900unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006901{
6902 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6903 register const Py_UNICODE *e;
6904 int cased, previous_is_cased;
6905
Guido van Rossumd57fd912000-03-10 22:53:23 +00006906 /* Shortcut for single character strings */
6907 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006908 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6909 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006910
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006911 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006912 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006913 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006914
Guido van Rossumd57fd912000-03-10 22:53:23 +00006915 e = p + PyUnicode_GET_SIZE(self);
6916 cased = 0;
6917 previous_is_cased = 0;
6918 for (; p < e; p++) {
6919 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006920
Guido van Rossumd57fd912000-03-10 22:53:23 +00006921 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6922 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006923 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006924 previous_is_cased = 1;
6925 cased = 1;
6926 }
6927 else if (Py_UNICODE_ISLOWER(ch)) {
6928 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006929 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006930 previous_is_cased = 1;
6931 cased = 1;
6932 }
6933 else
6934 previous_is_cased = 0;
6935 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006936 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006937}
6938
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006939PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006940"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006941\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006942Return True if all characters in S are whitespace\n\
6943and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006944
6945static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006946unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006947{
6948 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6949 register const Py_UNICODE *e;
6950
Guido van Rossumd57fd912000-03-10 22:53:23 +00006951 /* Shortcut for single character strings */
6952 if (PyUnicode_GET_SIZE(self) == 1 &&
6953 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006954 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006955
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006956 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006957 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006958 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006959
Guido van Rossumd57fd912000-03-10 22:53:23 +00006960 e = p + PyUnicode_GET_SIZE(self);
6961 for (; p < e; p++) {
6962 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006963 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006964 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006965 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006966}
6967
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006968PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006969"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006970\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006971Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006972and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006973
6974static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006975unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006976{
6977 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6978 register const Py_UNICODE *e;
6979
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006980 /* Shortcut for single character strings */
6981 if (PyUnicode_GET_SIZE(self) == 1 &&
6982 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006983 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006984
6985 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006986 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006987 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006988
6989 e = p + PyUnicode_GET_SIZE(self);
6990 for (; p < e; p++) {
6991 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006992 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006993 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006994 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006995}
6996
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006997PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006998"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006999\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007000Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007001and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007002
7003static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007004unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007005{
7006 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7007 register const Py_UNICODE *e;
7008
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007009 /* Shortcut for single character strings */
7010 if (PyUnicode_GET_SIZE(self) == 1 &&
7011 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007012 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007013
7014 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007015 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007016 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007017
7018 e = p + PyUnicode_GET_SIZE(self);
7019 for (; p < e; p++) {
7020 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007021 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007022 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007023 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007024}
7025
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007026PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007027"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007028\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007029Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007030False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007031
7032static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007033unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007034{
7035 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7036 register const Py_UNICODE *e;
7037
Guido van Rossumd57fd912000-03-10 22:53:23 +00007038 /* Shortcut for single character strings */
7039 if (PyUnicode_GET_SIZE(self) == 1 &&
7040 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007041 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007042
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007043 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007044 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007045 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007046
Guido van Rossumd57fd912000-03-10 22:53:23 +00007047 e = p + PyUnicode_GET_SIZE(self);
7048 for (; p < e; p++) {
7049 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007050 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007051 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007052 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007053}
7054
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007055PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007056"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007057\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007058Return True if all characters in S are digits\n\
7059and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007060
7061static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007062unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007063{
7064 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7065 register const Py_UNICODE *e;
7066
Guido van Rossumd57fd912000-03-10 22:53:23 +00007067 /* Shortcut for single character strings */
7068 if (PyUnicode_GET_SIZE(self) == 1 &&
7069 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007070 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007071
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007072 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007073 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007074 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007075
Guido van Rossumd57fd912000-03-10 22:53:23 +00007076 e = p + PyUnicode_GET_SIZE(self);
7077 for (; p < e; p++) {
7078 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007079 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007080 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007081 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007082}
7083
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007084PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007085"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007086\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007087Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007088False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007089
7090static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007091unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007092{
7093 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7094 register const Py_UNICODE *e;
7095
Guido van Rossumd57fd912000-03-10 22:53:23 +00007096 /* Shortcut for single character strings */
7097 if (PyUnicode_GET_SIZE(self) == 1 &&
7098 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007099 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007100
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007101 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007102 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007103 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007104
Guido van Rossumd57fd912000-03-10 22:53:23 +00007105 e = p + PyUnicode_GET_SIZE(self);
7106 for (; p < e; p++) {
7107 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007108 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007109 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007110 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007111}
7112
Martin v. Löwis47383402007-08-15 07:32:56 +00007113int
7114PyUnicode_IsIdentifier(PyObject *self)
7115{
7116 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7117 register const Py_UNICODE *e;
7118
7119 /* Special case for empty strings */
7120 if (PyUnicode_GET_SIZE(self) == 0)
7121 return 0;
7122
7123 /* PEP 3131 says that the first character must be in
7124 XID_Start and subsequent characters in XID_Continue,
7125 and for the ASCII range, the 2.x rules apply (i.e
7126 start with letters and underscore, continue with
7127 letters, digits, underscore). However, given the current
7128 definition of XID_Start and XID_Continue, it is sufficient
7129 to check just for these, except that _ must be allowed
7130 as starting an identifier. */
7131 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7132 return 0;
7133
7134 e = p + PyUnicode_GET_SIZE(self);
7135 for (p++; p < e; p++) {
7136 if (!_PyUnicode_IsXidContinue(*p))
7137 return 0;
7138 }
7139 return 1;
7140}
7141
7142PyDoc_STRVAR(isidentifier__doc__,
7143"S.isidentifier() -> bool\n\
7144\n\
7145Return True if S is a valid identifier according\n\
7146to the language definition.");
7147
7148static PyObject*
7149unicode_isidentifier(PyObject *self)
7150{
7151 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7152}
7153
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007154PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007155"S.join(sequence) -> unicode\n\
7156\n\
7157Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007158sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007159
7160static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007161unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007162{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007163 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007164}
7165
Martin v. Löwis18e16552006-02-15 17:27:45 +00007166static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007167unicode_length(PyUnicodeObject *self)
7168{
7169 return self->length;
7170}
7171
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007172PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00007173"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007174\n\
7175Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007176done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007177
7178static PyObject *
7179unicode_ljust(PyUnicodeObject *self, PyObject *args)
7180{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007181 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007182 Py_UNICODE fillchar = ' ';
7183
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007184 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007185 return NULL;
7186
Tim Peters7a29bd52001-09-12 03:03:31 +00007187 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007188 Py_INCREF(self);
7189 return (PyObject*) self;
7190 }
7191
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007192 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007193}
7194
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007195PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007196"S.lower() -> unicode\n\
7197\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007198Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007199
7200static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007201unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007202{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007203 return fixup(self, fixlower);
7204}
7205
/* Strip modes; also used as indices into stripformat[] below */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, indexed by strip mode */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip mode i: the format string without its
   three-character "|O:" prefix */
#define STRIPNAME(i) (stripformat[(i)] + 3)
7214
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007215/* externally visible for str.strip(unicode) */
7216PyObject *
7217_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7218{
7219 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007220 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007221 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007222 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7223 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007224
Thomas Wouters477c8d52006-05-27 19:21:47 +00007225 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7226
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007227 i = 0;
7228 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007229 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7230 i++;
7231 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007232 }
7233
7234 j = len;
7235 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007236 do {
7237 j--;
7238 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7239 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007240 }
7241
7242 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007243 Py_INCREF(self);
7244 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007245 }
7246 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00007247 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007248}
7249
Guido van Rossumd57fd912000-03-10 22:53:23 +00007250
7251static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007252do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007253{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007254 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007255 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007256
7257 i = 0;
7258 if (striptype != RIGHTSTRIP) {
7259 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7260 i++;
7261 }
7262 }
7263
7264 j = len;
7265 if (striptype != LEFTSTRIP) {
7266 do {
7267 j--;
7268 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7269 j++;
7270 }
7271
7272 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7273 Py_INCREF(self);
7274 return (PyObject*)self;
7275 }
7276 else
7277 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007278}
7279
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007280
7281static PyObject *
7282do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7283{
7284 PyObject *sep = NULL;
7285
7286 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7287 return NULL;
7288
7289 if (sep != NULL && sep != Py_None) {
7290 if (PyUnicode_Check(sep))
7291 return _PyUnicode_XStrip(self, striptype, sep);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007292 else {
7293 PyErr_Format(PyExc_TypeError,
7294 "%s arg must be None, unicode or str",
7295 STRIPNAME(striptype));
7296 return NULL;
7297 }
7298 }
7299
7300 return do_strip(self, striptype);
7301}
7302
7303
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007304PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007305"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007306\n\
7307Return a copy of the string S with leading and trailing\n\
7308whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007309If chars is given and not None, remove characters in chars instead.\n\
7310If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007311
7312static PyObject *
7313unicode_strip(PyUnicodeObject *self, PyObject *args)
7314{
7315 if (PyTuple_GET_SIZE(args) == 0)
7316 return do_strip(self, BOTHSTRIP); /* Common case */
7317 else
7318 return do_argstrip(self, BOTHSTRIP, args);
7319}
7320
7321
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007322PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007323"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007324\n\
7325Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007326If chars is given and not None, remove characters in chars instead.\n\
7327If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007328
7329static PyObject *
7330unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7331{
7332 if (PyTuple_GET_SIZE(args) == 0)
7333 return do_strip(self, LEFTSTRIP); /* Common case */
7334 else
7335 return do_argstrip(self, LEFTSTRIP, args);
7336}
7337
7338
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007339PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007340"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007341\n\
7342Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007343If chars is given and not None, remove characters in chars instead.\n\
7344If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007345
7346static PyObject *
7347unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7348{
7349 if (PyTuple_GET_SIZE(args) == 0)
7350 return do_strip(self, RIGHTSTRIP); /* Common case */
7351 else
7352 return do_argstrip(self, RIGHTSTRIP, args);
7353}
7354
7355
Guido van Rossumd57fd912000-03-10 22:53:23 +00007356static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007357unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007358{
7359 PyUnicodeObject *u;
7360 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007361 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007362 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007363
7364 if (len < 0)
7365 len = 0;
7366
Tim Peters7a29bd52001-09-12 03:03:31 +00007367 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007368 /* no repeat, return original string */
7369 Py_INCREF(str);
7370 return (PyObject*) str;
7371 }
Tim Peters8f422462000-09-09 06:13:41 +00007372
7373 /* ensure # of chars needed doesn't overflow int and # of bytes
7374 * needed doesn't overflow size_t
7375 */
7376 nchars = len * str->length;
7377 if (len && nchars / len != str->length) {
7378 PyErr_SetString(PyExc_OverflowError,
7379 "repeated string is too long");
7380 return NULL;
7381 }
7382 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7383 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7384 PyErr_SetString(PyExc_OverflowError,
7385 "repeated string is too long");
7386 return NULL;
7387 }
7388 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007389 if (!u)
7390 return NULL;
7391
7392 p = u->str;
7393
Thomas Wouters477c8d52006-05-27 19:21:47 +00007394 if (str->length == 1 && len > 0) {
7395 Py_UNICODE_FILL(p, str->str[0], len);
7396 } else {
7397 Py_ssize_t done = 0; /* number of characters copied this far */
7398 if (done < nchars) {
7399 Py_UNICODE_COPY(p, str->str, str->length);
7400 done = str->length;
7401 }
7402 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00007403 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007404 Py_UNICODE_COPY(p+done, p, n);
7405 done += n;
7406 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007407 }
7408
7409 return (PyObject*) u;
7410}
7411
7412PyObject *PyUnicode_Replace(PyObject *obj,
7413 PyObject *subobj,
7414 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007415 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007416{
7417 PyObject *self;
7418 PyObject *str1;
7419 PyObject *str2;
7420 PyObject *result;
7421
7422 self = PyUnicode_FromObject(obj);
7423 if (self == NULL)
7424 return NULL;
7425 str1 = PyUnicode_FromObject(subobj);
7426 if (str1 == NULL) {
7427 Py_DECREF(self);
7428 return NULL;
7429 }
7430 str2 = PyUnicode_FromObject(replobj);
7431 if (str2 == NULL) {
7432 Py_DECREF(self);
7433 Py_DECREF(str1);
7434 return NULL;
7435 }
Tim Petersced69f82003-09-16 20:30:58 +00007436 result = replace((PyUnicodeObject *)self,
7437 (PyUnicodeObject *)str1,
7438 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007439 maxcount);
7440 Py_DECREF(self);
7441 Py_DECREF(str1);
7442 Py_DECREF(str2);
7443 return result;
7444}
7445
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007446PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007447"S.replace (old, new[, maxsplit]) -> unicode\n\
7448\n\
7449Return a copy of S with all occurrences of substring\n\
7450old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007451given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007452
7453static PyObject*
7454unicode_replace(PyUnicodeObject *self, PyObject *args)
7455{
7456 PyUnicodeObject *str1;
7457 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007458 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007459 PyObject *result;
7460
Martin v. Löwis18e16552006-02-15 17:27:45 +00007461 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007462 return NULL;
7463 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7464 if (str1 == NULL)
7465 return NULL;
7466 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007467 if (str2 == NULL) {
7468 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007469 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007470 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007471
7472 result = replace(self, str1, str2, maxcount);
7473
7474 Py_DECREF(str1);
7475 Py_DECREF(str2);
7476 return result;
7477}
7478
7479static
7480PyObject *unicode_repr(PyObject *unicode)
7481{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007482 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007483 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007484 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7485 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7486
7487 /* XXX(nnorwitz): rather than over-allocating, it would be
7488 better to choose a different scheme. Perhaps scan the
7489 first N-chars of the string and allocate based on that size.
7490 */
7491 /* Initial allocation is based on the longest-possible unichr
7492 escape.
7493
7494 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7495 unichr, so in this case it's the longest unichr escape. In
7496 narrow (UTF-16) builds this is five chars per source unichr
7497 since there are two unichrs in the surrogate pair, so in narrow
7498 (UTF-16) builds it's not the longest unichr escape.
7499
7500 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7501 so in the narrow (UTF-16) build case it's the longest unichr
7502 escape.
7503 */
7504
Walter Dörwald1ab83302007-05-18 17:15:44 +00007505 repr = PyUnicode_FromUnicode(NULL,
Walter Dörwald79e913e2007-05-12 11:08:06 +00007506 2 /* quotes */
7507#ifdef Py_UNICODE_WIDE
7508 + 10*size
7509#else
7510 + 6*size
7511#endif
7512 + 1);
7513 if (repr == NULL)
7514 return NULL;
7515
Walter Dörwald1ab83302007-05-18 17:15:44 +00007516 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007517
7518 /* Add quote */
7519 *p++ = (findchar(s, size, '\'') &&
7520 !findchar(s, size, '"')) ? '"' : '\'';
7521 while (size-- > 0) {
7522 Py_UNICODE ch = *s++;
7523
7524 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007525 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007526 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007527 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007528 continue;
7529 }
7530
7531#ifdef Py_UNICODE_WIDE
7532 /* Map 21-bit characters to '\U00xxxxxx' */
7533 else if (ch >= 0x10000) {
7534 *p++ = '\\';
7535 *p++ = 'U';
7536 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
7537 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
7538 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
7539 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
7540 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
7541 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
7542 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
7543 *p++ = hexdigits[ch & 0x0000000F];
7544 continue;
7545 }
7546#else
7547 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
7548 else if (ch >= 0xD800 && ch < 0xDC00) {
7549 Py_UNICODE ch2;
7550 Py_UCS4 ucs;
7551
7552 ch2 = *s++;
7553 size--;
7554 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
7555 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
7556 *p++ = '\\';
7557 *p++ = 'U';
7558 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7559 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7560 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7561 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7562 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7563 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7564 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7565 *p++ = hexdigits[ucs & 0x0000000F];
7566 continue;
7567 }
7568 /* Fall through: isolated surrogates are copied as-is */
7569 s--;
7570 size++;
7571 }
7572#endif
7573
7574 /* Map 16-bit characters to '\uxxxx' */
7575 if (ch >= 256) {
7576 *p++ = '\\';
7577 *p++ = 'u';
7578 *p++ = hexdigits[(ch >> 12) & 0x000F];
7579 *p++ = hexdigits[(ch >> 8) & 0x000F];
7580 *p++ = hexdigits[(ch >> 4) & 0x000F];
7581 *p++ = hexdigits[ch & 0x000F];
7582 }
7583
7584 /* Map special whitespace to '\t', \n', '\r' */
7585 else if (ch == '\t') {
7586 *p++ = '\\';
7587 *p++ = 't';
7588 }
7589 else if (ch == '\n') {
7590 *p++ = '\\';
7591 *p++ = 'n';
7592 }
7593 else if (ch == '\r') {
7594 *p++ = '\\';
7595 *p++ = 'r';
7596 }
7597
7598 /* Map non-printable US ASCII to '\xhh' */
7599 else if (ch < ' ' || ch >= 0x7F) {
7600 *p++ = '\\';
7601 *p++ = 'x';
7602 *p++ = hexdigits[(ch >> 4) & 0x000F];
7603 *p++ = hexdigits[ch & 0x000F];
7604 }
7605
7606 /* Copy everything else as-is */
7607 else
7608 *p++ = (char) ch;
7609 }
7610 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007611 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00007612
7613 *p = '\0';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007614 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00007615 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007616}
7617
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007618PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007619"S.rfind(sub [,start [,end]]) -> int\n\
7620\n\
7621Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007622such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007623arguments start and end are interpreted as in slice notation.\n\
7624\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007625Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007626
7627static PyObject *
7628unicode_rfind(PyUnicodeObject *self, PyObject *args)
7629{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007630 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007631 Py_ssize_t start;
7632 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007633 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007634
Christian Heimes9cd17752007-11-18 19:35:23 +00007635 if (!_ParseTupleFinds(args, &substring, &start, &end))
7636 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007637
Thomas Wouters477c8d52006-05-27 19:21:47 +00007638 result = stringlib_rfind_slice(
7639 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7640 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7641 start, end
7642 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007643
7644 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007645
Christian Heimes217cfd12007-12-02 14:31:20 +00007646 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007647}
7648
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007649PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007650"S.rindex(sub [,start [,end]]) -> int\n\
7651\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007652Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007653
7654static PyObject *
7655unicode_rindex(PyUnicodeObject *self, PyObject *args)
7656{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007657 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007658 Py_ssize_t start;
7659 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007660 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007661
Christian Heimes9cd17752007-11-18 19:35:23 +00007662 if (!_ParseTupleFinds(args, &substring, &start, &end))
7663 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007664
Thomas Wouters477c8d52006-05-27 19:21:47 +00007665 result = stringlib_rfind_slice(
7666 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7667 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7668 start, end
7669 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007670
7671 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007672
Guido van Rossumd57fd912000-03-10 22:53:23 +00007673 if (result < 0) {
7674 PyErr_SetString(PyExc_ValueError, "substring not found");
7675 return NULL;
7676 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007677 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007678}
7679
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007680PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007681"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007682\n\
7683Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007684done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007685
7686static PyObject *
7687unicode_rjust(PyUnicodeObject *self, PyObject *args)
7688{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007689 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007690 Py_UNICODE fillchar = ' ';
7691
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007692 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007693 return NULL;
7694
Tim Peters7a29bd52001-09-12 03:03:31 +00007695 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007696 Py_INCREF(self);
7697 return (PyObject*) self;
7698 }
7699
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007700 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007701}
7702
Guido van Rossumd57fd912000-03-10 22:53:23 +00007703PyObject *PyUnicode_Split(PyObject *s,
7704 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007705 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007706{
7707 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007708
Guido van Rossumd57fd912000-03-10 22:53:23 +00007709 s = PyUnicode_FromObject(s);
7710 if (s == NULL)
7711 return NULL;
7712 if (sep != NULL) {
7713 sep = PyUnicode_FromObject(sep);
7714 if (sep == NULL) {
7715 Py_DECREF(s);
7716 return NULL;
7717 }
7718 }
7719
7720 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7721
7722 Py_DECREF(s);
7723 Py_XDECREF(sep);
7724 return result;
7725}
7726
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007727PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007728"S.split([sep [,maxsplit]]) -> list of strings\n\
7729\n\
7730Return a list of the words in S, using sep as the\n\
7731delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00007732splits are done. If sep is not specified or is None, any\n\
7733whitespace string is a separator and leading and trailing\n\
7734whitespace is stripped before splitting.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007735
7736static PyObject*
7737unicode_split(PyUnicodeObject *self, PyObject *args)
7738{
7739 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007740 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007741
Martin v. Löwis18e16552006-02-15 17:27:45 +00007742 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007743 return NULL;
7744
7745 if (substring == Py_None)
7746 return split(self, NULL, maxcount);
7747 else if (PyUnicode_Check(substring))
7748 return split(self, (PyUnicodeObject *)substring, maxcount);
7749 else
7750 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7751}
7752
Thomas Wouters477c8d52006-05-27 19:21:47 +00007753PyObject *
7754PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7755{
7756 PyObject* str_obj;
7757 PyObject* sep_obj;
7758 PyObject* out;
7759
7760 str_obj = PyUnicode_FromObject(str_in);
7761 if (!str_obj)
7762 return NULL;
7763 sep_obj = PyUnicode_FromObject(sep_in);
7764 if (!sep_obj) {
7765 Py_DECREF(str_obj);
7766 return NULL;
7767 }
7768
7769 out = stringlib_partition(
7770 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7771 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7772 );
7773
7774 Py_DECREF(sep_obj);
7775 Py_DECREF(str_obj);
7776
7777 return out;
7778}
7779
7780
7781PyObject *
7782PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7783{
7784 PyObject* str_obj;
7785 PyObject* sep_obj;
7786 PyObject* out;
7787
7788 str_obj = PyUnicode_FromObject(str_in);
7789 if (!str_obj)
7790 return NULL;
7791 sep_obj = PyUnicode_FromObject(sep_in);
7792 if (!sep_obj) {
7793 Py_DECREF(str_obj);
7794 return NULL;
7795 }
7796
7797 out = stringlib_rpartition(
7798 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7799 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7800 );
7801
7802 Py_DECREF(sep_obj);
7803 Py_DECREF(str_obj);
7804
7805 return out;
7806}
7807
7808PyDoc_STRVAR(partition__doc__,
7809"S.partition(sep) -> (head, sep, tail)\n\
7810\n\
7811Searches for the separator sep in S, and returns the part before it,\n\
7812the separator itself, and the part after it. If the separator is not\n\
7813found, returns S and two empty strings.");
7814
7815static PyObject*
7816unicode_partition(PyUnicodeObject *self, PyObject *separator)
7817{
7818 return PyUnicode_Partition((PyObject *)self, separator);
7819}
7820
7821PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007822"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007823\n\
7824Searches for the separator sep in S, starting at the end of S, and returns\n\
7825the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00007826separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007827
7828static PyObject*
7829unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7830{
7831 return PyUnicode_RPartition((PyObject *)self, separator);
7832}
7833
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007834PyObject *PyUnicode_RSplit(PyObject *s,
7835 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007836 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007837{
7838 PyObject *result;
7839
7840 s = PyUnicode_FromObject(s);
7841 if (s == NULL)
7842 return NULL;
7843 if (sep != NULL) {
7844 sep = PyUnicode_FromObject(sep);
7845 if (sep == NULL) {
7846 Py_DECREF(s);
7847 return NULL;
7848 }
7849 }
7850
7851 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7852
7853 Py_DECREF(s);
7854 Py_XDECREF(sep);
7855 return result;
7856}
7857
7858PyDoc_STRVAR(rsplit__doc__,
7859"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7860\n\
7861Return a list of the words in S, using sep as the\n\
7862delimiter string, starting at the end of the string and\n\
7863working to the front. If maxsplit is given, at most maxsplit\n\
7864splits are done. If sep is not specified, any whitespace string\n\
7865is a separator.");
7866
7867static PyObject*
7868unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7869{
7870 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007871 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007872
Martin v. Löwis18e16552006-02-15 17:27:45 +00007873 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007874 return NULL;
7875
7876 if (substring == Py_None)
7877 return rsplit(self, NULL, maxcount);
7878 else if (PyUnicode_Check(substring))
7879 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7880 else
7881 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7882}
7883
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007884PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007885"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007886\n\
7887Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007888Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007889is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007890
7891static PyObject*
7892unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7893{
Guido van Rossum86662912000-04-11 15:38:46 +00007894 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007895
Guido van Rossum86662912000-04-11 15:38:46 +00007896 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007897 return NULL;
7898
Guido van Rossum86662912000-04-11 15:38:46 +00007899 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007900}
7901
7902static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007903PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007904{
Walter Dörwald346737f2007-05-31 10:44:43 +00007905 if (PyUnicode_CheckExact(self)) {
7906 Py_INCREF(self);
7907 return self;
7908 } else
7909 /* Subtype -- return genuine unicode string with the same value. */
7910 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
7911 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007912}
7913
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007914PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007915"S.swapcase() -> unicode\n\
7916\n\
7917Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007918and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007919
7920static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007921unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007922{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007923 return fixup(self, fixswapcase);
7924}
7925
Georg Brandlceee0772007-11-27 23:48:05 +00007926PyDoc_STRVAR(maketrans__doc__,
7927"str.maketrans(x[, y[, z]]) -> dict (static method)\n\
7928\n\
7929Return a translation table usable for str.translate().\n\
7930If there is only one argument, it must be a dictionary mapping Unicode\n\
7931ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
7932Character keys will then be converted to ordinals.\n\
7933If there are two arguments, they must be strings of equal length, and\n\
7934in the resulting dictionary, each character in x will be mapped to the\n\
7935character at the same position in y. If there is a third argument, it\n\
7936must be a string, whose characters will be mapped to None in the result.");
7937
7938static PyObject*
7939unicode_maketrans(PyUnicodeObject *null, PyObject *args)
7940{
7941 PyObject *x, *y = NULL, *z = NULL;
7942 PyObject *new = NULL, *key, *value;
7943 Py_ssize_t i = 0;
7944 int res;
7945
7946 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
7947 return NULL;
7948 new = PyDict_New();
7949 if (!new)
7950 return NULL;
7951 if (y != NULL) {
7952 /* x must be a string too, of equal length */
7953 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
7954 if (!PyUnicode_Check(x)) {
7955 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
7956 "be a string if there is a second argument");
7957 goto err;
7958 }
7959 if (PyUnicode_GET_SIZE(x) != ylen) {
7960 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
7961 "arguments must have equal length");
7962 goto err;
7963 }
7964 /* create entries for translating chars in x to those in y */
7965 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00007966 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
7967 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00007968 if (!key || !value)
7969 goto err;
7970 res = PyDict_SetItem(new, key, value);
7971 Py_DECREF(key);
7972 Py_DECREF(value);
7973 if (res < 0)
7974 goto err;
7975 }
7976 /* create entries for deleting chars in z */
7977 if (z != NULL) {
7978 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00007979 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00007980 if (!key)
7981 goto err;
7982 res = PyDict_SetItem(new, key, Py_None);
7983 Py_DECREF(key);
7984 if (res < 0)
7985 goto err;
7986 }
7987 }
7988 } else {
7989 /* x must be a dict */
7990 if (!PyDict_Check(x)) {
7991 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
7992 "to maketrans it must be a dict");
7993 goto err;
7994 }
7995 /* copy entries into the new dict, converting string keys to int keys */
7996 while (PyDict_Next(x, &i, &key, &value)) {
7997 if (PyUnicode_Check(key)) {
7998 /* convert string keys to integer keys */
7999 PyObject *newkey;
8000 if (PyUnicode_GET_SIZE(key) != 1) {
8001 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8002 "table must be of length 1");
8003 goto err;
8004 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008005 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008006 if (!newkey)
8007 goto err;
8008 res = PyDict_SetItem(new, newkey, value);
8009 Py_DECREF(newkey);
8010 if (res < 0)
8011 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008012 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008013 /* just keep integer keys */
8014 if (PyDict_SetItem(new, key, value) < 0)
8015 goto err;
8016 } else {
8017 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8018 "be strings or integers");
8019 goto err;
8020 }
8021 }
8022 }
8023 return new;
8024 err:
8025 Py_DECREF(new);
8026 return NULL;
8027}
8028
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008029PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008030"S.translate(table) -> unicode\n\
8031\n\
8032Return a copy of the string S, where all characters have been mapped\n\
8033through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008034Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
8035Unmapped characters are left untouched. Characters mapped to None\n\
8036are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008037
8038static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008039unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008040{
Georg Brandlceee0772007-11-27 23:48:05 +00008041 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008042}
8043
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008044PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008045"S.upper() -> unicode\n\
8046\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008047Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008048
8049static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008050unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008051{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008052 return fixup(self, fixupper);
8053}
8054
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008055PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008056"S.zfill(width) -> unicode\n\
8057\n\
8058Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008059of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008060
8061static PyObject *
8062unicode_zfill(PyUnicodeObject *self, PyObject *args)
8063{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008064 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008065 PyUnicodeObject *u;
8066
Martin v. Löwis18e16552006-02-15 17:27:45 +00008067 Py_ssize_t width;
8068 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008069 return NULL;
8070
8071 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008072 if (PyUnicode_CheckExact(self)) {
8073 Py_INCREF(self);
8074 return (PyObject*) self;
8075 }
8076 else
8077 return PyUnicode_FromUnicode(
8078 PyUnicode_AS_UNICODE(self),
8079 PyUnicode_GET_SIZE(self)
8080 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008081 }
8082
8083 fill = width - self->length;
8084
8085 u = pad(self, fill, 0, '0');
8086
Walter Dörwald068325e2002-04-15 13:36:47 +00008087 if (u == NULL)
8088 return NULL;
8089
Guido van Rossumd57fd912000-03-10 22:53:23 +00008090 if (u->str[fill] == '+' || u->str[fill] == '-') {
8091 /* move sign to beginning of string */
8092 u->str[0] = u->str[fill];
8093 u->str[fill] = '0';
8094 }
8095
8096 return (PyObject*) u;
8097}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008098
#if 0
/* Debugging aid, currently compiled out: returns the value of the
   file-level numfree counter as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    return PyLong_FromLong(numfree);
}
#endif
8106
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008107PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00008108"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008109\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008110Return True if S starts with the specified prefix, False otherwise.\n\
8111With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008112With optional end, stop comparing S at that position.\n\
8113prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008114
8115static PyObject *
8116unicode_startswith(PyUnicodeObject *self,
8117 PyObject *args)
8118{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008119 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008120 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008121 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008122 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008123 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008124
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008125 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00008126 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008127 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008128 if (PyTuple_Check(subobj)) {
8129 Py_ssize_t i;
8130 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8131 substring = (PyUnicodeObject *)PyUnicode_FromObject(
8132 PyTuple_GET_ITEM(subobj, i));
8133 if (substring == NULL)
8134 return NULL;
8135 result = tailmatch(self, substring, start, end, -1);
8136 Py_DECREF(substring);
8137 if (result) {
8138 Py_RETURN_TRUE;
8139 }
8140 }
8141 /* nothing matched */
8142 Py_RETURN_FALSE;
8143 }
8144 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008145 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008146 return NULL;
8147 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008148 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008149 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008150}
8151
8152
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008153PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00008154"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008155\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008156Return True if S ends with the specified suffix, False otherwise.\n\
8157With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008158With optional end, stop comparing S at that position.\n\
8159suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008160
8161static PyObject *
8162unicode_endswith(PyUnicodeObject *self,
8163 PyObject *args)
8164{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008165 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008166 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008167 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008168 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008169 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008170
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008171 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
8172 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008173 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008174 if (PyTuple_Check(subobj)) {
8175 Py_ssize_t i;
8176 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8177 substring = (PyUnicodeObject *)PyUnicode_FromObject(
8178 PyTuple_GET_ITEM(subobj, i));
8179 if (substring == NULL)
8180 return NULL;
8181 result = tailmatch(self, substring, start, end, +1);
8182 Py_DECREF(substring);
8183 if (result) {
8184 Py_RETURN_TRUE;
8185 }
8186 }
8187 Py_RETURN_FALSE;
8188 }
8189 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008190 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008191 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008192
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008193 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008194 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008195 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008196}
8197
Eric Smith8c663262007-08-25 02:26:07 +00008198#include "stringlib/string_format.h"
8199
8200PyDoc_STRVAR(format__doc__,
8201"S.format(*args, **kwargs) -> unicode\n\
8202\n\
8203");
8204
Eric Smith8c663262007-08-25 02:26:07 +00008205PyDoc_STRVAR(p_format__doc__,
8206"S.__format__(format_spec) -> unicode\n\
8207\n\
8208");
8209
8210static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008211unicode_getnewargs(PyUnicodeObject *v)
8212{
8213 return Py_BuildValue("(u#)", v->str, v->length);
8214}
8215
8216
Guido van Rossumd57fd912000-03-10 22:53:23 +00008217static PyMethodDef unicode_methods[] = {
8218
8219 /* Order is according to common usage: often used methods should
8220 appear first, since lookup is done sequentially. */
8221
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008222 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
8223 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8224 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008225 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008226 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8227 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8228 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8229 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8230 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8231 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8232 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008233 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008234 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8235 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8236 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008237 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008238 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8239 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8240 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008241 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008242 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008243 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008244 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008245 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8246 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8247 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8248 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8249 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8250 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8251 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8252 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8253 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8254 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8255 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8256 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8257 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8258 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008259 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008260 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008261 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
8262 {"__format__", (PyCFunction) unicode_unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008263 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8264 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Georg Brandlceee0772007-11-27 23:48:05 +00008265 {"maketrans", (PyCFunction) unicode_maketrans,
8266 METH_VARARGS | METH_STATIC, maketrans__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008267#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008268 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008269#endif
8270
8271#if 0
8272 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008273 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008274#endif
8275
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008276 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008277 {NULL, NULL}
8278};
8279
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008280static PyObject *
8281unicode_mod(PyObject *v, PyObject *w)
8282{
8283 if (!PyUnicode_Check(v)) {
8284 Py_INCREF(Py_NotImplemented);
8285 return Py_NotImplemented;
8286 }
8287 return PyUnicode_Format(v, w);
8288}
8289
8290static PyNumberMethods unicode_as_number = {
8291 0, /*nb_add*/
8292 0, /*nb_subtract*/
8293 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008294 unicode_mod, /*nb_remainder*/
8295};
8296
Guido van Rossumd57fd912000-03-10 22:53:23 +00008297static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008298 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008299 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008300 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8301 (ssizeargfunc) unicode_getitem, /* sq_item */
Thomas Woutersd2cf20e2007-08-30 22:57:53 +00008302 0, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008303 0, /* sq_ass_item */
8304 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008305 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008306};
8307
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008308static PyObject*
8309unicode_subscript(PyUnicodeObject* self, PyObject* item)
8310{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008311 if (PyIndex_Check(item)) {
8312 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008313 if (i == -1 && PyErr_Occurred())
8314 return NULL;
8315 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008316 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008317 return unicode_getitem(self, i);
8318 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008319 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008320 Py_UNICODE* source_buf;
8321 Py_UNICODE* result_buf;
8322 PyObject* result;
8323
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008324 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008325 &start, &stop, &step, &slicelength) < 0) {
8326 return NULL;
8327 }
8328
8329 if (slicelength <= 0) {
8330 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008331 } else if (start == 0 && step == 1 && slicelength == self->length &&
8332 PyUnicode_CheckExact(self)) {
8333 Py_INCREF(self);
8334 return (PyObject *)self;
8335 } else if (step == 1) {
8336 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008337 } else {
8338 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00008339 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8340 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008341
8342 if (result_buf == NULL)
8343 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008344
8345 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8346 result_buf[i] = source_buf[cur];
8347 }
Tim Petersced69f82003-09-16 20:30:58 +00008348
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008349 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00008350 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008351 return result;
8352 }
8353 } else {
8354 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8355 return NULL;
8356 }
8357}
8358
8359static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008360 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008361 (binaryfunc)unicode_subscript, /* mp_subscript */
8362 (objobjargproc)0, /* mp_ass_subscript */
8363};
8364
Guido van Rossumd57fd912000-03-10 22:53:23 +00008365
Guido van Rossumd57fd912000-03-10 22:53:23 +00008366/* Helpers for PyUnicode_Format() */
8367
8368static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008369getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008370{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008371 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008372 if (argidx < arglen) {
8373 (*p_argidx)++;
8374 if (arglen < 0)
8375 return args;
8376 else
8377 return PyTuple_GetItem(args, argidx);
8378 }
8379 PyErr_SetString(PyExc_TypeError,
8380 "not enough arguments for format string");
8381 return NULL;
8382}
8383
Martin v. Löwis18e16552006-02-15 17:27:45 +00008384static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008385strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008386{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008387 register Py_ssize_t i;
8388 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008389 for (i = len - 1; i >= 0; i--)
8390 buffer[i] = (Py_UNICODE) charbuffer[i];
8391
Guido van Rossumd57fd912000-03-10 22:53:23 +00008392 return len;
8393}
8394
Neal Norwitzfc76d632006-01-10 06:03:13 +00008395static int
8396doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8397{
Tim Peters15231542006-02-16 01:08:01 +00008398 Py_ssize_t result;
8399
Neal Norwitzfc76d632006-01-10 06:03:13 +00008400 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008401 result = strtounicode(buffer, (char *)buffer);
8402 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008403}
8404
#if 0
/* Disabled: counterpart of doubletounicode() for a C long.  The text is
   produced with PyOS_snprintf() inside the memory of `buffer` and then
   widened in place by strtounicode().  Returns the number of
   Py_UNICODE characters produced. */
static int
longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
{
    char *ascii = (char *)buffer;
    Py_ssize_t nchars;

    PyOS_snprintf(ascii, len, format, x);
    nchars = strtounicode(buffer, ascii);
    return Py_SAFE_DOWNCAST(nchars, Py_ssize_t, int);
}
#endif
Neal Norwitzfc76d632006-01-10 06:03:13 +00008416
Guido van Rossum078151d2002-08-11 04:24:12 +00008417/* XXX To save some code duplication, formatfloat/long/int could have been
8418 shared with stringobject.c, converting from 8-bit to Unicode after the
8419 formatting is done. */
8420
Guido van Rossumd57fd912000-03-10 22:53:23 +00008421static int
8422formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008423 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008424 int flags,
8425 int prec,
8426 int type,
8427 PyObject *v)
8428{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008429 /* fmt = '%#.' + `prec` + `type`
8430 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00008431 char fmt[20];
8432 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008433
Guido van Rossumd57fd912000-03-10 22:53:23 +00008434 x = PyFloat_AsDouble(v);
8435 if (x == -1.0 && PyErr_Occurred())
8436 return -1;
8437 if (prec < 0)
8438 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008439 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8440 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008441 /* Worst case length calc to ensure no buffer overrun:
8442
8443 'g' formats:
8444 fmt = %#.<prec>g
8445 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8446 for any double rep.)
8447 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8448
8449 'f' formats:
8450 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8451 len = 1 + 50 + 1 + prec = 52 + prec
8452
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008453 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008454 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008455
8456 */
Guido van Rossumb5a755e2007-07-18 18:15:48 +00008457 if (((type == 'g' || type == 'G') &&
8458 buflen <= (size_t)10 + (size_t)prec) ||
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008459 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008460 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008461 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008462 return -1;
8463 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008464 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8465 (flags&F_ALT) ? "#" : "",
8466 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008467 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008468}
8469
Tim Peters38fd5b62000-09-21 05:43:11 +00008470static PyObject*
8471formatlong(PyObject *val, int flags, int prec, int type)
8472{
8473 char *buf;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008474 int len;
Tim Peters38fd5b62000-09-21 05:43:11 +00008475 PyObject *str; /* temporary string object. */
Walter Dörwald63a28be2007-06-20 15:11:12 +00008476 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008477
8478 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8479 if (!str)
8480 return NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008481 result = PyUnicode_FromStringAndSize(buf, len);
Tim Peters38fd5b62000-09-21 05:43:11 +00008482 Py_DECREF(str);
Walter Dörwald63a28be2007-06-20 15:11:12 +00008483 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008484}
8485
#if 0
/* Disabled: format the integer object `v` through a C long into `buf`.

   Returns the number of characters written, or -1 with an exception
   set (conversion failure, or OverflowError when `buflen` is too small
   for the requested precision). */
static int
formatint(Py_UNICODE *buf,
          size_t buflen,
          int flags,
          int prec,
          int type,
          PyObject *v)
{
    /* fmt = '%#.' + `prec` + 'l' + `type`
     * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
     *                       + 1 + 1
     *                   = 24
     */
    char fmt[64];       /* plenty big enough! */
    char *sign = "";
    int radix_conv;     /* true for the 'x', 'X' and 'o' conversions */
    long x;

    x = PyLong_AsLong(v);
    if (x == -1 && PyErr_Occurred())
        return -1;
    if (x < 0 && type == 'u')
        type = 'd';

    /* For the radix conversions the minus sign is emitted as literal
       text in the format and the magnitude is what gets formatted. */
    radix_conv = (type == 'x' || type == 'X' || type == 'o');
    if (x < 0 && radix_conv)
        sign = "-";
    if (prec < 0)
        prec = 1;

    /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
     * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
     */
    if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
        PyErr_SetString(PyExc_OverflowError,
                        "formatted integer is too long (precision too large?)");
        return -1;
    }

    if ((flags & F_ALT) && radix_conv) {
        /* %#o, %#x and %#X are not left to the C library: it uses a
         * different base marker for octal than wanted here, the C
         * standard omits '0x'/'0X' when the value is 0 (inconsistent
         * with hex()), and some platforms deviate from the standard
         * (Metrowerks, Compaq Tru64, OS/2 EMX).  Instead the prefix
         * '0' + type is written literally and plain %x/%X/%o is used
         * for the digits, as formatint() in stringobject.c does.
         */
        PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
                      sign, type, prec, type);
    }
    else {
        PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
                      sign, (flags&F_ALT) ? "#" : "",
                      prec, type);
    }

    return longtounicode(buf, buflen, fmt, sign[0] ? -x : x);
}
#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008563
8564static int
8565formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008566 size_t buflen,
8567 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008568{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008569 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008570 if (PyUnicode_Check(v)) {
8571 if (PyUnicode_GET_SIZE(v) != 1)
8572 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008573 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008574 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008575 else {
8576 /* Integer input truncated to a character */
8577 long x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008578 x = PyLong_AsLong(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008579 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008580 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008581#ifdef Py_UNICODE_WIDE
8582 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008583 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008584 "%c arg not in range(0x110000) "
8585 "(wide Python build)");
8586 return -1;
8587 }
8588#else
8589 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008590 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008591 "%c arg not in range(0x10000) "
8592 "(narrow Python build)");
8593 return -1;
8594 }
8595#endif
8596 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008597 }
8598 buf[1] = '\0';
8599 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008600
8601 onError:
8602 PyErr_SetString(PyExc_TypeError,
8603 "%c requires int or char");
8604 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008605}
8606
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008607/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8608
8609 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
8610 chars are formatted. XXX This is a magic number. Each formatting
8611 routine does bounds checking to ensure no overflow, but a better
8612 solution may be to malloc a buffer of appropriate size for each
8613 format. For now, the current solution is sufficient.
8614*/
/* The expansion is fully parenthesized so the cast stays attached to
   the constant wherever the macro is used inside a larger expression. */
#define FORMATBUFLEN ((size_t)120)
8616
Guido van Rossumd57fd912000-03-10 22:53:23 +00008617PyObject *PyUnicode_Format(PyObject *format,
8618 PyObject *args)
8619{
8620 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008621 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008622 int args_owned = 0;
8623 PyUnicodeObject *result = NULL;
8624 PyObject *dict = NULL;
8625 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008626
Guido van Rossumd57fd912000-03-10 22:53:23 +00008627 if (format == NULL || args == NULL) {
8628 PyErr_BadInternalCall();
8629 return NULL;
8630 }
8631 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008632 if (uformat == NULL)
8633 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008634 fmt = PyUnicode_AS_UNICODE(uformat);
8635 fmtcnt = PyUnicode_GET_SIZE(uformat);
8636
8637 reslen = rescnt = fmtcnt + 100;
8638 result = _PyUnicode_New(reslen);
8639 if (result == NULL)
8640 goto onError;
8641 res = PyUnicode_AS_UNICODE(result);
8642
8643 if (PyTuple_Check(args)) {
8644 arglen = PyTuple_Size(args);
8645 argidx = 0;
8646 }
8647 else {
8648 arglen = -1;
8649 argidx = -2;
8650 }
Christian Heimes90aa7642007-12-19 02:45:37 +00008651 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00008652 !PyUnicode_Check(args))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008653 dict = args;
8654
8655 while (--fmtcnt >= 0) {
8656 if (*fmt != '%') {
8657 if (--rescnt < 0) {
8658 rescnt = fmtcnt + 100;
8659 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008660 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008661 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008662 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8663 --rescnt;
8664 }
8665 *res++ = *fmt++;
8666 }
8667 else {
8668 /* Got a format specifier */
8669 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008670 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008671 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008672 Py_UNICODE c = '\0';
8673 Py_UNICODE fill;
Christian Heimesa612dc02008-02-24 13:08:18 +00008674 int isnumok;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008675 PyObject *v = NULL;
8676 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008677 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008678 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008679 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008680 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008681
8682 fmt++;
8683 if (*fmt == '(') {
8684 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008685 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008686 PyObject *key;
8687 int pcount = 1;
8688
8689 if (dict == NULL) {
8690 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008691 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008692 goto onError;
8693 }
8694 ++fmt;
8695 --fmtcnt;
8696 keystart = fmt;
8697 /* Skip over balanced parentheses */
8698 while (pcount > 0 && --fmtcnt >= 0) {
8699 if (*fmt == ')')
8700 --pcount;
8701 else if (*fmt == '(')
8702 ++pcount;
8703 fmt++;
8704 }
8705 keylen = fmt - keystart - 1;
8706 if (fmtcnt < 0 || pcount > 0) {
8707 PyErr_SetString(PyExc_ValueError,
8708 "incomplete format key");
8709 goto onError;
8710 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008711#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008712 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008713 then looked up since Python uses strings to hold
8714 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008715 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008716 key = PyUnicode_EncodeUTF8(keystart,
8717 keylen,
8718 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008719#else
8720 key = PyUnicode_FromUnicode(keystart, keylen);
8721#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008722 if (key == NULL)
8723 goto onError;
8724 if (args_owned) {
8725 Py_DECREF(args);
8726 args_owned = 0;
8727 }
8728 args = PyObject_GetItem(dict, key);
8729 Py_DECREF(key);
8730 if (args == NULL) {
8731 goto onError;
8732 }
8733 args_owned = 1;
8734 arglen = -1;
8735 argidx = -2;
8736 }
8737 while (--fmtcnt >= 0) {
8738 switch (c = *fmt++) {
8739 case '-': flags |= F_LJUST; continue;
8740 case '+': flags |= F_SIGN; continue;
8741 case ' ': flags |= F_BLANK; continue;
8742 case '#': flags |= F_ALT; continue;
8743 case '0': flags |= F_ZERO; continue;
8744 }
8745 break;
8746 }
8747 if (c == '*') {
8748 v = getnextarg(args, arglen, &argidx);
8749 if (v == NULL)
8750 goto onError;
Christian Heimes217cfd12007-12-02 14:31:20 +00008751 if (!PyLong_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008752 PyErr_SetString(PyExc_TypeError,
8753 "* wants int");
8754 goto onError;
8755 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008756 width = PyLong_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008757 if (width == -1 && PyErr_Occurred())
8758 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008759 if (width < 0) {
8760 flags |= F_LJUST;
8761 width = -width;
8762 }
8763 if (--fmtcnt >= 0)
8764 c = *fmt++;
8765 }
8766 else if (c >= '0' && c <= '9') {
8767 width = c - '0';
8768 while (--fmtcnt >= 0) {
8769 c = *fmt++;
8770 if (c < '0' || c > '9')
8771 break;
8772 if ((width*10) / 10 != width) {
8773 PyErr_SetString(PyExc_ValueError,
8774 "width too big");
8775 goto onError;
8776 }
8777 width = width*10 + (c - '0');
8778 }
8779 }
8780 if (c == '.') {
8781 prec = 0;
8782 if (--fmtcnt >= 0)
8783 c = *fmt++;
8784 if (c == '*') {
8785 v = getnextarg(args, arglen, &argidx);
8786 if (v == NULL)
8787 goto onError;
Christian Heimes217cfd12007-12-02 14:31:20 +00008788 if (!PyLong_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008789 PyErr_SetString(PyExc_TypeError,
8790 "* wants int");
8791 goto onError;
8792 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008793 prec = PyLong_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008794 if (prec == -1 && PyErr_Occurred())
8795 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008796 if (prec < 0)
8797 prec = 0;
8798 if (--fmtcnt >= 0)
8799 c = *fmt++;
8800 }
8801 else if (c >= '0' && c <= '9') {
8802 prec = c - '0';
8803 while (--fmtcnt >= 0) {
8804 c = Py_CHARMASK(*fmt++);
8805 if (c < '0' || c > '9')
8806 break;
8807 if ((prec*10) / 10 != prec) {
8808 PyErr_SetString(PyExc_ValueError,
8809 "prec too big");
8810 goto onError;
8811 }
8812 prec = prec*10 + (c - '0');
8813 }
8814 }
8815 } /* prec */
8816 if (fmtcnt >= 0) {
8817 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008818 if (--fmtcnt >= 0)
8819 c = *fmt++;
8820 }
8821 }
8822 if (fmtcnt < 0) {
8823 PyErr_SetString(PyExc_ValueError,
8824 "incomplete format");
8825 goto onError;
8826 }
8827 if (c != '%') {
8828 v = getnextarg(args, arglen, &argidx);
8829 if (v == NULL)
8830 goto onError;
8831 }
8832 sign = 0;
8833 fill = ' ';
8834 switch (c) {
8835
8836 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008837 pbuf = formatbuf;
8838 /* presume that buffer length is at least 1 */
8839 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008840 len = 1;
8841 break;
8842
8843 case 's':
8844 case 'r':
8845 if (PyUnicode_Check(v) && c == 's') {
8846 temp = v;
8847 Py_INCREF(temp);
8848 }
8849 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008850 if (c == 's')
Thomas Heller519a0422007-11-15 20:48:54 +00008851 temp = PyObject_Str(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008852 else
8853 temp = PyObject_Repr(v);
8854 if (temp == NULL)
8855 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008856 if (PyUnicode_Check(temp))
8857 /* nothing to do */;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008858 else {
8859 Py_DECREF(temp);
8860 PyErr_SetString(PyExc_TypeError,
8861 "%s argument has non-string str()");
8862 goto onError;
8863 }
8864 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008865 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008866 len = PyUnicode_GET_SIZE(temp);
8867 if (prec >= 0 && len > prec)
8868 len = prec;
8869 break;
8870
8871 case 'i':
8872 case 'd':
8873 case 'u':
8874 case 'o':
8875 case 'x':
8876 case 'X':
8877 if (c == 'i')
8878 c = 'd';
Christian Heimesa612dc02008-02-24 13:08:18 +00008879 isnumok = 0;
8880 if (PyNumber_Check(v)) {
8881 PyObject *iobj=NULL;
8882
8883 if (PyLong_Check(v)) {
8884 iobj = v;
8885 Py_INCREF(iobj);
8886 }
8887 else {
8888 iobj = PyNumber_Long(v);
8889 }
8890 if (iobj!=NULL) {
8891 if (PyLong_Check(iobj)) {
8892 isnumok = 1;
8893 temp = formatlong(iobj, flags, prec, c);
8894 Py_DECREF(iobj);
8895 if (!temp)
8896 goto onError;
8897 pbuf = PyUnicode_AS_UNICODE(temp);
8898 len = PyUnicode_GET_SIZE(temp);
8899 sign = 1;
8900 }
8901 else {
8902 Py_DECREF(iobj);
8903 }
8904 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008905 }
Christian Heimesa612dc02008-02-24 13:08:18 +00008906 if (!isnumok) {
8907 PyErr_Format(PyExc_TypeError,
8908 "%%%c format: a number is required, "
Martin v. Löwis5a6f4582008-04-07 03:22:07 +00008909 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
Tim Peters38fd5b62000-09-21 05:43:11 +00008910 goto onError;
Tim Peters38fd5b62000-09-21 05:43:11 +00008911 }
8912 if (flags & F_ZERO)
8913 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008914 break;
8915
8916 case 'e':
8917 case 'E':
8918 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008919 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008920 case 'g':
8921 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008922 if (c == 'F')
8923 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008924 pbuf = formatbuf;
8925 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8926 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008927 if (len < 0)
8928 goto onError;
8929 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008930 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008931 fill = '0';
8932 break;
8933
8934 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008935 pbuf = formatbuf;
8936 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008937 if (len < 0)
8938 goto onError;
8939 break;
8940
8941 default:
8942 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008943 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00008944 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008945 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008946 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00008947 (Py_ssize_t)(fmt - 1 -
8948 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008949 goto onError;
8950 }
8951 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008952 if (*pbuf == '-' || *pbuf == '+') {
8953 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008954 len--;
8955 }
8956 else if (flags & F_SIGN)
8957 sign = '+';
8958 else if (flags & F_BLANK)
8959 sign = ' ';
8960 else
8961 sign = 0;
8962 }
8963 if (width < len)
8964 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008965 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008966 reslen -= rescnt;
8967 rescnt = width + fmtcnt + 100;
8968 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008969 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008970 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008971 PyErr_NoMemory();
8972 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008973 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008974 if (_PyUnicode_Resize(&result, reslen) < 0) {
8975 Py_XDECREF(temp);
8976 goto onError;
8977 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008978 res = PyUnicode_AS_UNICODE(result)
8979 + reslen - rescnt;
8980 }
8981 if (sign) {
8982 if (fill != ' ')
8983 *res++ = sign;
8984 rescnt--;
8985 if (width > len)
8986 width--;
8987 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008988 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008989 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008990 assert(pbuf[1] == c);
8991 if (fill != ' ') {
8992 *res++ = *pbuf++;
8993 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008994 }
Tim Petersfff53252001-04-12 18:38:48 +00008995 rescnt -= 2;
8996 width -= 2;
8997 if (width < 0)
8998 width = 0;
8999 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00009000 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009001 if (width > len && !(flags & F_LJUST)) {
9002 do {
9003 --rescnt;
9004 *res++ = fill;
9005 } while (--width > len);
9006 }
Tim Peters38fd5b62000-09-21 05:43:11 +00009007 if (fill == ' ') {
9008 if (sign)
9009 *res++ = sign;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00009010 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00009011 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00009012 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00009013 *res++ = *pbuf++;
9014 *res++ = *pbuf++;
9015 }
9016 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009017 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009018 res += len;
9019 rescnt -= len;
9020 while (--width >= len) {
9021 --rescnt;
9022 *res++ = ' ';
9023 }
9024 if (dict && (argidx < arglen) && c != '%') {
9025 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00009026 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009027 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009028 goto onError;
9029 }
9030 Py_XDECREF(temp);
9031 } /* '%' */
9032 } /* until end */
9033 if (argidx < arglen && !dict) {
9034 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00009035 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009036 goto onError;
9037 }
9038
Thomas Woutersa96affe2006-03-12 00:29:36 +00009039 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
9040 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009041 if (args_owned) {
9042 Py_DECREF(args);
9043 }
9044 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009045 return (PyObject *)result;
9046
9047 onError:
9048 Py_XDECREF(result);
9049 Py_DECREF(uformat);
9050 if (args_owned) {
9051 Py_DECREF(args);
9052 }
9053 return NULL;
9054}
9055
Jeremy Hylton938ace62002-07-17 16:30:39 +00009056static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009057unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9058
Tim Peters6d6c1a32001-08-02 04:15:00 +00009059static PyObject *
9060unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9061{
9062 PyObject *x = NULL;
Guido van Rossum55b4a7b2007-07-11 09:28:11 +00009063 static char *kwlist[] = {"object", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00009064 char *encoding = NULL;
9065 char *errors = NULL;
9066
Guido van Rossume023fe02001-08-30 03:12:59 +00009067 if (type != &PyUnicode_Type)
9068 return unicode_subtype_new(type, args, kwds);
Alexandre Vassalotti999679a2008-05-03 04:42:16 +00009069 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Tim Peters6d6c1a32001-08-02 04:15:00 +00009070 kwlist, &x, &encoding, &errors))
9071 return NULL;
9072 if (x == NULL)
9073 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00009074 if (encoding == NULL && errors == NULL)
Thomas Heller519a0422007-11-15 20:48:54 +00009075 return PyObject_Str(x);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00009076 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00009077 return PyUnicode_FromEncodedObject(x, encoding, errors);
9078}
9079
Guido van Rossume023fe02001-08-30 03:12:59 +00009080static PyObject *
9081unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9082{
Tim Petersaf90b3e2001-09-12 05:18:58 +00009083 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009084 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009085
9086 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9087 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9088 if (tmp == NULL)
9089 return NULL;
9090 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00009091 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00009092 if (pnew == NULL) {
9093 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00009094 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00009095 }
Christian Heimesb186d002008-03-18 15:15:01 +00009096 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
Tim Petersaf90b3e2001-09-12 05:18:58 +00009097 if (pnew->str == NULL) {
9098 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00009099 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00009100 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00009101 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00009102 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00009103 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9104 pnew->length = n;
9105 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00009106 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00009107 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009108}
9109
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009110PyDoc_STRVAR(unicode_doc,
Collin Winterd474ce82007-08-07 19:42:11 +00009111"str(string [, encoding[, errors]]) -> object\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009112\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009113Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009114encoding defaults to the current default string encoding.\n\
9115errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009116
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009117static PyObject *unicode_iter(PyObject *seq);
9118
Guido van Rossumd57fd912000-03-10 22:53:23 +00009119PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009120 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossum84fc66d2007-05-03 17:18:26 +00009121 "str", /* tp_name */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009122 sizeof(PyUnicodeObject), /* tp_size */
9123 0, /* tp_itemsize */
9124 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00009125 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009126 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009127 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009128 0, /* tp_setattr */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009129 0, /* tp_compare */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009130 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009131 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009132 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009133 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009134 (hashfunc) unicode_hash, /* tp_hash*/
9135 0, /* tp_call*/
9136 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009137 PyObject_GenericGetAttr, /* tp_getattro */
9138 0, /* tp_setattro */
Alexandre Vassalotti70a23712007-10-14 02:05:51 +00009139 0, /* tp_as_buffer */
Thomas Wouters27d517b2007-02-25 20:39:11 +00009140 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
9141 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009142 unicode_doc, /* tp_doc */
9143 0, /* tp_traverse */
9144 0, /* tp_clear */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009145 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009146 0, /* tp_weaklistoffset */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009147 unicode_iter, /* tp_iter */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009148 0, /* tp_iternext */
9149 unicode_methods, /* tp_methods */
9150 0, /* tp_members */
9151 0, /* tp_getset */
Guido van Rossum3172c5d2007-10-16 18:12:55 +00009152 &PyBaseObject_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009153 0, /* tp_dict */
9154 0, /* tp_descr_get */
9155 0, /* tp_descr_set */
9156 0, /* tp_dictoffset */
9157 0, /* tp_init */
9158 0, /* tp_alloc */
9159 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00009160 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009161};
9162
9163/* Initialize the Unicode implementation */
9164
Thomas Wouters78890102000-07-22 19:25:51 +00009165void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009166{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009167 int i;
9168
Thomas Wouters477c8d52006-05-27 19:21:47 +00009169 /* XXX - move this array to unicodectype.c ? */
9170 Py_UNICODE linebreak[] = {
9171 0x000A, /* LINE FEED */
9172 0x000D, /* CARRIAGE RETURN */
9173 0x001C, /* FILE SEPARATOR */
9174 0x001D, /* GROUP SEPARATOR */
9175 0x001E, /* RECORD SEPARATOR */
9176 0x0085, /* NEXT LINE */
9177 0x2028, /* LINE SEPARATOR */
9178 0x2029, /* PARAGRAPH SEPARATOR */
9179 };
9180
Fred Drakee4315f52000-05-09 19:53:39 +00009181 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009182 free_list = NULL;
9183 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009184 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009185 if (!unicode_empty)
9186 return;
9187
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009188 for (i = 0; i < 256; i++)
9189 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009190 if (PyType_Ready(&PyUnicode_Type) < 0)
9191 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009192
9193 /* initialize the linebreak bloom filter */
9194 bloom_linebreak = make_bloom_mask(
9195 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9196 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009197
9198 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009199}
9200
9201/* Finalize the Unicode implementation */
9202
Christian Heimesa156e092008-02-16 07:38:31 +00009203int
9204PyUnicode_ClearFreeList(void)
9205{
9206 int freelist_size = numfree;
9207 PyUnicodeObject *u;
9208
9209 for (u = free_list; u != NULL;) {
9210 PyUnicodeObject *v = u;
9211 u = *(PyUnicodeObject **)u;
9212 if (v->str)
Christian Heimesb186d002008-03-18 15:15:01 +00009213 PyObject_DEL(v->str);
Christian Heimesa156e092008-02-16 07:38:31 +00009214 Py_XDECREF(v->defenc);
9215 PyObject_Del(v);
9216 numfree--;
9217 }
9218 free_list = NULL;
9219 assert(numfree == 0);
9220 return freelist_size;
9221}
9222
Guido van Rossumd57fd912000-03-10 22:53:23 +00009223void
Thomas Wouters78890102000-07-22 19:25:51 +00009224_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009225{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009226 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009227
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009228 Py_XDECREF(unicode_empty);
9229 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009230
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009231 for (i = 0; i < 256; i++) {
9232 if (unicode_latin1[i]) {
9233 Py_DECREF(unicode_latin1[i]);
9234 unicode_latin1[i] = NULL;
9235 }
9236 }
Christian Heimesa156e092008-02-16 07:38:31 +00009237 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009238}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009239
Walter Dörwald16807132007-05-25 13:52:07 +00009240void
9241PyUnicode_InternInPlace(PyObject **p)
9242{
9243 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9244 PyObject *t;
9245 if (s == NULL || !PyUnicode_Check(s))
9246 Py_FatalError(
9247 "PyUnicode_InternInPlace: unicode strings only please!");
9248 /* If it's a subclass, we don't really know what putting
9249 it in the interned dict might do. */
9250 if (!PyUnicode_CheckExact(s))
9251 return;
9252 if (PyUnicode_CHECK_INTERNED(s))
9253 return;
9254 if (interned == NULL) {
9255 interned = PyDict_New();
9256 if (interned == NULL) {
9257 PyErr_Clear(); /* Don't leave an exception */
9258 return;
9259 }
9260 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009261 /* It might be that the GetItem call fails even
9262 though the key is present in the dictionary,
9263 namely when this happens during a stack overflow. */
9264 Py_ALLOW_RECURSION
Walter Dörwald16807132007-05-25 13:52:07 +00009265 t = PyDict_GetItem(interned, (PyObject *)s);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009266 Py_END_ALLOW_RECURSION
9267
Walter Dörwald16807132007-05-25 13:52:07 +00009268 if (t) {
9269 Py_INCREF(t);
9270 Py_DECREF(*p);
9271 *p = t;
9272 return;
9273 }
9274
Martin v. Löwis5b222132007-06-10 09:51:05 +00009275 PyThreadState_GET()->recursion_critical = 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009276 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9277 PyErr_Clear();
Martin v. Löwis5b222132007-06-10 09:51:05 +00009278 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009279 return;
9280 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009281 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009282 /* The two references in interned are not counted by refcnt.
9283 The deallocator will take care of this */
Christian Heimes90aa7642007-12-19 02:45:37 +00009284 Py_REFCNT(s) -= 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009285 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
9286}
9287
9288void
9289PyUnicode_InternImmortal(PyObject **p)
9290{
9291 PyUnicode_InternInPlace(p);
9292 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9293 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9294 Py_INCREF(*p);
9295 }
9296}
9297
9298PyObject *
9299PyUnicode_InternFromString(const char *cp)
9300{
9301 PyObject *s = PyUnicode_FromString(cp);
9302 if (s == NULL)
9303 return NULL;
9304 PyUnicode_InternInPlace(&s);
9305 return s;
9306}
9307
9308void _Py_ReleaseInternedUnicodeStrings(void)
9309{
9310 PyObject *keys;
9311 PyUnicodeObject *s;
9312 Py_ssize_t i, n;
9313 Py_ssize_t immortal_size = 0, mortal_size = 0;
9314
9315 if (interned == NULL || !PyDict_Check(interned))
9316 return;
9317 keys = PyDict_Keys(interned);
9318 if (keys == NULL || !PyList_Check(keys)) {
9319 PyErr_Clear();
9320 return;
9321 }
9322
9323 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9324 detector, interned unicode strings are not forcibly deallocated;
9325 rather, we give them their stolen references back, and then clear
9326 and DECREF the interned dict. */
9327
9328 n = PyList_GET_SIZE(keys);
9329 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
9330 n);
9331 for (i = 0; i < n; i++) {
9332 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9333 switch (s->state) {
9334 case SSTATE_NOT_INTERNED:
9335 /* XXX Shouldn't happen */
9336 break;
9337 case SSTATE_INTERNED_IMMORTAL:
Christian Heimes90aa7642007-12-19 02:45:37 +00009338 Py_REFCNT(s) += 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009339 immortal_size += s->length;
9340 break;
9341 case SSTATE_INTERNED_MORTAL:
Christian Heimes90aa7642007-12-19 02:45:37 +00009342 Py_REFCNT(s) += 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009343 mortal_size += s->length;
9344 break;
9345 default:
9346 Py_FatalError("Inconsistent interned string state.");
9347 }
9348 s->state = SSTATE_NOT_INTERNED;
9349 }
9350 fprintf(stderr, "total size of all interned strings: "
9351 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9352 "mortal/immortal\n", mortal_size, immortal_size);
9353 Py_DECREF(keys);
9354 PyDict_Clear(interned);
9355 Py_DECREF(interned);
9356 interned = NULL;
9357}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009358
9359
9360/********************* Unicode Iterator **************************/
9361
9362typedef struct {
9363 PyObject_HEAD
Guido van Rossum49d6b072006-08-17 21:11:47 +00009364 Py_ssize_t it_index;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009365 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
9366} unicodeiterobject;
9367
9368static void
9369unicodeiter_dealloc(unicodeiterobject *it)
9370{
9371 _PyObject_GC_UNTRACK(it);
9372 Py_XDECREF(it->it_seq);
9373 PyObject_GC_Del(it);
9374}
9375
9376static int
9377unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9378{
9379 Py_VISIT(it->it_seq);
9380 return 0;
9381}
9382
9383static PyObject *
9384unicodeiter_next(unicodeiterobject *it)
9385{
9386 PyUnicodeObject *seq;
9387 PyObject *item;
9388
9389 assert(it != NULL);
9390 seq = it->it_seq;
9391 if (seq == NULL)
9392 return NULL;
9393 assert(PyUnicode_Check(seq));
9394
9395 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009396 item = PyUnicode_FromUnicode(
9397 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009398 if (item != NULL)
9399 ++it->it_index;
9400 return item;
9401 }
9402
9403 Py_DECREF(seq);
9404 it->it_seq = NULL;
9405 return NULL;
9406}
9407
9408static PyObject *
9409unicodeiter_len(unicodeiterobject *it)
9410{
9411 Py_ssize_t len = 0;
9412 if (it->it_seq)
9413 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
Christian Heimes217cfd12007-12-02 14:31:20 +00009414 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009415}
9416
9417PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9418
9419static PyMethodDef unicodeiter_methods[] = {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009420 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
9421 length_hint_doc},
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009422 {NULL, NULL} /* sentinel */
9423};
9424
9425PyTypeObject PyUnicodeIter_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009426 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Christian Heimesf83be4e2007-11-28 09:44:38 +00009427 "str_iterator", /* tp_name */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009428 sizeof(unicodeiterobject), /* tp_basicsize */
9429 0, /* tp_itemsize */
9430 /* methods */
Guido van Rossum49d6b072006-08-17 21:11:47 +00009431 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009432 0, /* tp_print */
9433 0, /* tp_getattr */
9434 0, /* tp_setattr */
9435 0, /* tp_compare */
9436 0, /* tp_repr */
9437 0, /* tp_as_number */
9438 0, /* tp_as_sequence */
9439 0, /* tp_as_mapping */
9440 0, /* tp_hash */
9441 0, /* tp_call */
9442 0, /* tp_str */
9443 PyObject_GenericGetAttr, /* tp_getattro */
9444 0, /* tp_setattro */
9445 0, /* tp_as_buffer */
9446 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9447 0, /* tp_doc */
9448 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9449 0, /* tp_clear */
9450 0, /* tp_richcompare */
9451 0, /* tp_weaklistoffset */
9452 PyObject_SelfIter, /* tp_iter */
9453 (iternextfunc)unicodeiter_next, /* tp_iternext */
9454 unicodeiter_methods, /* tp_methods */
9455 0,
9456};
9457
9458static PyObject *
9459unicode_iter(PyObject *seq)
9460{
9461 unicodeiterobject *it;
9462
9463 if (!PyUnicode_Check(seq)) {
9464 PyErr_BadInternalCall();
9465 return NULL;
9466 }
9467 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9468 if (it == NULL)
9469 return NULL;
9470 it->it_index = 0;
9471 Py_INCREF(seq);
9472 it->it_seq = (PyUnicodeObject *)seq;
9473 _PyObject_GC_TRACK(it);
9474 return (PyObject *)it;
9475}
9476
Martin v. Löwis5b222132007-06-10 09:51:05 +00009477size_t
9478Py_UNICODE_strlen(const Py_UNICODE *u)
9479{
9480 int res = 0;
9481 while(*u++)
9482 res++;
9483 return res;
9484}
9485
9486Py_UNICODE*
9487Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9488{
9489 Py_UNICODE *u = s1;
9490 while ((*u++ = *s2++));
9491 return s1;
9492}
9493
9494Py_UNICODE*
9495Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9496{
9497 Py_UNICODE *u = s1;
9498 while ((*u++ = *s2++))
9499 if (n-- == 0)
9500 break;
9501 return s1;
9502}
9503
9504int
9505Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9506{
9507 while (*s1 && *s2 && *s1 == *s2)
9508 s1++, s2++;
9509 if (*s1 && *s2)
9510 return (*s1 < *s2) ? -1 : +1;
9511 if (*s1)
9512 return 1;
9513 if (*s2)
9514 return -1;
9515 return 0;
9516}
9517
9518Py_UNICODE*
9519Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9520{
9521 const Py_UNICODE *p;
9522 for (p = s; *p; p++)
9523 if (*p == c)
9524 return (Py_UNICODE*)p;
9525 return NULL;
9526}
9527
9528
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009529#ifdef __cplusplus
9530}
9531#endif
9532
9533
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009534/*
9535Local variables:
9536c-basic-offset: 4
9537indent-tabs-mode: nil
9538End:
9539*/