blob: fc3ef76efdf511ab1abd76fa3c3751add0529fbc [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Unicode Integration Proposal (see file Misc/unicode.txt).

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Eric Smith8c663262007-08-25 02:26:07 +000049#include "formatter_unicode.h"
50
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000051#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000052#include <windows.h>
53#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000054
/* Upper bound on the number of Unicode objects parked on the free list. */

#define PyUnicode_MAXFREELIST       1024

/* Keep-alive threshold for objects on the free list.

   A Unicode object that lands on the free list keeps its character
   buffer allocated as long as its length is below this limit; longer
   buffers are released when the object is freed.  This saves malloc()
   calls for small Unicode objects.

   Worst case, this leaves PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage around.

   A limit of 0 effectively switches the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Endianness switches; defaults to little endian */

#ifndef WORDS_BIGENDIAN
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
85
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000086/* --- Globals ------------------------------------------------------------
87
88 The globals are initialized by the _PyUnicode_Init() API and should
89 not be used before calling that API.
90
91*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000092
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000093
94#ifdef __cplusplus
95extern "C" {
96#endif
97
Walter Dörwald16807132007-05-25 13:52:07 +000098/* This dictionary holds all interned unicode strings. Note that references
99 to strings in this dictionary are *not* counted in the string's ob_refcnt.
100 When the interned string reaches a refcnt of 0 the string deallocation
101 function will delete the reference from this dictionary.
102
103 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000104 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000105*/
106static PyObject *interned;
107
Guido van Rossumd57fd912000-03-10 22:53:23 +0000108/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000109static PyUnicodeObject *free_list;
110static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000111
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000112/* The empty Unicode object is shared to improve performance. */
113static PyUnicodeObject *unicode_empty;
114
115/* Single character Unicode strings in the Latin-1 range are being
116 shared as well. */
117static PyUnicodeObject *unicode_latin1[256];
118
Fred Drakee4315f52000-05-09 19:53:39 +0000119/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000120 parameter; it is fixed to "utf-8". Always use the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000121 PyUnicode_GetDefaultEncoding() API to access this global.
122
Alexandre Vassalotti3d2fd7f2007-10-16 00:26:33 +0000123 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000124 hard coded default!
125*/
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000126static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000127
/* Fast detection of the most frequent whitespace characters.
   Lookup table indexed by ASCII code point (0..127): an entry is 1 when
   the character is treated as whitespace, 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x00 - 0x07 */
    /* 0x09 HORIZONTAL TABULATION, 0x0A LINE FEED,
       0x0B VERTICAL TABULATION, 0x0C FORM FEED,
       0x0D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,     /* 0x08 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x10 - 0x17 */
    /* 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,     /* 0x18 - 0x1F */
    /* 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,     /* 0x20 - 0x27 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x28 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x30 - 0x37 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x38 - 0x3F */

    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x40 - 0x47 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x48 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x50 - 0x57 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x58 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x60 - 0x67 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x68 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x70 - 0x77 */
    0, 0, 0, 0, 0, 0, 0, 0      /* 0x78 - 0x7F */
};
158
/* Same for linebreaks: lookup table indexed by ASCII code point
   (0..127); an entry is 1 when the character is a line break. */
static unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x00 - 0x07 */
    /* 0x0A LINE FEED, 0x0D CARRIAGE RETURN */
    0, 0, 1, 0, 0, 1, 0, 0,     /* 0x08 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x10 - 0x17 */
    /* 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,     /* 0x18 - 0x1F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x20 - 0x27 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x28 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x30 - 0x37 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x38 - 0x3F */

    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x40 - 0x47 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x48 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x50 - 0x57 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x58 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x60 - 0x67 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x68 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x70 - 0x77 */
    0, 0, 0, 0, 0, 0, 0, 0      /* 0x78 - 0x7F */
};
184
185
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000186Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000187PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000188{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000189#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000190 return 0x10FFFF;
191#else
192 /* This is actually an illegal character, so it should
193 not be passed to unichr. */
194 return 0xFFFF;
195#endif
196}
197
/* --- Bloom Filters ----------------------------------------------------- */

/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the least 5
   bits from each unicode characters as the bit index. */

/* the linebreak mask is set up by Unicode_Init below */

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero when the bit selected by the low 5 bits of `ch` is set in
   `mask`.  The shifted constant is unsigned long so that selecting
   bit 31 does not left-shift into the sign bit of an int (undefined
   behaviour), and `mask` is parenthesized so that an expression can
   be passed safely. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Line break test: characters below 128 are answered from the
   ascii_linebreak table; for the rest the bloom mask is consulted
   first and Py_UNICODE_ISLINEBREAK() only runs when the mask bit is
   set.  Note that `ch` is evaluated more than once. */
#define BLOOM_LINEBREAK(ch) \
    ((ch) < 128U ? ascii_linebreak[(ch)] : \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000215
216Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
217{
218 /* calculate simple bloom-style bitmask for a given unicode string */
219
220 long mask;
221 Py_ssize_t i;
222
223 mask = 0;
224 for (i = 0; i < len; i++)
225 mask |= (1 << (ptr[i] & 0x1F));
226
227 return mask;
228}
229
230Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
231{
232 Py_ssize_t i;
233
234 for (i = 0; i < setlen; i++)
235 if (set[i] == chr)
236 return 1;
237
238 return 0;
239}
240
/* True when chr is a member of set: the cheap bloom mask test runs
   first and unicode_member() is only called when the mask bit is set.
   The whole expansion is parenthesized so the macro can be negated or
   combined with other operators; chr is evaluated more than once. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    ((BLOOM((mask), (chr))) && unicode_member((chr), (set), (setlen)))
243
Guido van Rossumd57fd912000-03-10 22:53:23 +0000244/* --- Unicode Object ----------------------------------------------------- */
245
246static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000247int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000248 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000249{
250 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000251
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000252 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000254 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000255
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000256 /* Resizing shared object (unicode_empty or single character
257 objects) in-place is not allowed. Use PyUnicode_Resize()
258 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000259
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000260 if (unicode == unicode_empty ||
261 (unicode->length == 1 &&
262 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000263 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000265 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 return -1;
267 }
268
Thomas Wouters477c8d52006-05-27 19:21:47 +0000269 /* We allocate one more byte to make sure the string is Ux0000 terminated.
270 The overallocation is also used by fastsearch, which assumes that it's
271 safe to look at str[length] (without making any assumptions about what
272 it contains). */
273
Guido van Rossumd57fd912000-03-10 22:53:23 +0000274 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000275 unicode->str = PyObject_REALLOC(unicode->str,
276 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000278 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000279 PyErr_NoMemory();
280 return -1;
281 }
282 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000283 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000285 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000287 if (unicode->defenc) {
288 Py_DECREF(unicode->defenc);
289 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290 }
291 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000292
Guido van Rossumd57fd912000-03-10 22:53:23 +0000293 return 0;
294}
295
296/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000297 Ux0000 terminated; some code (e.g. new_identifier)
298 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299
300 XXX This allocator could further be enhanced by assuring that the
301 free list never reduces its size below 1.
302
303*/
304
305static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000306PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000307{
308 register PyUnicodeObject *unicode;
309
Thomas Wouters477c8d52006-05-27 19:21:47 +0000310 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000311 if (length == 0 && unicode_empty != NULL) {
312 Py_INCREF(unicode_empty);
313 return unicode_empty;
314 }
315
316 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000317 if (free_list) {
318 unicode = free_list;
319 free_list = *(PyUnicodeObject **)unicode;
320 numfree--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000321 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000322 /* Keep-Alive optimization: we only upsize the buffer,
323 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000324 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000325 unicode_resize(unicode, length) < 0) {
Christian Heimesb186d002008-03-18 15:15:01 +0000326 PyObject_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000327 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000328 }
329 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000330 else {
Christian Heimesb186d002008-03-18 15:15:01 +0000331 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
332 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000333 }
334 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000335 }
336 else {
Christian Heimesb186d002008-03-18 15:15:01 +0000337 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000338 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000339 if (unicode == NULL)
340 return NULL;
Christian Heimesb186d002008-03-18 15:15:01 +0000341 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
342 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343 }
344
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000345 if (!unicode->str) {
346 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000347 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000348 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000349 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000350 * the caller fails before initializing str -- unicode_resize()
351 * reads str[0], and the Keep-Alive optimization can keep memory
352 * allocated for str alive across a call to unicode_dealloc(unicode).
353 * We don't want unicode_resize to read uninitialized memory in
354 * that case.
355 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000356 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000357 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000358 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000359 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000360 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000361 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000362 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000363
364 onError:
365 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000366 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000367 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000368}
369
370static
Guido van Rossum9475a232001-10-05 20:51:39 +0000371void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000372{
Walter Dörwald16807132007-05-25 13:52:07 +0000373 switch (PyUnicode_CHECK_INTERNED(unicode)) {
374 case SSTATE_NOT_INTERNED:
375 break;
376
377 case SSTATE_INTERNED_MORTAL:
378 /* revive dead object temporarily for DelItem */
Christian Heimes90aa7642007-12-19 02:45:37 +0000379 Py_REFCNT(unicode) = 3;
Walter Dörwald16807132007-05-25 13:52:07 +0000380 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
381 Py_FatalError(
382 "deletion of interned unicode string failed");
383 break;
384
385 case SSTATE_INTERNED_IMMORTAL:
386 Py_FatalError("Immortal interned unicode string died.");
387
388 default:
389 Py_FatalError("Inconsistent interned unicode string state.");
390 }
391
Guido van Rossum604ddf82001-12-06 20:03:56 +0000392 if (PyUnicode_CheckExact(unicode) &&
Christian Heimes2202f872008-02-06 14:31:34 +0000393 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000394 /* Keep-Alive optimization */
395 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Christian Heimesb186d002008-03-18 15:15:01 +0000396 PyObject_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000397 unicode->str = NULL;
398 unicode->length = 0;
399 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000400 if (unicode->defenc) {
401 Py_DECREF(unicode->defenc);
402 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000403 }
404 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000405 *(PyUnicodeObject **)unicode = free_list;
406 free_list = unicode;
407 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000408 }
409 else {
Christian Heimesb186d002008-03-18 15:15:01 +0000410 PyObject_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000411 Py_XDECREF(unicode->defenc);
Christian Heimes90aa7642007-12-19 02:45:37 +0000412 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000413 }
414}
415
Martin v. Löwis18e16552006-02-15 17:27:45 +0000416int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000417{
418 register PyUnicodeObject *v;
419
420 /* Argument checks */
421 if (unicode == NULL) {
422 PyErr_BadInternalCall();
423 return -1;
424 }
425 v = (PyUnicodeObject *)*unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000426 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000427 PyErr_BadInternalCall();
428 return -1;
429 }
430
431 /* Resizing unicode_empty and single character objects is not
432 possible since these are being shared. We simply return a fresh
433 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000434 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000435 (v == unicode_empty || v->length == 1)) {
436 PyUnicodeObject *w = _PyUnicode_New(length);
437 if (w == NULL)
438 return -1;
439 Py_UNICODE_COPY(w->str, v->str,
440 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000441 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000442 *unicode = (PyObject *)w;
443 return 0;
444 }
445
446 /* Note that we don't have to modify *unicode for unshared Unicode
447 objects, since we can modify them in-place. */
448 return unicode_resize(v, length);
449}
450
/* Internal API for use in unicodeobject.c only !
   Convenience wrapper around PyUnicode_Resize() that casts the address
   of the caller's object variable to PyObject **. */
#define _PyUnicode_Resize(unicodevar, length) \
    PyUnicode_Resize(((PyObject **)(unicodevar)), (length))
454
Guido van Rossumd57fd912000-03-10 22:53:23 +0000455PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000456 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000457{
458 PyUnicodeObject *unicode;
459
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000460 /* If the Unicode data is known at construction time, we can apply
461 some optimizations which share commonly used objects. */
462 if (u != NULL) {
463
464 /* Optimization for empty strings */
465 if (size == 0 && unicode_empty != NULL) {
466 Py_INCREF(unicode_empty);
467 return (PyObject *)unicode_empty;
468 }
469
470 /* Single character Unicode objects in the Latin-1 range are
471 shared when using this constructor */
472 if (size == 1 && *u < 256) {
473 unicode = unicode_latin1[*u];
474 if (!unicode) {
475 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000476 if (!unicode)
477 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000478 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000479 unicode_latin1[*u] = unicode;
480 }
481 Py_INCREF(unicode);
482 return (PyObject *)unicode;
483 }
484 }
Tim Petersced69f82003-09-16 20:30:58 +0000485
Guido van Rossumd57fd912000-03-10 22:53:23 +0000486 unicode = _PyUnicode_New(size);
487 if (!unicode)
488 return NULL;
489
490 /* Copy the Unicode data into the new object */
491 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000492 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000493
494 return (PyObject *)unicode;
495}
496
Walter Dörwaldd2034312007-05-18 16:29:38 +0000497PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000498{
499 PyUnicodeObject *unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000500 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000501 some optimizations which share commonly used objects.
502 Also, this means the input must be UTF-8, so fall back to the
503 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000504 if (u != NULL) {
505
506 /* Optimization for empty strings */
507 if (size == 0 && unicode_empty != NULL) {
508 Py_INCREF(unicode_empty);
509 return (PyObject *)unicode_empty;
510 }
511
Martin v. Löwis9c121062007-08-05 20:26:11 +0000512 /* Single characters are shared when using this constructor.
513 Restrict to ASCII, since the input must be UTF-8. */
514 if (size == 1 && Py_CHARMASK(*u) < 128) {
Guido van Rossum00058aa2007-07-19 18:21:28 +0000515 unicode = unicode_latin1[Py_CHARMASK(*u)];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000516 if (!unicode) {
517 unicode = _PyUnicode_New(1);
518 if (!unicode)
519 return NULL;
Guido van Rossum00058aa2007-07-19 18:21:28 +0000520 unicode->str[0] = Py_CHARMASK(*u);
521 unicode_latin1[Py_CHARMASK(*u)] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000522 }
523 Py_INCREF(unicode);
524 return (PyObject *)unicode;
525 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000526
527 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000528 }
529
Walter Dörwald55507312007-05-18 13:12:10 +0000530 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000531 if (!unicode)
532 return NULL;
533
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000534 return (PyObject *)unicode;
535}
536
Walter Dörwaldd2034312007-05-18 16:29:38 +0000537PyObject *PyUnicode_FromString(const char *u)
538{
539 size_t size = strlen(u);
540 if (size > PY_SSIZE_T_MAX) {
541 PyErr_SetString(PyExc_OverflowError, "input too long");
542 return NULL;
543 }
544
545 return PyUnicode_FromStringAndSize(u, size);
546}
547
Guido van Rossumd57fd912000-03-10 22:53:23 +0000548#ifdef HAVE_WCHAR_H
549
550PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000551 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000552{
553 PyUnicodeObject *unicode;
554
555 if (w == NULL) {
556 PyErr_BadInternalCall();
557 return NULL;
558 }
559
560 unicode = _PyUnicode_New(size);
561 if (!unicode)
562 return NULL;
563
564 /* Copy the wchar_t data into the new object */
565#ifdef HAVE_USABLE_WCHAR_T
566 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000567#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000568 {
569 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000570 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000571 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000572 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000573 *u++ = *w++;
574 }
575#endif
576
577 return (PyObject *)unicode;
578}
579
Walter Dörwald346737f2007-05-31 10:44:43 +0000580static void
581makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
582{
583 *fmt++ = '%';
584 if (width) {
585 if (zeropad)
586 *fmt++ = '0';
587 fmt += sprintf(fmt, "%d", width);
588 }
589 if (precision)
590 fmt += sprintf(fmt, ".%d", precision);
591 if (longflag)
592 *fmt++ = 'l';
593 else if (size_tflag) {
594 char *f = PY_FORMAT_SIZE_T;
595 while (*f)
596 *fmt++ = *f++;
597 }
598 *fmt++ = c;
599 *fmt = '\0';
600}
601
Walter Dörwaldd2034312007-05-18 16:29:38 +0000602#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
603
604PyObject *
605PyUnicode_FromFormatV(const char *format, va_list vargs)
606{
607 va_list count;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000608 Py_ssize_t callcount = 0;
609 PyObject **callresults = NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +0000610 PyObject **callresult = NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000611 Py_ssize_t n = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000612 int width = 0;
613 int precision = 0;
614 int zeropad;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000615 const char* f;
616 Py_UNICODE *s;
617 PyObject *string;
618 /* used by sprintf */
619 char buffer[21];
Walter Dörwald346737f2007-05-31 10:44:43 +0000620 /* use abuffer instead of buffer, if we need more space
621 * (which can happen if there's a format specifier with width). */
622 char *abuffer = NULL;
623 char *realbuffer;
624 Py_ssize_t abuffersize = 0;
625 char fmt[60]; /* should be enough for %0width.precisionld */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000626 const char *copy;
627
628#ifdef VA_LIST_IS_ARRAY
629 Py_MEMCPY(count, vargs, sizeof(va_list));
630#else
631#ifdef __va_copy
632 __va_copy(count, vargs);
633#else
634 count = vargs;
635#endif
636#endif
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000637 /* step 1: count the number of %S/%R format specifications
Thomas Heller519a0422007-11-15 20:48:54 +0000638 * (we call PyObject_Str()/PyObject_Repr() for these objects
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000639 * once during step 3 and put the result in an array) */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000640 for (f = format; *f; f++) {
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000641 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000642 ++callcount;
643 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000644 /* step 2: allocate memory for the results of
Thomas Heller519a0422007-11-15 20:48:54 +0000645 * PyObject_Str()/PyObject_Repr() calls */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000646 if (callcount) {
Christian Heimesb186d002008-03-18 15:15:01 +0000647 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000648 if (!callresults) {
649 PyErr_NoMemory();
650 return NULL;
651 }
652 callresult = callresults;
653 }
654 /* step 3: figure out how large a buffer we need */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000655 for (f = format; *f; f++) {
656 if (*f == '%') {
657 const char* p = f;
Walter Dörwald346737f2007-05-31 10:44:43 +0000658 width = 0;
Guido van Rossumdaa251c2007-10-25 23:47:33 +0000659 while (ISDIGIT(*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000660 width = (width*10) + *f++ - '0';
Guido van Rossumdaa251c2007-10-25 23:47:33 +0000661 while (*++f && *f != '%' && !ISALPHA(*f))
Walter Dörwaldd2034312007-05-18 16:29:38 +0000662 ;
663
664 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
665 * they don't affect the amount of space we reserve.
666 */
667 if ((*f == 'l' || *f == 'z') &&
668 (f[1] == 'd' || f[1] == 'u'))
Eric Smithddd25822007-08-27 11:33:42 +0000669 ++f;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000670
671 switch (*f) {
672 case 'c':
673 (void)va_arg(count, int);
674 /* fall through... */
675 case '%':
676 n++;
677 break;
678 case 'd': case 'u': case 'i': case 'x':
679 (void) va_arg(count, int);
680 /* 20 bytes is enough to hold a 64-bit
681 integer. Decimal takes the most space.
Walter Dörwald346737f2007-05-31 10:44:43 +0000682 This isn't enough for octal.
683 If a width is specified we need more
684 (which we allocate later). */
685 if (width < 20)
686 width = 20;
687 n += width;
688 if (abuffersize < width)
689 abuffersize = width;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000690 break;
691 case 's':
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000692 {
693 /* UTF-8 */
694 unsigned char*s;
695 s = va_arg(count, unsigned char*);
696 while (*s) {
697 if (*s < 128) {
698 n++; s++;
699 } else if (*s < 0xc0) {
700 /* invalid UTF-8 */
701 n++; s++;
702 } else if (*s < 0xc0) {
703 n++;
704 s++; if(!*s)break;
705 s++;
706 } else if (*s < 0xe0) {
707 n++;
708 s++; if(!*s)break;
709 s++; if(!*s)break;
710 s++;
711 } else {
712 #ifdef Py_UNICODE_WIDE
713 n++;
714 #else
715 n+=2;
716 #endif
717 s++; if(!*s)break;
718 s++; if(!*s)break;
719 s++; if(!*s)break;
720 s++;
721 }
722 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000723 break;
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000724 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000725 case 'U':
726 {
727 PyObject *obj = va_arg(count, PyObject *);
728 assert(obj && PyUnicode_Check(obj));
729 n += PyUnicode_GET_SIZE(obj);
730 break;
731 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000732 case 'V':
733 {
734 PyObject *obj = va_arg(count, PyObject *);
735 const char *str = va_arg(count, const char *);
736 assert(obj || str);
737 assert(!obj || PyUnicode_Check(obj));
738 if (obj)
739 n += PyUnicode_GET_SIZE(obj);
740 else
741 n += strlen(str);
742 break;
743 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000744 case 'S':
745 {
746 PyObject *obj = va_arg(count, PyObject *);
747 PyObject *str;
748 assert(obj);
Thomas Heller519a0422007-11-15 20:48:54 +0000749 str = PyObject_Str(obj);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000750 if (!str)
751 goto fail;
752 n += PyUnicode_GET_SIZE(str);
753 /* Remember the str and switch to the next slot */
754 *callresult++ = str;
755 break;
756 }
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000757 case 'R':
758 {
759 PyObject *obj = va_arg(count, PyObject *);
760 PyObject *repr;
761 assert(obj);
762 repr = PyObject_Repr(obj);
763 if (!repr)
764 goto fail;
765 n += PyUnicode_GET_SIZE(repr);
766 /* Remember the repr and switch to the next slot */
767 *callresult++ = repr;
768 break;
769 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000770 case 'p':
771 (void) va_arg(count, int);
772 /* maximum 64-bit pointer representation:
773 * 0xffffffffffffffff
774 * so 19 characters is enough.
775 * XXX I count 18 -- what's the extra for?
776 */
777 n += 19;
778 break;
779 default:
780 /* if we stumble upon an unknown
781 formatting code, copy the rest of
782 the format string to the output
783 string. (we cannot just skip the
784 code, since there's no way to know
785 what's in the argument list) */
786 n += strlen(p);
787 goto expand;
788 }
789 } else
790 n++;
791 }
792 expand:
Walter Dörwald346737f2007-05-31 10:44:43 +0000793 if (abuffersize > 20) {
Christian Heimesb186d002008-03-18 15:15:01 +0000794 abuffer = PyObject_Malloc(abuffersize);
Walter Dörwald346737f2007-05-31 10:44:43 +0000795 if (!abuffer) {
796 PyErr_NoMemory();
797 goto fail;
798 }
799 realbuffer = abuffer;
800 }
801 else
802 realbuffer = buffer;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000803 /* step 4: fill the buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000804 /* Since we've analyzed how much space we need for the worst case,
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000805 we don't have to resize the string.
806 There can be no errors beyond this point. */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000807 string = PyUnicode_FromUnicode(NULL, n);
808 if (!string)
Walter Dörwald346737f2007-05-31 10:44:43 +0000809 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000810
811 s = PyUnicode_AS_UNICODE(string);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000812 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000813
814 for (f = format; *f; f++) {
815 if (*f == '%') {
816 const char* p = f++;
817 int longflag = 0;
818 int size_tflag = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000819 zeropad = (*f == '0');
820 /* parse the width.precision part */
821 width = 0;
Guido van Rossumdaa251c2007-10-25 23:47:33 +0000822 while (ISDIGIT(*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000823 width = (width*10) + *f++ - '0';
824 precision = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000825 if (*f == '.') {
826 f++;
Guido van Rossumdaa251c2007-10-25 23:47:33 +0000827 while (ISDIGIT(*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000828 precision = (precision*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000829 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000830 /* handle the long flag, but only for %ld and %lu.
831 others can be added when necessary. */
832 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
833 longflag = 1;
834 ++f;
835 }
836 /* handle the size_t flag. */
837 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
838 size_tflag = 1;
839 ++f;
840 }
841
842 switch (*f) {
843 case 'c':
844 *s++ = va_arg(vargs, int);
845 break;
846 case 'd':
Walter Dörwald346737f2007-05-31 10:44:43 +0000847 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000848 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000849 sprintf(realbuffer, fmt, va_arg(vargs, long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000850 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000851 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000852 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000853 sprintf(realbuffer, fmt, va_arg(vargs, int));
854 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000855 break;
856 case 'u':
Walter Dörwald346737f2007-05-31 10:44:43 +0000857 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000858 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000859 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000860 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000861 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000862 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000863 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
864 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000865 break;
866 case 'i':
Walter Dörwald346737f2007-05-31 10:44:43 +0000867 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
868 sprintf(realbuffer, fmt, va_arg(vargs, int));
869 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000870 break;
871 case 'x':
Walter Dörwald346737f2007-05-31 10:44:43 +0000872 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
873 sprintf(realbuffer, fmt, va_arg(vargs, int));
874 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000875 break;
876 case 's':
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000877 {
878 /* Parameter must be UTF-8 encoded.
879 In case of encoding errors, use
880 the replacement character. */
881 PyObject *u;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000882 p = va_arg(vargs, char*);
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000883 u = PyUnicode_DecodeUTF8(p, strlen(p),
884 "replace");
885 if (!u)
886 goto fail;
887 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
888 PyUnicode_GET_SIZE(u));
889 s += PyUnicode_GET_SIZE(u);
890 Py_DECREF(u);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000891 break;
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000892 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000893 case 'U':
894 {
895 PyObject *obj = va_arg(vargs, PyObject *);
Walter Dörwald5c2fab62007-05-24 19:51:02 +0000896 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
897 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
898 s += size;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000899 break;
900 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000901 case 'V':
902 {
903 PyObject *obj = va_arg(vargs, PyObject *);
904 const char *str = va_arg(vargs, const char *);
905 if (obj) {
906 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
907 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
908 s += size;
909 } else {
910 appendstring(str);
911 }
912 break;
913 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000914 case 'S':
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000915 case 'R':
916 {
Guido van Rossum755114a2007-06-13 01:04:27 +0000917 Py_UNICODE *ucopy;
918 Py_ssize_t usize;
919 Py_ssize_t upos;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000920 /* unused, since we already have the result */
921 (void) va_arg(vargs, PyObject *);
Guido van Rossum755114a2007-06-13 01:04:27 +0000922 ucopy = PyUnicode_AS_UNICODE(*callresult);
923 usize = PyUnicode_GET_SIZE(*callresult);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000924 for (upos = 0; upos<usize;)
925 *s++ = ucopy[upos++];
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000926 /* We're done with the unicode()/repr() => forget it */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000927 Py_DECREF(*callresult);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000928 /* switch to next unicode()/repr() result */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000929 ++callresult;
930 break;
931 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000932 case 'p':
933 sprintf(buffer, "%p", va_arg(vargs, void*));
934 /* %p is ill-defined: ensure leading 0x. */
935 if (buffer[1] == 'X')
936 buffer[1] = 'x';
937 else if (buffer[1] != 'x') {
938 memmove(buffer+2, buffer, strlen(buffer)+1);
939 buffer[0] = '0';
940 buffer[1] = 'x';
941 }
942 appendstring(buffer);
943 break;
944 case '%':
945 *s++ = '%';
946 break;
947 default:
948 appendstring(p);
949 goto end;
950 }
951 } else
952 *s++ = *f;
953 }
954
955 end:
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000956 if (callresults)
Christian Heimesb186d002008-03-18 15:15:01 +0000957 PyObject_Free(callresults);
Walter Dörwald346737f2007-05-31 10:44:43 +0000958 if (abuffer)
Christian Heimesb186d002008-03-18 15:15:01 +0000959 PyObject_Free(abuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000960 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
961 return string;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000962 fail:
963 if (callresults) {
964 PyObject **callresult2 = callresults;
Guido van Rossum307fa8c2007-07-16 20:46:27 +0000965 while (callresult2 < callresult) {
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000966 Py_DECREF(*callresult2);
967 ++callresult2;
968 }
Christian Heimesb186d002008-03-18 15:15:01 +0000969 PyObject_Free(callresults);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000970 }
Walter Dörwald346737f2007-05-31 10:44:43 +0000971 if (abuffer)
Christian Heimesb186d002008-03-18 15:15:01 +0000972 PyObject_Free(abuffer);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000973 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000974}
975
976#undef appendstring
977
978PyObject *
979PyUnicode_FromFormat(const char *format, ...)
980{
981 PyObject* ret;
982 va_list vargs;
983
984#ifdef HAVE_STDARG_PROTOTYPES
985 va_start(vargs, format);
986#else
987 va_start(vargs);
988#endif
989 ret = PyUnicode_FromFormatV(format, vargs);
990 va_end(vargs);
991 return ret;
992}
993
Martin v. Löwis18e16552006-02-15 17:27:45 +0000994Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
995 wchar_t *w,
996 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000997{
998 if (unicode == NULL) {
999 PyErr_BadInternalCall();
1000 return -1;
1001 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001002
1003 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001004 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001005 size = PyUnicode_GET_SIZE(unicode) + 1;
1006
Guido van Rossumd57fd912000-03-10 22:53:23 +00001007#ifdef HAVE_USABLE_WCHAR_T
1008 memcpy(w, unicode->str, size * sizeof(wchar_t));
1009#else
1010 {
1011 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001012 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001013 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +00001014 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001015 *w++ = *u++;
1016 }
1017#endif
1018
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001019 if (size > PyUnicode_GET_SIZE(unicode))
1020 return PyUnicode_GET_SIZE(unicode);
1021 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00001022 return size;
1023}
1024
1025#endif
1026
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001027PyObject *PyUnicode_FromOrdinal(int ordinal)
1028{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001029 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001030
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001031 if (ordinal < 0 || ordinal > 0x10ffff) {
1032 PyErr_SetString(PyExc_ValueError,
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001033 "chr() arg not in range(0x110000)");
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001034 return NULL;
1035 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001036
1037#ifndef Py_UNICODE_WIDE
1038 if (ordinal > 0xffff) {
1039 ordinal -= 0x10000;
1040 s[0] = 0xD800 | (ordinal >> 10);
1041 s[1] = 0xDC00 | (ordinal & 0x3FF);
1042 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001043 }
1044#endif
1045
Hye-Shik Chang40574832004-04-06 07:24:51 +00001046 s[0] = (Py_UNICODE)ordinal;
1047 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001048}
1049
Guido van Rossumd57fd912000-03-10 22:53:23 +00001050PyObject *PyUnicode_FromObject(register PyObject *obj)
1051{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001052 /* XXX Perhaps we should make this API an alias of
Thomas Heller519a0422007-11-15 20:48:54 +00001053 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001054 if (PyUnicode_CheckExact(obj)) {
1055 Py_INCREF(obj);
1056 return obj;
1057 }
1058 if (PyUnicode_Check(obj)) {
1059 /* For a Unicode subtype that's not a Unicode object,
1060 return a true Unicode object with the same data. */
1061 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1062 PyUnicode_GET_SIZE(obj));
1063 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001064 PyErr_Format(PyExc_TypeError,
1065 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001066 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001067 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001068}
1069
1070PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1071 const char *encoding,
1072 const char *errors)
1073{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001074 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001075 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001076 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001077
Guido van Rossumd57fd912000-03-10 22:53:23 +00001078 if (obj == NULL) {
1079 PyErr_BadInternalCall();
1080 return NULL;
1081 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001082
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001083 if (PyUnicode_Check(obj)) {
1084 PyErr_SetString(PyExc_TypeError,
1085 "decoding Unicode is not supported");
1086 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001087 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001088
1089 /* Coerce object */
1090 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001091 s = PyString_AS_STRING(obj);
1092 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001093 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001094 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1095 /* Overwrite the error message with something more useful in
1096 case of a TypeError. */
1097 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001098 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001099 "coercing to Unicode: need string or buffer, "
1100 "%.80s found",
Christian Heimes90aa7642007-12-19 02:45:37 +00001101 Py_TYPE(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001102 goto onError;
1103 }
Tim Petersced69f82003-09-16 20:30:58 +00001104
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001105 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001106 if (len == 0) {
1107 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001108 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001109 }
Tim Petersced69f82003-09-16 20:30:58 +00001110 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001111 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001112
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001113 return v;
1114
1115 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001116 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001117}
1118
1119PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001120 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001121 const char *encoding,
1122 const char *errors)
1123{
1124 PyObject *buffer = NULL, *unicode;
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001125 Py_buffer info;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001126 char lower[20]; /* Enough for any encoding name we recognize */
1127 char *l;
1128 const char *e;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001129
1130 if (encoding == NULL)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001131 encoding = PyUnicode_GetDefaultEncoding();
1132
1133 /* Convert encoding to lower case and replace '_' with '-' in order to
1134 catch e.g. UTF_8 */
1135 e = encoding;
1136 l = lower;
1137 while (*e && l < &lower[(sizeof lower) - 2]) {
1138 if (ISUPPER(*e)) {
1139 *l++ = TOLOWER(*e++);
1140 }
1141 else if (*e == '_') {
1142 *l++ = '-';
1143 e++;
1144 }
1145 else {
1146 *l++ = *e++;
1147 }
1148 }
1149 *l = '\0';
Fred Drakee4315f52000-05-09 19:53:39 +00001150
1151 /* Shortcuts for common default encodings */
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001152 if (strcmp(lower, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001153 return PyUnicode_DecodeUTF8(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001154 else if ((strcmp(lower, "latin-1") == 0) ||
1155 (strcmp(lower, "iso-8859-1") == 0))
Fred Drakee4315f52000-05-09 19:53:39 +00001156 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001157#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001158 else if (strcmp(lower, "mbcs") == 0)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001159 return PyUnicode_DecodeMBCS(s, size, errors);
1160#endif
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001161 else if (strcmp(lower, "ascii") == 0)
Fred Drakee4315f52000-05-09 19:53:39 +00001162 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001163 else if (strcmp(lower, "utf-16") == 0)
1164 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1165 else if (strcmp(lower, "utf-32") == 0)
1166 return PyUnicode_DecodeUTF32(s, size, errors, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001167
1168 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001169 buffer = NULL;
1170 if (PyBuffer_FillInfo(&info, (void *)s, size, 1, PyBUF_SIMPLE) < 0)
1171 goto onError;
1172 buffer = PyMemoryView_FromMemory(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001173 if (buffer == NULL)
1174 goto onError;
1175 unicode = PyCodec_Decode(buffer, encoding, errors);
1176 if (unicode == NULL)
1177 goto onError;
1178 if (!PyUnicode_Check(unicode)) {
1179 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001180 "decoder did not return an unicode object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001181 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001182 Py_DECREF(unicode);
1183 goto onError;
1184 }
1185 Py_DECREF(buffer);
1186 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001187
Guido van Rossumd57fd912000-03-10 22:53:23 +00001188 onError:
1189 Py_XDECREF(buffer);
1190 return NULL;
1191}
1192
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001193PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1194 const char *encoding,
1195 const char *errors)
1196{
1197 PyObject *v;
1198
1199 if (!PyUnicode_Check(unicode)) {
1200 PyErr_BadArgument();
1201 goto onError;
1202 }
1203
1204 if (encoding == NULL)
1205 encoding = PyUnicode_GetDefaultEncoding();
1206
1207 /* Decode via the codec registry */
1208 v = PyCodec_Decode(unicode, encoding, errors);
1209 if (v == NULL)
1210 goto onError;
1211 return v;
1212
1213 onError:
1214 return NULL;
1215}
1216
Guido van Rossumd57fd912000-03-10 22:53:23 +00001217PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001218 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001219 const char *encoding,
1220 const char *errors)
1221{
1222 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001223
Guido van Rossumd57fd912000-03-10 22:53:23 +00001224 unicode = PyUnicode_FromUnicode(s, size);
1225 if (unicode == NULL)
1226 return NULL;
1227 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1228 Py_DECREF(unicode);
1229 return v;
1230}
1231
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001232PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1233 const char *encoding,
1234 const char *errors)
1235{
1236 PyObject *v;
1237
1238 if (!PyUnicode_Check(unicode)) {
1239 PyErr_BadArgument();
1240 goto onError;
1241 }
1242
1243 if (encoding == NULL)
1244 encoding = PyUnicode_GetDefaultEncoding();
1245
1246 /* Encode via the codec registry */
1247 v = PyCodec_Encode(unicode, encoding, errors);
1248 if (v == NULL)
1249 goto onError;
1250 return v;
1251
1252 onError:
1253 return NULL;
1254}
1255
Guido van Rossumd57fd912000-03-10 22:53:23 +00001256PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1257 const char *encoding,
1258 const char *errors)
1259{
1260 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001261
Guido van Rossumd57fd912000-03-10 22:53:23 +00001262 if (!PyUnicode_Check(unicode)) {
1263 PyErr_BadArgument();
1264 goto onError;
1265 }
Fred Drakee4315f52000-05-09 19:53:39 +00001266
Tim Petersced69f82003-09-16 20:30:58 +00001267 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001268 encoding = PyUnicode_GetDefaultEncoding();
1269
1270 /* Shortcuts for common default encodings */
1271 if (errors == NULL) {
1272 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001273 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001274 else if (strcmp(encoding, "latin-1") == 0)
1275 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001276#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1277 else if (strcmp(encoding, "mbcs") == 0)
1278 return PyUnicode_AsMBCSString(unicode);
1279#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001280 else if (strcmp(encoding, "ascii") == 0)
1281 return PyUnicode_AsASCIIString(unicode);
1282 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001283
1284 /* Encode via the codec registry */
1285 v = PyCodec_Encode(unicode, encoding, errors);
1286 if (v == NULL)
1287 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00001288 assert(PyString_Check(v));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001289 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001290
Guido van Rossumd57fd912000-03-10 22:53:23 +00001291 onError:
1292 return NULL;
1293}
1294
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001295PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1296 const char *errors)
1297{
1298 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001299 if (v)
1300 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001301 if (errors != NULL)
1302 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001303 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001304 PyUnicode_GET_SIZE(unicode),
1305 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001306 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001307 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001308 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001309 return v;
1310}
1311
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001312PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001313PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001314 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001315 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1316}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001317
Christian Heimes5894ba72007-11-04 11:43:14 +00001318PyObject*
1319PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1320{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001321 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1322 can be undefined. If it is case, decode using UTF-8. The following assumes
1323 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1324 bootstrapping process where the codecs aren't ready yet.
1325 */
1326 if (Py_FileSystemDefaultEncoding) {
1327#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001328 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001329 return PyUnicode_DecodeMBCS(s, size, "replace");
1330 }
1331#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001332 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001333 return PyUnicode_DecodeUTF8(s, size, "replace");
1334 }
1335#endif
1336 return PyUnicode_Decode(s, size,
1337 Py_FileSystemDefaultEncoding,
1338 "replace");
1339 }
1340 else {
1341 return PyUnicode_DecodeUTF8(s, size, "replace");
1342 }
1343}
1344
Martin v. Löwis5b222132007-06-10 09:51:05 +00001345char*
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001346PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001347{
Christian Heimesf3863112007-11-22 07:46:41 +00001348 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001349 if (!PyUnicode_Check(unicode)) {
1350 PyErr_BadArgument();
1351 return NULL;
1352 }
Christian Heimesf3863112007-11-22 07:46:41 +00001353 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1354 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001355 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001356 if (psize != NULL)
Christian Heimesf3863112007-11-22 07:46:41 +00001357 *psize = PyString_GET_SIZE(bytes);
1358 return PyString_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001359}
1360
1361char*
1362PyUnicode_AsString(PyObject *unicode)
1363{
1364 return PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001365}
1366
Guido van Rossumd57fd912000-03-10 22:53:23 +00001367Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1368{
1369 if (!PyUnicode_Check(unicode)) {
1370 PyErr_BadArgument();
1371 goto onError;
1372 }
1373 return PyUnicode_AS_UNICODE(unicode);
1374
1375 onError:
1376 return NULL;
1377}
1378
Martin v. Löwis18e16552006-02-15 17:27:45 +00001379Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001380{
1381 if (!PyUnicode_Check(unicode)) {
1382 PyErr_BadArgument();
1383 goto onError;
1384 }
1385 return PyUnicode_GET_SIZE(unicode);
1386
1387 onError:
1388 return -1;
1389}
1390
Thomas Wouters78890102000-07-22 19:25:51 +00001391const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001392{
1393 return unicode_default_encoding;
1394}
1395
1396int PyUnicode_SetDefaultEncoding(const char *encoding)
1397{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001398 if (strcmp(encoding, unicode_default_encoding) != 0) {
1399 PyErr_Format(PyExc_ValueError,
1400 "Can only set default encoding to %s",
1401 unicode_default_encoding);
1402 return -1;
1403 }
Fred Drakee4315f52000-05-09 19:53:39 +00001404 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001405}
1406
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001407/* error handling callback helper:
1408 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001409 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001410 and adjust various state variables.
1411 return 0 on success, -1 on error
1412*/
1413
1414static
1415int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1416 const char *encoding, const char *reason,
Walter Dörwalda651d3d2007-08-30 15:29:21 +00001417 const char **input, const char **inend, Py_ssize_t *startinpos,
1418 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001419 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001420{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001421 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001422
1423 PyObject *restuple = NULL;
1424 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001425 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001426 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001427 Py_ssize_t requiredsize;
1428 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001429 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001430 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001431 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001432 int res = -1;
1433
1434 if (*errorHandler == NULL) {
1435 *errorHandler = PyCodec_LookupError(errors);
1436 if (*errorHandler == NULL)
1437 goto onError;
1438 }
1439
1440 if (*exceptionObject == NULL) {
1441 *exceptionObject = PyUnicodeDecodeError_Create(
Walter Dörwalde78178e2007-07-30 13:31:40 +00001442 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001443 if (*exceptionObject == NULL)
1444 goto onError;
1445 }
1446 else {
1447 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1448 goto onError;
1449 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1450 goto onError;
1451 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1452 goto onError;
1453 }
1454
1455 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1456 if (restuple == NULL)
1457 goto onError;
1458 if (!PyTuple_Check(restuple)) {
1459 PyErr_Format(PyExc_TypeError, &argparse[4]);
1460 goto onError;
1461 }
1462 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1463 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001464
1465 /* Copy back the bytes variables, which might have been modified by the
1466 callback */
1467 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1468 if (!inputobj)
1469 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00001470 if (!PyString_Check(inputobj)) {
Walter Dörwalde78178e2007-07-30 13:31:40 +00001471 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
1472 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001473 *input = PyString_AS_STRING(inputobj);
1474 insize = PyString_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001475 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001476 /* we can DECREF safely, as the exception has another reference,
1477 so the object won't go away. */
1478 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001479
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001480 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001481 newpos = insize+newpos;
1482 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001483 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001484 goto onError;
1485 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001486
1487 /* need more space? (at least enough for what we
1488 have+the replacement+the rest of the string (starting
1489 at the new input position), so we won't have to check space
1490 when there are no errors in the rest of the string) */
1491 repptr = PyUnicode_AS_UNICODE(repunicode);
1492 repsize = PyUnicode_GET_SIZE(repunicode);
1493 requiredsize = *outpos + repsize + insize-newpos;
1494 if (requiredsize > outsize) {
1495 if (requiredsize<2*outsize)
1496 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001497 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001498 goto onError;
1499 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1500 }
1501 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001502 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001503 Py_UNICODE_COPY(*outptr, repptr, repsize);
1504 *outptr += repsize;
1505 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001506
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001507 /* we made it! */
1508 res = 0;
1509
1510 onError:
1511 Py_XDECREF(restuple);
1512 return res;
1513}
1514
/* --- UTF-7 Codec -------------------------------------------------------- */

/* see RFC2152 for details */
1518
/* Per-character classification table used by the UTF-7 codec, indexed by
   ASCII code point (0..127).  Consulted only through the SPECIAL() macro. */
static
char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
           0 - not special
           1 - special
           2 - whitespace (optional)
           3 - RFC2152 Set O (optional) */
    /* 0x00 - 0x0F (TAB, LF and CR are whitespace) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2F (space and punctuation) */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3F (digits and punctuation) */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4F */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6F */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};
1537
/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

/* SPECIAL(c, encodeO, encodeWS): true when character c cannot be written
   directly in UTF-7.  Anything outside 1..127 is special; class 1 of
   utf7_special is always special, class 2 (whitespace) only when encodeWS
   is set and class 3 (Set O) only when encodeO is set.
   Note that c is evaluated several times. */
#define SPECIAL(c, encodeO, encodeWS)                       \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 ||     \
     ((encodeWS) && (utf7_special[(c)] == 2)) ||            \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* B64(n): base64 alphabet character for the low 6 bits of n. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true when c belongs to the base64 alphabet. */
#define B64CHAR(c) \
    (ISALNUM(c) || (c) == '+' || (c) == '/')

/* UB64(c): 6-bit value of base64 character c.  The result is only
   meaningful when B64CHAR(c) holds. */
#define UB64(c)                                             \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?       \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE(out, ch, bits): while at least 6 bits are pending in the
   accumulator ch, emit the topmost 6 pending bits as one base64 character
   through out and decrement bits by 6.  out and bits must be lvalues. */
#define ENCODE(out, ch, bits)                               \
    while ((bits) >= 6) {                                   \
        *(out)++ = B64((ch) >> ((bits)-6));                 \
        (bits) -= 6;                                        \
    }

/* DECODE(out, ch, bits, surrogate): while at least 16 bits are pending in
   the accumulator ch, extract one 16-bit code unit.

   - If surrogate is set, an error was already generated for the previous
     unit, so this unit is dropped without checking it and the flag is
     cleared.
   - A unit in 0xDC00..0xDFFF cannot be represented here: surrogate is set,
     errmsg is assigned and control jumps to the utf7Error label.
   - Any other unit is stored through out.

   The expansion relies on a variable `errmsg` and a label `utf7Error`
   being available at the point of use. */
#define DECODE(out, ch, bits, surrogate)                                    \
    while ((bits) >= 16) {                                                  \
        Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits)-16)) & 0xffff);   \
        (bits) -= 16;                                                       \
        if (surrogate) {                                                    \
            (surrogate) = 0;                                                \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                    \
            (surrogate) = 1;                                                \
            errmsg = "code pairs are not supported";                        \
            goto utf7Error;                                                 \
        } else {                                                            \
            *(out)++ = outCh;                                               \
        }                                                                   \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001580
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001581PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001582 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001583 const char *errors)
1584{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001585 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1586}
1587
1588PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
1589 Py_ssize_t size,
1590 const char *errors,
1591 Py_ssize_t *consumed)
1592{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001593 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001594 Py_ssize_t startinpos;
1595 Py_ssize_t endinpos;
1596 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001597 const char *e;
1598 PyUnicodeObject *unicode;
1599 Py_UNICODE *p;
1600 const char *errmsg = "";
1601 int inShift = 0;
1602 unsigned int bitsleft = 0;
1603 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001604 int surrogate = 0;
1605 PyObject *errorHandler = NULL;
1606 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001607
1608 unicode = _PyUnicode_New(size);
1609 if (!unicode)
1610 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001611 if (size == 0) {
1612 if (consumed)
1613 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001614 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001615 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001616
1617 p = unicode->str;
1618 e = s + size;
1619
1620 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001621 Py_UNICODE ch;
1622 restart:
1623 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001624
1625 if (inShift) {
1626 if ((ch == '-') || !B64CHAR(ch)) {
1627 inShift = 0;
1628 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001629
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001630 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1631 if (bitsleft >= 6) {
1632 /* The shift sequence has a partial character in it. If
1633 bitsleft < 6 then we could just classify it as padding
1634 but that is not the case here */
1635
1636 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001637 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001638 }
1639 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001640 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001641 here so indicate the potential of a misencoded character. */
1642
1643 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1644 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1645 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001646 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001647 }
1648
1649 if (ch == '-') {
1650 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001651 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001652 inShift = 1;
1653 }
1654 } else if (SPECIAL(ch,0,0)) {
1655 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001656 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001657 } else {
1658 *p++ = ch;
1659 }
1660 } else {
1661 charsleft = (charsleft << 6) | UB64(ch);
1662 bitsleft += 6;
1663 s++;
1664 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1665 }
1666 }
1667 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001668 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001669 s++;
1670 if (s < e && *s == '-') {
1671 s++;
1672 *p++ = '+';
1673 } else
1674 {
1675 inShift = 1;
1676 bitsleft = 0;
1677 }
1678 }
1679 else if (SPECIAL(ch,0,0)) {
Walter Dörwald2b65c752007-08-30 15:35:26 +00001680 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001681 errmsg = "unexpected special character";
1682 s++;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001683 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001684 }
1685 else {
1686 *p++ = ch;
1687 s++;
1688 }
1689 continue;
1690 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001691 outpos = p-PyUnicode_AS_UNICODE(unicode);
1692 endinpos = s-starts;
1693 if (unicode_decode_call_errorhandler(
1694 errors, &errorHandler,
1695 "utf7", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001696 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001697 (PyObject **)&unicode, &outpos, &p))
1698 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001699 }
1700
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001701 if (inShift && !consumed) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001702 outpos = p-PyUnicode_AS_UNICODE(unicode);
1703 endinpos = size;
1704 if (unicode_decode_call_errorhandler(
1705 errors, &errorHandler,
1706 "utf7", "unterminated shift sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00001707 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001708 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001709 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001710 if (s < e)
1711 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001712 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001713 if (consumed) {
1714 if(inShift)
1715 *consumed = startinpos;
1716 else
1717 *consumed = s-starts;
1718 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001719
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001720 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001721 goto onError;
1722
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001723 Py_XDECREF(errorHandler);
1724 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001725 return (PyObject *)unicode;
1726
1727onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001728 Py_XDECREF(errorHandler);
1729 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001730 Py_DECREF(unicode);
1731 return NULL;
1732}
1733
1734
1735PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001736 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001737 int encodeSetO,
1738 int encodeWhiteSpace,
1739 const char *errors)
1740{
Guido van Rossum98297ee2007-11-06 21:34:58 +00001741 PyObject *v, *result;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001742 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001743 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001744 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001745 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001746 unsigned int bitsleft = 0;
1747 unsigned long charsleft = 0;
1748 char * out;
1749 char * start;
1750
1751 if (size == 0)
Christian Heimesf3863112007-11-22 07:46:41 +00001752 return PyString_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001753
Walter Dörwald51ab4142007-05-05 14:43:36 +00001754 v = PyBytes_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001755 if (v == NULL)
1756 return NULL;
1757
Walter Dörwald51ab4142007-05-05 14:43:36 +00001758 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001759 for (;i < size; ++i) {
1760 Py_UNICODE ch = s[i];
1761
1762 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001763 if (ch == '+') {
1764 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001765 *out++ = '-';
1766 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1767 charsleft = ch;
1768 bitsleft = 16;
1769 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001770 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001771 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001772 } else {
1773 *out++ = (char) ch;
1774 }
1775 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001776 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1777 *out++ = B64(charsleft << (6-bitsleft));
1778 charsleft = 0;
1779 bitsleft = 0;
1780 /* Characters not in the BASE64 set implicitly unshift the sequence
1781 so no '-' is required, except if the character is itself a '-' */
1782 if (B64CHAR(ch) || ch == '-') {
1783 *out++ = '-';
1784 }
1785 inShift = 0;
1786 *out++ = (char) ch;
1787 } else {
1788 bitsleft += 16;
1789 charsleft = (charsleft << 16) | ch;
1790 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1791
1792 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001793 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001794 or '-' then the shift sequence will be terminated implicitly and we
1795 don't have to insert a '-'. */
1796
1797 if (bitsleft == 0) {
1798 if (i + 1 < size) {
1799 Py_UNICODE ch2 = s[i+1];
1800
1801 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001802
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001803 } else if (B64CHAR(ch2) || ch2 == '-') {
1804 *out++ = '-';
1805 inShift = 0;
1806 } else {
1807 inShift = 0;
1808 }
1809
1810 }
1811 else {
1812 *out++ = '-';
1813 inShift = 0;
1814 }
1815 }
Tim Petersced69f82003-09-16 20:30:58 +00001816 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001817 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001818 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001819 if (bitsleft) {
1820 *out++= B64(charsleft << (6-bitsleft) );
1821 *out++ = '-';
1822 }
1823
Guido van Rossum98297ee2007-11-06 21:34:58 +00001824 result = PyString_FromStringAndSize(PyBytes_AS_STRING(v), out - start);
1825 Py_DECREF(v);
1826 return result;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001827}
1828
/* The UTF-7 helper macros are private to the codec above. */
#undef SPECIAL
#undef B64
#undef B64CHAR
#undef UB64
#undef ENCODE
#undef DECODE
1835
/* --- UTF-8 Codec -------------------------------------------------------- */
1837
static
char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
       illegal prefix.  see RFC 2279 for details */
    /* 0x00 - 0x7F: single byte */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: continuation bytes, illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFF: four, five and six byte prefixes; 0xFE/0xFF illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1859
Guido van Rossumd57fd912000-03-10 22:53:23 +00001860PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001861 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001862 const char *errors)
1863{
Walter Dörwald69652032004-09-07 20:24:22 +00001864 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1865}
1866
1867PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001868 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001869 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001870 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001871{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001872 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001873 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001874 Py_ssize_t startinpos;
1875 Py_ssize_t endinpos;
1876 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001877 const char *e;
1878 PyUnicodeObject *unicode;
1879 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001880 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001881 PyObject *errorHandler = NULL;
1882 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001883
1884 /* Note: size will always be longer than the resulting Unicode
1885 character count */
1886 unicode = _PyUnicode_New(size);
1887 if (!unicode)
1888 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001889 if (size == 0) {
1890 if (consumed)
1891 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001892 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001893 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001894
1895 /* Unpack UTF-8 encoded data */
1896 p = unicode->str;
1897 e = s + size;
1898
1899 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001900 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001901
1902 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001903 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001904 s++;
1905 continue;
1906 }
1907
1908 n = utf8_code_length[ch];
1909
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001910 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001911 if (consumed)
1912 break;
1913 else {
1914 errmsg = "unexpected end of data";
1915 startinpos = s-starts;
1916 endinpos = size;
1917 goto utf8Error;
1918 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001919 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001920
1921 switch (n) {
1922
1923 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001924 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001925 startinpos = s-starts;
1926 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001927 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001928
1929 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001930 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001931 startinpos = s-starts;
1932 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001933 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001934
1935 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001936 if ((s[1] & 0xc0) != 0x80) {
1937 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001938 startinpos = s-starts;
1939 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001940 goto utf8Error;
1941 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001942 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001943 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001944 startinpos = s-starts;
1945 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001946 errmsg = "illegal encoding";
1947 goto utf8Error;
1948 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001949 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001950 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001951 break;
1952
1953 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001954 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001955 (s[2] & 0xc0) != 0x80) {
1956 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001957 startinpos = s-starts;
1958 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001959 goto utf8Error;
1960 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001961 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001962 if (ch < 0x0800) {
1963 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001964 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001965
1966 XXX For wide builds (UCS-4) we should probably try
1967 to recombine the surrogates into a single code
1968 unit.
1969 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001970 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001971 startinpos = s-starts;
1972 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001973 goto utf8Error;
1974 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001975 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001976 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001977 break;
1978
1979 case 4:
1980 if ((s[1] & 0xc0) != 0x80 ||
1981 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001982 (s[3] & 0xc0) != 0x80) {
1983 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001984 startinpos = s-starts;
1985 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001986 goto utf8Error;
1987 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001988 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1989 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1990 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001991 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001992 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001993 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001994 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001995 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001996 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001997 startinpos = s-starts;
1998 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001999 goto utf8Error;
2000 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00002001#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002002 *p++ = (Py_UNICODE)ch;
2003#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002004 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002005
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002006 /* translate from 10000..10FFFF to 0..FFFF */
2007 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002008
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002009 /* high surrogate = top 10 bits added to D800 */
2010 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002011
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002012 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002013 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002014#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002015 break;
2016
2017 default:
2018 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002019 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002020 startinpos = s-starts;
2021 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002022 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002023 }
2024 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002025 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002026
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002027 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002028 outpos = p-PyUnicode_AS_UNICODE(unicode);
2029 if (unicode_decode_call_errorhandler(
2030 errors, &errorHandler,
2031 "utf8", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002032 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002033 (PyObject **)&unicode, &outpos, &p))
2034 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002035 }
Walter Dörwald69652032004-09-07 20:24:22 +00002036 if (consumed)
2037 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002038
2039 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002040 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002041 goto onError;
2042
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002043 Py_XDECREF(errorHandler);
2044 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002045 return (PyObject *)unicode;
2046
2047onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002048 Py_XDECREF(errorHandler);
2049 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002050 Py_DECREF(unicode);
2051 return NULL;
2052}
2053
Tim Peters602f7402002-04-27 18:03:26 +00002054/* Allocation strategy: if the string is short, convert into a stack buffer
2055 and allocate exactly as much space needed at the end. Else allocate the
2056 maximum possible needed (4 result bytes per Unicode character), and return
2057 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002058*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002059PyObject *
2060PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002061 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00002062 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002063{
Tim Peters602f7402002-04-27 18:03:26 +00002064#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002065
Guido van Rossum98297ee2007-11-06 21:34:58 +00002066 Py_ssize_t i; /* index into s of next input byte */
2067 PyObject *result; /* result string object */
2068 char *p; /* next free byte in output buffer */
2069 Py_ssize_t nallocated; /* number of result bytes allocated */
2070 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002071 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002072
Tim Peters602f7402002-04-27 18:03:26 +00002073 assert(s != NULL);
2074 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002075
Tim Peters602f7402002-04-27 18:03:26 +00002076 if (size <= MAX_SHORT_UNICHARS) {
2077 /* Write into the stack buffer; nallocated can't overflow.
2078 * At the end, we'll allocate exactly as much heap space as it
2079 * turns out we need.
2080 */
2081 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002082 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002083 p = stackbuf;
2084 }
2085 else {
2086 /* Overallocate on the heap, and give the excess back at the end. */
2087 nallocated = size * 4;
2088 if (nallocated / 4 != size) /* overflow! */
2089 return PyErr_NoMemory();
Guido van Rossum98297ee2007-11-06 21:34:58 +00002090 result = PyString_FromStringAndSize(NULL, nallocated);
2091 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002092 return NULL;
Guido van Rossum98297ee2007-11-06 21:34:58 +00002093 p = PyString_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002094 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002095
Tim Peters602f7402002-04-27 18:03:26 +00002096 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002097 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002098
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002099 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002100 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002101 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002102
Guido van Rossumd57fd912000-03-10 22:53:23 +00002103 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002104 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002105 *p++ = (char)(0xc0 | (ch >> 6));
2106 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002107 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002108 else {
Tim Peters602f7402002-04-27 18:03:26 +00002109 /* Encode UCS2 Unicode ordinals */
2110 if (ch < 0x10000) {
2111 /* Special case: check for high surrogate */
2112 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2113 Py_UCS4 ch2 = s[i];
2114 /* Check for low surrogate and combine the two to
2115 form a UCS4 value */
2116 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002117 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002118 i++;
2119 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002120 }
Tim Peters602f7402002-04-27 18:03:26 +00002121 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002122 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002123 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002124 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2125 *p++ = (char)(0x80 | (ch & 0x3f));
2126 continue;
2127 }
2128encodeUCS4:
2129 /* Encode UCS4 Unicode ordinals */
2130 *p++ = (char)(0xf0 | (ch >> 18));
2131 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2132 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2133 *p++ = (char)(0x80 | (ch & 0x3f));
2134 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002135 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002136
Guido van Rossum98297ee2007-11-06 21:34:58 +00002137 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002138 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002139 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002140 assert(nneeded <= nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002141 result = PyString_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002142 }
2143 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002144 /* Cut back to size actually needed. */
Guido van Rossum98297ee2007-11-06 21:34:58 +00002145 nneeded = p - PyString_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002146 assert(nneeded <= nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002147 _PyString_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002148 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002149 return result;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002150
Tim Peters602f7402002-04-27 18:03:26 +00002151#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002152}
2153
Guido van Rossumd57fd912000-03-10 22:53:23 +00002154PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2155{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002156 if (!PyUnicode_Check(unicode)) {
2157 PyErr_BadArgument();
2158 return NULL;
2159 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002160 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2161 PyUnicode_GET_SIZE(unicode),
2162 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002163}
2164
/* --- UTF-32 Codec ------------------------------------------------------- */
2166
2167PyObject *
2168PyUnicode_DecodeUTF32(const char *s,
2169 Py_ssize_t size,
2170 const char *errors,
2171 int *byteorder)
2172{
2173 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2174}
2175
2176PyObject *
2177PyUnicode_DecodeUTF32Stateful(const char *s,
2178 Py_ssize_t size,
2179 const char *errors,
2180 int *byteorder,
2181 Py_ssize_t *consumed)
2182{
2183 const char *starts = s;
2184 Py_ssize_t startinpos;
2185 Py_ssize_t endinpos;
2186 Py_ssize_t outpos;
2187 PyUnicodeObject *unicode;
2188 Py_UNICODE *p;
2189#ifndef Py_UNICODE_WIDE
2190 int i, pairs;
2191#else
2192 const int pairs = 0;
2193#endif
2194 const unsigned char *q, *e;
2195 int bo = 0; /* assume native ordering by default */
2196 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002197 /* Offsets from q for retrieving bytes in the right order. */
2198#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2199 int iorder[] = {0, 1, 2, 3};
2200#else
2201 int iorder[] = {3, 2, 1, 0};
2202#endif
2203 PyObject *errorHandler = NULL;
2204 PyObject *exc = NULL;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002205 /* On narrow builds we split characters outside the BMP into two
2206 codepoints => count how much extra space we need. */
2207#ifndef Py_UNICODE_WIDE
2208 for (i = pairs = 0; i < size/4; i++)
2209 if (((Py_UCS4 *)s)[i] >= 0x10000)
2210 pairs++;
2211#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00002212
2213 /* This might be one to much, because of a BOM */
2214 unicode = _PyUnicode_New((size+3)/4+pairs);
2215 if (!unicode)
2216 return NULL;
2217 if (size == 0)
2218 return (PyObject *)unicode;
2219
2220 /* Unpack UTF-32 encoded data */
2221 p = unicode->str;
2222 q = (unsigned char *)s;
2223 e = q + size;
2224
2225 if (byteorder)
2226 bo = *byteorder;
2227
2228 /* Check for BOM marks (U+FEFF) in the input and adjust current
2229 byte order setting accordingly. In native mode, the leading BOM
2230 mark is skipped, in all other modes, it is copied to the output
2231 stream as-is (giving a ZWNBSP character). */
2232 if (bo == 0) {
2233 if (size >= 4) {
2234 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2235 (q[iorder[1]] << 8) | q[iorder[0]];
2236#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2237 if (bom == 0x0000FEFF) {
2238 q += 4;
2239 bo = -1;
2240 }
2241 else if (bom == 0xFFFE0000) {
2242 q += 4;
2243 bo = 1;
2244 }
2245#else
2246 if (bom == 0x0000FEFF) {
2247 q += 4;
2248 bo = 1;
2249 }
2250 else if (bom == 0xFFFE0000) {
2251 q += 4;
2252 bo = -1;
2253 }
2254#endif
2255 }
2256 }
2257
2258 if (bo == -1) {
2259 /* force LE */
2260 iorder[0] = 0;
2261 iorder[1] = 1;
2262 iorder[2] = 2;
2263 iorder[3] = 3;
2264 }
2265 else if (bo == 1) {
2266 /* force BE */
2267 iorder[0] = 3;
2268 iorder[1] = 2;
2269 iorder[2] = 1;
2270 iorder[3] = 0;
2271 }
2272
2273 while (q < e) {
2274 Py_UCS4 ch;
2275 /* remaining bytes at the end? (size should be divisible by 4) */
2276 if (e-q<4) {
2277 if (consumed)
2278 break;
2279 errmsg = "truncated data";
2280 startinpos = ((const char *)q)-starts;
2281 endinpos = ((const char *)e)-starts;
2282 goto utf32Error;
2283 /* The remaining input chars are ignored if the callback
2284 chooses to skip the input */
2285 }
2286 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2287 (q[iorder[1]] << 8) | q[iorder[0]];
2288
2289 if (ch >= 0x110000)
2290 {
2291 errmsg = "codepoint not in range(0x110000)";
2292 startinpos = ((const char *)q)-starts;
2293 endinpos = startinpos+4;
2294 goto utf32Error;
2295 }
2296#ifndef Py_UNICODE_WIDE
2297 if (ch >= 0x10000)
2298 {
2299 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2300 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2301 }
2302 else
2303#endif
2304 *p++ = ch;
2305 q += 4;
2306 continue;
2307 utf32Error:
2308 outpos = p-PyUnicode_AS_UNICODE(unicode);
2309 if (unicode_decode_call_errorhandler(
2310 errors, &errorHandler,
2311 "utf32", errmsg,
2312 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2313 (PyObject **)&unicode, &outpos, &p))
2314 goto onError;
2315 }
2316
2317 if (byteorder)
2318 *byteorder = bo;
2319
2320 if (consumed)
2321 *consumed = (const char *)q-starts;
2322
2323 /* Adjust length */
2324 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2325 goto onError;
2326
2327 Py_XDECREF(errorHandler);
2328 Py_XDECREF(exc);
2329 return (PyObject *)unicode;
2330
2331onError:
2332 Py_DECREF(unicode);
2333 Py_XDECREF(errorHandler);
2334 Py_XDECREF(exc);
2335 return NULL;
2336}
2337
2338PyObject *
2339PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2340 Py_ssize_t size,
2341 const char *errors,
2342 int byteorder)
2343{
Guido van Rossum98297ee2007-11-06 21:34:58 +00002344 PyObject *v, *result;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002345 unsigned char *p;
2346#ifndef Py_UNICODE_WIDE
2347 int i, pairs;
2348#else
2349 const int pairs = 0;
2350#endif
2351 /* Offsets from p for storing byte pairs in the right order. */
2352#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2353 int iorder[] = {0, 1, 2, 3};
2354#else
2355 int iorder[] = {3, 2, 1, 0};
2356#endif
2357
2358#define STORECHAR(CH) \
2359 do { \
2360 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2361 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2362 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2363 p[iorder[0]] = (CH) & 0xff; \
2364 p += 4; \
2365 } while(0)
2366
2367 /* In narrow builds we can output surrogate pairs as one codepoint,
2368 so we need less space. */
2369#ifndef Py_UNICODE_WIDE
2370 for (i = pairs = 0; i < size-1; i++)
2371 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2372 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2373 pairs++;
2374#endif
2375 v = PyBytes_FromStringAndSize(NULL,
2376 4 * (size - pairs + (byteorder == 0)));
2377 if (v == NULL)
2378 return NULL;
2379
2380 p = (unsigned char *)PyBytes_AS_STRING(v);
2381 if (byteorder == 0)
2382 STORECHAR(0xFEFF);
2383 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002384 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002385
2386 if (byteorder == -1) {
2387 /* force LE */
2388 iorder[0] = 0;
2389 iorder[1] = 1;
2390 iorder[2] = 2;
2391 iorder[3] = 3;
2392 }
2393 else if (byteorder == 1) {
2394 /* force BE */
2395 iorder[0] = 3;
2396 iorder[1] = 2;
2397 iorder[2] = 1;
2398 iorder[3] = 0;
2399 }
2400
2401 while (size-- > 0) {
2402 Py_UCS4 ch = *s++;
2403#ifndef Py_UNICODE_WIDE
2404 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2405 Py_UCS4 ch2 = *s;
2406 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2407 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2408 s++;
2409 size--;
2410 }
2411 }
2412#endif
2413 STORECHAR(ch);
2414 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002415
2416 done:
Christian Heimes90aa7642007-12-19 02:45:37 +00002417 result = PyString_FromStringAndSize(PyBytes_AS_STRING(v), Py_SIZE(v));
Guido van Rossum98297ee2007-11-06 21:34:58 +00002418 Py_DECREF(v);
2419 return result;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002420#undef STORECHAR
2421}
2422
2423PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2424{
2425 if (!PyUnicode_Check(unicode)) {
2426 PyErr_BadArgument();
2427 return NULL;
2428 }
2429 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2430 PyUnicode_GET_SIZE(unicode),
2431 NULL,
2432 0);
2433}
2434
Guido van Rossumd57fd912000-03-10 22:53:23 +00002435/* --- UTF-16 Codec ------------------------------------------------------- */
2436
Tim Peters772747b2001-08-09 22:21:55 +00002437PyObject *
2438PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002439 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002440 const char *errors,
2441 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002442{
Walter Dörwald69652032004-09-07 20:24:22 +00002443 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2444}
2445
2446PyObject *
2447PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002448 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002449 const char *errors,
2450 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002451 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002452{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002453 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002454 Py_ssize_t startinpos;
2455 Py_ssize_t endinpos;
2456 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002457 PyUnicodeObject *unicode;
2458 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002459 const unsigned char *q, *e;
2460 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002461 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002462 /* Offsets from q for retrieving byte pairs in the right order. */
2463#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2464 int ihi = 1, ilo = 0;
2465#else
2466 int ihi = 0, ilo = 1;
2467#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002468 PyObject *errorHandler = NULL;
2469 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002470
2471 /* Note: size will always be longer than the resulting Unicode
2472 character count */
2473 unicode = _PyUnicode_New(size);
2474 if (!unicode)
2475 return NULL;
2476 if (size == 0)
2477 return (PyObject *)unicode;
2478
2479 /* Unpack UTF-16 encoded data */
2480 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002481 q = (unsigned char *)s;
2482 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002483
2484 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002485 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002486
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002487 /* Check for BOM marks (U+FEFF) in the input and adjust current
2488 byte order setting accordingly. In native mode, the leading BOM
2489 mark is skipped, in all other modes, it is copied to the output
2490 stream as-is (giving a ZWNBSP character). */
2491 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002492 if (size >= 2) {
2493 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002494#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002495 if (bom == 0xFEFF) {
2496 q += 2;
2497 bo = -1;
2498 }
2499 else if (bom == 0xFFFE) {
2500 q += 2;
2501 bo = 1;
2502 }
Tim Petersced69f82003-09-16 20:30:58 +00002503#else
Walter Dörwald69652032004-09-07 20:24:22 +00002504 if (bom == 0xFEFF) {
2505 q += 2;
2506 bo = 1;
2507 }
2508 else if (bom == 0xFFFE) {
2509 q += 2;
2510 bo = -1;
2511 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002512#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002513 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002514 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002515
Tim Peters772747b2001-08-09 22:21:55 +00002516 if (bo == -1) {
2517 /* force LE */
2518 ihi = 1;
2519 ilo = 0;
2520 }
2521 else if (bo == 1) {
2522 /* force BE */
2523 ihi = 0;
2524 ilo = 1;
2525 }
2526
2527 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002528 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002529 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002530 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002531 if (consumed)
2532 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002533 errmsg = "truncated data";
2534 startinpos = ((const char *)q)-starts;
2535 endinpos = ((const char *)e)-starts;
2536 goto utf16Error;
2537 /* The remaining input chars are ignored if the callback
2538 chooses to skip the input */
2539 }
2540 ch = (q[ihi] << 8) | q[ilo];
2541
Tim Peters772747b2001-08-09 22:21:55 +00002542 q += 2;
2543
Guido van Rossumd57fd912000-03-10 22:53:23 +00002544 if (ch < 0xD800 || ch > 0xDFFF) {
2545 *p++ = ch;
2546 continue;
2547 }
2548
2549 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002550 if (q >= e) {
2551 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002552 startinpos = (((const char *)q)-2)-starts;
2553 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002554 goto utf16Error;
2555 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002556 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002557 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2558 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002559 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002560#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002561 *p++ = ch;
2562 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002563#else
2564 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002565#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002566 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002567 }
2568 else {
2569 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002570 startinpos = (((const char *)q)-4)-starts;
2571 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002572 goto utf16Error;
2573 }
2574
Guido van Rossumd57fd912000-03-10 22:53:23 +00002575 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002576 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002577 startinpos = (((const char *)q)-2)-starts;
2578 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002579 /* Fall through to report the error */
2580
2581 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002582 outpos = p-PyUnicode_AS_UNICODE(unicode);
2583 if (unicode_decode_call_errorhandler(
2584 errors, &errorHandler,
2585 "utf16", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002586 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002587 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002588 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002589 }
2590
2591 if (byteorder)
2592 *byteorder = bo;
2593
Walter Dörwald69652032004-09-07 20:24:22 +00002594 if (consumed)
2595 *consumed = (const char *)q-starts;
2596
Guido van Rossumd57fd912000-03-10 22:53:23 +00002597 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002598 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002599 goto onError;
2600
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002601 Py_XDECREF(errorHandler);
2602 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002603 return (PyObject *)unicode;
2604
2605onError:
2606 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002607 Py_XDECREF(errorHandler);
2608 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002609 return NULL;
2610}
2611
Tim Peters772747b2001-08-09 22:21:55 +00002612PyObject *
2613PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002614 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002615 const char *errors,
2616 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002617{
Guido van Rossum98297ee2007-11-06 21:34:58 +00002618 PyObject *v, *result;
Tim Peters772747b2001-08-09 22:21:55 +00002619 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002620#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00002621 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002622#else
2623 const int pairs = 0;
2624#endif
Tim Peters772747b2001-08-09 22:21:55 +00002625 /* Offsets from p for storing byte pairs in the right order. */
2626#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2627 int ihi = 1, ilo = 0;
2628#else
2629 int ihi = 0, ilo = 1;
2630#endif
2631
2632#define STORECHAR(CH) \
2633 do { \
2634 p[ihi] = ((CH) >> 8) & 0xff; \
2635 p[ilo] = (CH) & 0xff; \
2636 p += 2; \
2637 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002638
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002639#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002640 for (i = pairs = 0; i < size; i++)
2641 if (s[i] >= 0x10000)
2642 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002643#endif
Walter Dörwald3cc34522007-05-04 10:48:27 +00002644 v = PyBytes_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002645 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002646 if (v == NULL)
2647 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002648
Walter Dörwald3cc34522007-05-04 10:48:27 +00002649 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002650 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002651 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002652 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002653 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00002654
2655 if (byteorder == -1) {
2656 /* force LE */
2657 ihi = 1;
2658 ilo = 0;
2659 }
2660 else if (byteorder == 1) {
2661 /* force BE */
2662 ihi = 0;
2663 ilo = 1;
2664 }
2665
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002666 while (size-- > 0) {
2667 Py_UNICODE ch = *s++;
2668 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002669#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002670 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002671 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2672 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002673 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002674#endif
Tim Peters772747b2001-08-09 22:21:55 +00002675 STORECHAR(ch);
2676 if (ch2)
2677 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002678 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002679
2680 done:
Christian Heimes90aa7642007-12-19 02:45:37 +00002681 result = PyString_FromStringAndSize(PyBytes_AS_STRING(v), Py_SIZE(v));
Guido van Rossum98297ee2007-11-06 21:34:58 +00002682 Py_DECREF(v);
2683 return result;
Tim Peters772747b2001-08-09 22:21:55 +00002684#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002685}
2686
2687PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2688{
2689 if (!PyUnicode_Check(unicode)) {
2690 PyErr_BadArgument();
2691 return NULL;
2692 }
2693 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2694 PyUnicode_GET_SIZE(unicode),
2695 NULL,
2696 0);
2697}
2698
2699/* --- Unicode Escape Codec ----------------------------------------------- */
2700
Fredrik Lundh06d12682001-01-24 07:59:11 +00002701static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002702
Guido van Rossumd57fd912000-03-10 22:53:23 +00002703PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002704 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002705 const char *errors)
2706{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002707 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002708 Py_ssize_t startinpos;
2709 Py_ssize_t endinpos;
2710 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002711 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002712 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002713 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002714 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002715 char* message;
2716 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002717 PyObject *errorHandler = NULL;
2718 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002719
Guido van Rossumd57fd912000-03-10 22:53:23 +00002720 /* Escaped strings will always be longer than the resulting
2721 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002722 length after conversion to the true value.
2723 (but if the error callback returns a long replacement string
2724 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002725 v = _PyUnicode_New(size);
2726 if (v == NULL)
2727 goto onError;
2728 if (size == 0)
2729 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002730
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002731 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002732 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002733
Guido van Rossumd57fd912000-03-10 22:53:23 +00002734 while (s < end) {
2735 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002736 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002737 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002738
2739 /* Non-escape characters are interpreted as Unicode ordinals */
2740 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002741 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002742 continue;
2743 }
2744
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002745 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002746 /* \ - Escapes */
2747 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002748 c = *s++;
2749 if (s > end)
2750 c = '\0'; /* Invalid after \ */
2751 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002752
2753 /* \x escapes */
2754 case '\n': break;
2755 case '\\': *p++ = '\\'; break;
2756 case '\'': *p++ = '\''; break;
2757 case '\"': *p++ = '\"'; break;
2758 case 'b': *p++ = '\b'; break;
2759 case 'f': *p++ = '\014'; break; /* FF */
2760 case 't': *p++ = '\t'; break;
2761 case 'n': *p++ = '\n'; break;
2762 case 'r': *p++ = '\r'; break;
2763 case 'v': *p++ = '\013'; break; /* VT */
2764 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2765
2766 /* \OOO (octal) escapes */
2767 case '0': case '1': case '2': case '3':
2768 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002769 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002770 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002771 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002772 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002773 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002774 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002775 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002776 break;
2777
Fredrik Lundhccc74732001-02-18 22:13:49 +00002778 /* hex escapes */
2779 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002780 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002781 digits = 2;
2782 message = "truncated \\xXX escape";
2783 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002784
Fredrik Lundhccc74732001-02-18 22:13:49 +00002785 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002786 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002787 digits = 4;
2788 message = "truncated \\uXXXX escape";
2789 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002790
Fredrik Lundhccc74732001-02-18 22:13:49 +00002791 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002792 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002793 digits = 8;
2794 message = "truncated \\UXXXXXXXX escape";
2795 hexescape:
2796 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002797 outpos = p-PyUnicode_AS_UNICODE(v);
2798 if (s+digits>end) {
2799 endinpos = size;
2800 if (unicode_decode_call_errorhandler(
2801 errors, &errorHandler,
2802 "unicodeescape", "end of string in escape sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002803 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002804 (PyObject **)&v, &outpos, &p))
2805 goto onError;
2806 goto nextByte;
2807 }
2808 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002809 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00002810 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002811 endinpos = (s+i+1)-starts;
2812 if (unicode_decode_call_errorhandler(
2813 errors, &errorHandler,
2814 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002815 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002816 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002817 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002818 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002819 }
2820 chr = (chr<<4) & ~0xF;
2821 if (c >= '0' && c <= '9')
2822 chr += c - '0';
2823 else if (c >= 'a' && c <= 'f')
2824 chr += 10 + c - 'a';
2825 else
2826 chr += 10 + c - 'A';
2827 }
2828 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002829 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002830 /* _decoding_error will have already written into the
2831 target buffer. */
2832 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002833 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002834 /* when we get here, chr is a 32-bit unicode character */
2835 if (chr <= 0xffff)
2836 /* UCS-2 character */
2837 *p++ = (Py_UNICODE) chr;
2838 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002839 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002840 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002841#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002842 *p++ = chr;
2843#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002844 chr -= 0x10000L;
2845 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002846 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002847#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002848 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002849 endinpos = s-starts;
2850 outpos = p-PyUnicode_AS_UNICODE(v);
2851 if (unicode_decode_call_errorhandler(
2852 errors, &errorHandler,
2853 "unicodeescape", "illegal Unicode character",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002854 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002855 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002856 goto onError;
2857 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002858 break;
2859
2860 /* \N{name} */
2861 case 'N':
2862 message = "malformed \\N character escape";
2863 if (ucnhash_CAPI == NULL) {
2864 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002865 PyObject *m, *api;
Christian Heimes072c0f12008-01-03 23:01:04 +00002866 m = PyImport_ImportModuleNoBlock("unicodedata");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002867 if (m == NULL)
2868 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002869 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002870 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002871 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002872 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002873 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002874 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002875 if (ucnhash_CAPI == NULL)
2876 goto ucnhashError;
2877 }
2878 if (*s == '{') {
2879 const char *start = s+1;
2880 /* look for the closing brace */
2881 while (*s != '}' && s < end)
2882 s++;
2883 if (s > start && s < end && *s == '}') {
2884 /* found a name. look it up in the unicode database */
2885 message = "unknown Unicode character name";
2886 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002887 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002888 goto store;
2889 }
2890 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002891 endinpos = s-starts;
2892 outpos = p-PyUnicode_AS_UNICODE(v);
2893 if (unicode_decode_call_errorhandler(
2894 errors, &errorHandler,
2895 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002896 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002897 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002898 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002899 break;
2900
2901 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002902 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002903 message = "\\ at end of string";
2904 s--;
2905 endinpos = s-starts;
2906 outpos = p-PyUnicode_AS_UNICODE(v);
2907 if (unicode_decode_call_errorhandler(
2908 errors, &errorHandler,
2909 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002910 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002911 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002912 goto onError;
2913 }
2914 else {
2915 *p++ = '\\';
2916 *p++ = (unsigned char)s[-1];
2917 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002918 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002919 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002920 nextByte:
2921 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002922 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002923 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002924 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002925 Py_XDECREF(errorHandler);
2926 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002927 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002928
Fredrik Lundhccc74732001-02-18 22:13:49 +00002929ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002930 PyErr_SetString(
2931 PyExc_UnicodeError,
2932 "\\N escapes not supported (can't load unicodedata module)"
2933 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002934 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002935 Py_XDECREF(errorHandler);
2936 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002937 return NULL;
2938
Fredrik Lundhccc74732001-02-18 22:13:49 +00002939onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002940 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002941 Py_XDECREF(errorHandler);
2942 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002943 return NULL;
2944}
2945
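/* Return a Unicode-Escape string version of the Unicode object.

   This describes PyUnicode_EncodeUnicodeEscape() further below, not the
   findchar() helper that immediately follows.  An earlier revision of
   this comment mentioned a `quotes` flag (enclosing the result in u"" or
   u'' quotes); PyUnicode_EncodeUnicodeEscape(s, size) takes no such
   parameter.

*/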
2952
Thomas Wouters477c8d52006-05-27 19:21:47 +00002953Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2954 Py_ssize_t size,
2955 Py_UNICODE ch)
2956{
2957 /* like wcschr, but doesn't stop at NULL characters */
2958
2959 while (size-- > 0) {
2960 if (*s == ch)
2961 return s;
2962 s++;
2963 }
2964
2965 return NULL;
2966}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002967
Walter Dörwald79e913e2007-05-12 11:08:06 +00002968static const char *hexdigits = "0123456789abcdef";
2969
2970PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2971 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002972{
Guido van Rossum98297ee2007-11-06 21:34:58 +00002973 PyObject *repr, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002974 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002975
Thomas Wouters89f507f2006-12-13 04:49:30 +00002976 /* XXX(nnorwitz): rather than over-allocating, it would be
2977 better to choose a different scheme. Perhaps scan the
2978 first N-chars of the string and allocate based on that size.
2979 */
2980 /* Initial allocation is based on the longest-possible unichr
2981 escape.
2982
2983 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2984 unichr, so in this case it's the longest unichr escape. In
2985 narrow (UTF-16) builds this is five chars per source unichr
2986 since there are two unichrs in the surrogate pair, so in narrow
2987 (UTF-16) builds it's not the longest unichr escape.
2988
2989 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2990 so in the narrow (UTF-16) build case it's the longest unichr
2991 escape.
2992 */
2993
Walter Dörwald79e913e2007-05-12 11:08:06 +00002994 repr = PyBytes_FromStringAndSize(NULL,
Thomas Wouters89f507f2006-12-13 04:49:30 +00002995#ifdef Py_UNICODE_WIDE
2996 + 10*size
2997#else
2998 + 6*size
2999#endif
3000 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003001 if (repr == NULL)
3002 return NULL;
3003
Walter Dörwald79e913e2007-05-12 11:08:06 +00003004 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003005
Guido van Rossumd57fd912000-03-10 22:53:23 +00003006 while (size-- > 0) {
3007 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003008
Walter Dörwald79e913e2007-05-12 11:08:06 +00003009 /* Escape backslashes */
3010 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003011 *p++ = '\\';
3012 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003013 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003014 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003015
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003016#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003017 /* Map 21-bit characters to '\U00xxxxxx' */
3018 else if (ch >= 0x10000) {
3019 *p++ = '\\';
3020 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003021 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3022 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3023 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3024 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3025 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3026 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3027 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3028 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003029 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003030 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003031#else
3032 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003033 else if (ch >= 0xD800 && ch < 0xDC00) {
3034 Py_UNICODE ch2;
3035 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003036
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003037 ch2 = *s++;
3038 size--;
3039 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3040 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3041 *p++ = '\\';
3042 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003043 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3044 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3045 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3046 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3047 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3048 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3049 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3050 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003051 continue;
3052 }
3053 /* Fall through: isolated surrogates are copied as-is */
3054 s--;
3055 size++;
3056 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003057#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003058
Guido van Rossumd57fd912000-03-10 22:53:23 +00003059 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003060 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003061 *p++ = '\\';
3062 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003063 *p++ = hexdigits[(ch >> 12) & 0x000F];
3064 *p++ = hexdigits[(ch >> 8) & 0x000F];
3065 *p++ = hexdigits[(ch >> 4) & 0x000F];
3066 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003067 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003068
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003069 /* Map special whitespace to '\t', \n', '\r' */
3070 else if (ch == '\t') {
3071 *p++ = '\\';
3072 *p++ = 't';
3073 }
3074 else if (ch == '\n') {
3075 *p++ = '\\';
3076 *p++ = 'n';
3077 }
3078 else if (ch == '\r') {
3079 *p++ = '\\';
3080 *p++ = 'r';
3081 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003082
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003083 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003084 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003085 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003086 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003087 *p++ = hexdigits[(ch >> 4) & 0x000F];
3088 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003089 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003090
Guido van Rossumd57fd912000-03-10 22:53:23 +00003091 /* Copy everything else as-is */
3092 else
3093 *p++ = (char) ch;
3094 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003095
Guido van Rossum98297ee2007-11-06 21:34:58 +00003096 result = PyString_FromStringAndSize(PyBytes_AS_STRING(repr),
3097 p - PyBytes_AS_STRING(repr));
3098 Py_DECREF(repr);
3099 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003100}
3101
Guido van Rossumd57fd912000-03-10 22:53:23 +00003102PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3103{
Walter Dörwald79e913e2007-05-12 11:08:06 +00003104 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003105 if (!PyUnicode_Check(unicode)) {
3106 PyErr_BadArgument();
3107 return NULL;
3108 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003109 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3110 PyUnicode_GET_SIZE(unicode));
3111
3112 if (!s)
3113 return NULL;
3114 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
3115 PyBytes_GET_SIZE(s));
3116 Py_DECREF(s);
3117 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003118}
3119
3120/* --- Raw Unicode Escape Codec ------------------------------------------- */
3121
3122PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003123 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003124 const char *errors)
3125{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003126 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003127 Py_ssize_t startinpos;
3128 Py_ssize_t endinpos;
3129 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003130 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003131 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003132 const char *end;
3133 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003134 PyObject *errorHandler = NULL;
3135 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003136
Guido van Rossumd57fd912000-03-10 22:53:23 +00003137 /* Escaped strings will always be longer than the resulting
3138 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003139 length after conversion to the true value. (But decoding error
3140 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003141 v = _PyUnicode_New(size);
3142 if (v == NULL)
3143 goto onError;
3144 if (size == 0)
3145 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003146 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003147 end = s + size;
3148 while (s < end) {
3149 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00003150 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003151 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003152 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003153
3154 /* Non-escape characters are interpreted as Unicode ordinals */
3155 if (*s != '\\') {
3156 *p++ = (unsigned char)*s++;
3157 continue;
3158 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003159 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003160
3161 /* \u-escapes are only interpreted iff the number of leading
3162 backslashes if odd */
3163 bs = s;
3164 for (;s < end;) {
3165 if (*s != '\\')
3166 break;
3167 *p++ = (unsigned char)*s++;
3168 }
3169 if (((s - bs) & 1) == 0 ||
3170 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003171 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003172 continue;
3173 }
3174 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003175 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003176 s++;
3177
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003178 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003179 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003180 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003181 c = (unsigned char)*s;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003182 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003183 endinpos = s-starts;
3184 if (unicode_decode_call_errorhandler(
3185 errors, &errorHandler,
3186 "rawunicodeescape", "truncated \\uXXXX",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003187 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003188 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003189 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003190 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003191 }
3192 x = (x<<4) & ~0xF;
3193 if (c >= '0' && c <= '9')
3194 x += c - '0';
3195 else if (c >= 'a' && c <= 'f')
3196 x += 10 + c - 'a';
3197 else
3198 x += 10 + c - 'A';
3199 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003200#ifndef Py_UNICODE_WIDE
3201 if (x > 0x10000) {
3202 if (unicode_decode_call_errorhandler(
3203 errors, &errorHandler,
3204 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003205 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003206 (PyObject **)&v, &outpos, &p))
3207 goto onError;
3208 }
3209#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003210 *p++ = x;
3211 nextByte:
3212 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003213 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003214 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003215 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003216 Py_XDECREF(errorHandler);
3217 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003218 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003219
Guido van Rossumd57fd912000-03-10 22:53:23 +00003220 onError:
3221 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003222 Py_XDECREF(errorHandler);
3223 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003224 return NULL;
3225}
3226
3227PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003228 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003229{
Guido van Rossum98297ee2007-11-06 21:34:58 +00003230 PyObject *repr, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003231 char *p;
3232 char *q;
3233
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003234#ifdef Py_UNICODE_WIDE
Walter Dörwald711005d2007-05-12 12:03:26 +00003235 repr = PyBytes_FromStringAndSize(NULL, 10 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003236#else
Walter Dörwald711005d2007-05-12 12:03:26 +00003237 repr = PyBytes_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003238#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003239 if (repr == NULL)
3240 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003241 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003242 goto done;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003243
Walter Dörwald711005d2007-05-12 12:03:26 +00003244 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003245 while (size-- > 0) {
3246 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003247#ifdef Py_UNICODE_WIDE
3248 /* Map 32-bit characters to '\Uxxxxxxxx' */
3249 if (ch >= 0x10000) {
3250 *p++ = '\\';
3251 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003252 *p++ = hexdigits[(ch >> 28) & 0xf];
3253 *p++ = hexdigits[(ch >> 24) & 0xf];
3254 *p++ = hexdigits[(ch >> 20) & 0xf];
3255 *p++ = hexdigits[(ch >> 16) & 0xf];
3256 *p++ = hexdigits[(ch >> 12) & 0xf];
3257 *p++ = hexdigits[(ch >> 8) & 0xf];
3258 *p++ = hexdigits[(ch >> 4) & 0xf];
3259 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003260 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003261 else
3262#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003263 /* Map 16-bit characters to '\uxxxx' */
3264 if (ch >= 256) {
3265 *p++ = '\\';
3266 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003267 *p++ = hexdigits[(ch >> 12) & 0xf];
3268 *p++ = hexdigits[(ch >> 8) & 0xf];
3269 *p++ = hexdigits[(ch >> 4) & 0xf];
3270 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003271 }
3272 /* Copy everything else as-is */
3273 else
3274 *p++ = (char) ch;
3275 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003276 size = p - q;
3277
3278 done:
3279 result = PyString_FromStringAndSize(PyBytes_AS_STRING(repr), size);
3280 Py_DECREF(repr);
3281 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003282}
3283
3284PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3285{
Walter Dörwald711005d2007-05-12 12:03:26 +00003286 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003287 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003288 PyErr_BadArgument();
3289 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003290 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003291 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3292 PyUnicode_GET_SIZE(unicode));
3293
3294 if (!s)
3295 return NULL;
3296 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
3297 PyBytes_GET_SIZE(s));
3298 Py_DECREF(s);
3299 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003300}
3301
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003302/* --- Unicode Internal Codec ------------------------------------------- */
3303
3304PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003305 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003306 const char *errors)
3307{
3308 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003309 Py_ssize_t startinpos;
3310 Py_ssize_t endinpos;
3311 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003312 PyUnicodeObject *v;
3313 Py_UNICODE *p;
3314 const char *end;
3315 const char *reason;
3316 PyObject *errorHandler = NULL;
3317 PyObject *exc = NULL;
3318
Neal Norwitzd43069c2006-01-08 01:12:10 +00003319#ifdef Py_UNICODE_WIDE
3320 Py_UNICODE unimax = PyUnicode_GetMax();
3321#endif
3322
Thomas Wouters89f507f2006-12-13 04:49:30 +00003323 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003324 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3325 if (v == NULL)
3326 goto onError;
3327 if (PyUnicode_GetSize((PyObject *)v) == 0)
3328 return (PyObject *)v;
3329 p = PyUnicode_AS_UNICODE(v);
3330 end = s + size;
3331
3332 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00003333 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003334 /* We have to sanity check the raw data, otherwise doom looms for
3335 some malformed UCS-4 data. */
3336 if (
3337 #ifdef Py_UNICODE_WIDE
3338 *p > unimax || *p < 0 ||
3339 #endif
3340 end-s < Py_UNICODE_SIZE
3341 )
3342 {
3343 startinpos = s - starts;
3344 if (end-s < Py_UNICODE_SIZE) {
3345 endinpos = end-starts;
3346 reason = "truncated input";
3347 }
3348 else {
3349 endinpos = s - starts + Py_UNICODE_SIZE;
3350 reason = "illegal code point (> 0x10FFFF)";
3351 }
3352 outpos = p - PyUnicode_AS_UNICODE(v);
3353 if (unicode_decode_call_errorhandler(
3354 errors, &errorHandler,
3355 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003356 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003357 (PyObject **)&v, &outpos, &p)) {
3358 goto onError;
3359 }
3360 }
3361 else {
3362 p++;
3363 s += Py_UNICODE_SIZE;
3364 }
3365 }
3366
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003367 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003368 goto onError;
3369 Py_XDECREF(errorHandler);
3370 Py_XDECREF(exc);
3371 return (PyObject *)v;
3372
3373 onError:
3374 Py_XDECREF(v);
3375 Py_XDECREF(errorHandler);
3376 Py_XDECREF(exc);
3377 return NULL;
3378}
3379
Guido van Rossumd57fd912000-03-10 22:53:23 +00003380/* --- Latin-1 Codec ------------------------------------------------------ */
3381
3382PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003383 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003384 const char *errors)
3385{
3386 PyUnicodeObject *v;
3387 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003388
Guido van Rossumd57fd912000-03-10 22:53:23 +00003389 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003390 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003391 Py_UNICODE r = *(unsigned char*)s;
3392 return PyUnicode_FromUnicode(&r, 1);
3393 }
3394
Guido van Rossumd57fd912000-03-10 22:53:23 +00003395 v = _PyUnicode_New(size);
3396 if (v == NULL)
3397 goto onError;
3398 if (size == 0)
3399 return (PyObject *)v;
3400 p = PyUnicode_AS_UNICODE(v);
3401 while (size-- > 0)
3402 *p++ = (unsigned char)*s++;
3403 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003404
Guido van Rossumd57fd912000-03-10 22:53:23 +00003405 onError:
3406 Py_XDECREF(v);
3407 return NULL;
3408}
3409
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003410/* create or adjust a UnicodeEncodeError */
3411static void make_encode_exception(PyObject **exceptionObject,
3412 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003413 const Py_UNICODE *unicode, Py_ssize_t size,
3414 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003415 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003416{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003417 if (*exceptionObject == NULL) {
3418 *exceptionObject = PyUnicodeEncodeError_Create(
3419 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003420 }
3421 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003422 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3423 goto onError;
3424 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3425 goto onError;
3426 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3427 goto onError;
3428 return;
3429 onError:
3430 Py_DECREF(*exceptionObject);
3431 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003432 }
3433}
3434
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003435/* raises a UnicodeEncodeError */
3436static void raise_encode_exception(PyObject **exceptionObject,
3437 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003438 const Py_UNICODE *unicode, Py_ssize_t size,
3439 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003440 const char *reason)
3441{
3442 make_encode_exception(exceptionObject,
3443 encoding, unicode, size, startpos, endpos, reason);
3444 if (*exceptionObject != NULL)
3445 PyCodec_StrictErrors(*exceptionObject);
3446}
3447
3448/* error handling callback helper:
3449 build arguments, call the callback and check the arguments,
3450 put the result into newpos and return the replacement string, which
3451 has to be freed by the caller */
3452static PyObject *unicode_encode_call_errorhandler(const char *errors,
3453 PyObject **errorHandler,
3454 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003455 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3456 Py_ssize_t startpos, Py_ssize_t endpos,
3457 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003458{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003459 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003460
3461 PyObject *restuple;
3462 PyObject *resunicode;
3463
3464 if (*errorHandler == NULL) {
3465 *errorHandler = PyCodec_LookupError(errors);
3466 if (*errorHandler == NULL)
3467 return NULL;
3468 }
3469
3470 make_encode_exception(exceptionObject,
3471 encoding, unicode, size, startpos, endpos, reason);
3472 if (*exceptionObject == NULL)
3473 return NULL;
3474
3475 restuple = PyObject_CallFunctionObjArgs(
3476 *errorHandler, *exceptionObject, NULL);
3477 if (restuple == NULL)
3478 return NULL;
3479 if (!PyTuple_Check(restuple)) {
3480 PyErr_Format(PyExc_TypeError, &argparse[4]);
3481 Py_DECREF(restuple);
3482 return NULL;
3483 }
3484 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3485 &resunicode, newpos)) {
3486 Py_DECREF(restuple);
3487 return NULL;
3488 }
3489 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003490 *newpos = size+*newpos;
3491 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003492 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003493 Py_DECREF(restuple);
3494 return NULL;
3495 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003496 Py_INCREF(resunicode);
3497 Py_DECREF(restuple);
3498 return resunicode;
3499}
3500
3501static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003502 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003503 const char *errors,
3504 int limit)
3505{
3506 /* output object */
3507 PyObject *res;
3508 /* pointers to the beginning and end+1 of input */
3509 const Py_UNICODE *startp = p;
3510 const Py_UNICODE *endp = p + size;
3511 /* pointer to the beginning of the unencodable characters */
3512 /* const Py_UNICODE *badp = NULL; */
3513 /* pointer into the output */
3514 char *str;
3515 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003516 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003517 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3518 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003519 PyObject *errorHandler = NULL;
3520 PyObject *exc = NULL;
Guido van Rossum98297ee2007-11-06 21:34:58 +00003521 PyObject *result = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003522 /* the following variable is used for caching string comparisons
3523 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3524 int known_errorHandler = -1;
3525
3526 /* allocate enough for a simple encoding without
3527 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00003528 if (size == 0)
3529 return PyString_FromStringAndSize(NULL, 0);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003530 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003531 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003532 return NULL;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003533 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003534 ressize = size;
3535
3536 while (p<endp) {
3537 Py_UNICODE c = *p;
3538
3539 /* can we encode this? */
3540 if (c<limit) {
3541 /* no overflow check, because we know that the space is enough */
3542 *str++ = (char)c;
3543 ++p;
3544 }
3545 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003546 Py_ssize_t unicodepos = p-startp;
3547 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003548 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003549 Py_ssize_t repsize;
3550 Py_ssize_t newpos;
3551 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003552 Py_UNICODE *uni2;
3553 /* startpos for collecting unencodable chars */
3554 const Py_UNICODE *collstart = p;
3555 const Py_UNICODE *collend = p;
3556 /* find all unecodable characters */
3557 while ((collend < endp) && ((*collend)>=limit))
3558 ++collend;
3559 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3560 if (known_errorHandler==-1) {
3561 if ((errors==NULL) || (!strcmp(errors, "strict")))
3562 known_errorHandler = 1;
3563 else if (!strcmp(errors, "replace"))
3564 known_errorHandler = 2;
3565 else if (!strcmp(errors, "ignore"))
3566 known_errorHandler = 3;
3567 else if (!strcmp(errors, "xmlcharrefreplace"))
3568 known_errorHandler = 4;
3569 else
3570 known_errorHandler = 0;
3571 }
3572 switch (known_errorHandler) {
3573 case 1: /* strict */
3574 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3575 goto onError;
3576 case 2: /* replace */
3577 while (collstart++<collend)
3578 *str++ = '?'; /* fall through */
3579 case 3: /* ignore */
3580 p = collend;
3581 break;
3582 case 4: /* xmlcharrefreplace */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003583 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003584 /* determine replacement size (temporarily (mis)uses p) */
3585 for (p = collstart, repsize = 0; p < collend; ++p) {
3586 if (*p<10)
3587 repsize += 2+1+1;
3588 else if (*p<100)
3589 repsize += 2+2+1;
3590 else if (*p<1000)
3591 repsize += 2+3+1;
3592 else if (*p<10000)
3593 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003594#ifndef Py_UNICODE_WIDE
3595 else
3596 repsize += 2+5+1;
3597#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003598 else if (*p<100000)
3599 repsize += 2+5+1;
3600 else if (*p<1000000)
3601 repsize += 2+6+1;
3602 else
3603 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003604#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003605 }
3606 requiredsize = respos+repsize+(endp-collend);
3607 if (requiredsize > ressize) {
3608 if (requiredsize<2*ressize)
3609 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003610 if (PyBytes_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003611 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003612 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003613 ressize = requiredsize;
3614 }
3615 /* generate replacement (temporarily (mis)uses p) */
3616 for (p = collstart; p < collend; ++p) {
3617 str += sprintf(str, "&#%d;", (int)*p);
3618 }
3619 p = collend;
3620 break;
3621 default:
3622 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3623 encoding, reason, startp, size, &exc,
3624 collstart-startp, collend-startp, &newpos);
3625 if (repunicode == NULL)
3626 goto onError;
3627 /* need more space? (at least enough for what we
3628 have+the replacement+the rest of the string, so
3629 we won't have to check space for encodable characters) */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003630 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003631 repsize = PyUnicode_GET_SIZE(repunicode);
3632 requiredsize = respos+repsize+(endp-collend);
3633 if (requiredsize > ressize) {
3634 if (requiredsize<2*ressize)
3635 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003636 if (PyBytes_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003637 Py_DECREF(repunicode);
3638 goto onError;
3639 }
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003640 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003641 ressize = requiredsize;
3642 }
3643 /* check if there is anything unencodable in the replacement
3644 and copy it to the output */
3645 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3646 c = *uni2;
3647 if (c >= limit) {
3648 raise_encode_exception(&exc, encoding, startp, size,
3649 unicodepos, unicodepos+1, reason);
3650 Py_DECREF(repunicode);
3651 goto onError;
3652 }
3653 *str = (char)c;
3654 }
3655 p = startp + newpos;
3656 Py_DECREF(repunicode);
3657 }
3658 }
3659 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003660 result = PyString_FromStringAndSize(PyBytes_AS_STRING(res),
3661 str - PyBytes_AS_STRING(res));
3662 onError:
3663 Py_DECREF(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003664 Py_XDECREF(errorHandler);
3665 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003666 return result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003667}
3668
Guido van Rossumd57fd912000-03-10 22:53:23 +00003669PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003670 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003671 const char *errors)
3672{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003673 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003674}
3675
3676PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3677{
3678 if (!PyUnicode_Check(unicode)) {
3679 PyErr_BadArgument();
3680 return NULL;
3681 }
3682 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3683 PyUnicode_GET_SIZE(unicode),
3684 NULL);
3685}
3686
3687/* --- 7-bit ASCII Codec -------------------------------------------------- */
3688
Guido van Rossumd57fd912000-03-10 22:53:23 +00003689PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003690 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003691 const char *errors)
3692{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003693 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003694 PyUnicodeObject *v;
3695 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003696 Py_ssize_t startinpos;
3697 Py_ssize_t endinpos;
3698 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003699 const char *e;
3700 PyObject *errorHandler = NULL;
3701 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003702
Guido van Rossumd57fd912000-03-10 22:53:23 +00003703 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003704 if (size == 1 && *(unsigned char*)s < 128) {
3705 Py_UNICODE r = *(unsigned char*)s;
3706 return PyUnicode_FromUnicode(&r, 1);
3707 }
Tim Petersced69f82003-09-16 20:30:58 +00003708
Guido van Rossumd57fd912000-03-10 22:53:23 +00003709 v = _PyUnicode_New(size);
3710 if (v == NULL)
3711 goto onError;
3712 if (size == 0)
3713 return (PyObject *)v;
3714 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003715 e = s + size;
3716 while (s < e) {
3717 register unsigned char c = (unsigned char)*s;
3718 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003719 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003720 ++s;
3721 }
3722 else {
3723 startinpos = s-starts;
3724 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003725 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003726 if (unicode_decode_call_errorhandler(
3727 errors, &errorHandler,
3728 "ascii", "ordinal not in range(128)",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003729 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003730 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003731 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003732 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003733 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00003734 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003735 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003736 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003737 Py_XDECREF(errorHandler);
3738 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003739 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003740
Guido van Rossumd57fd912000-03-10 22:53:23 +00003741 onError:
3742 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003743 Py_XDECREF(errorHandler);
3744 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003745 return NULL;
3746}
3747
Guido van Rossumd57fd912000-03-10 22:53:23 +00003748PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003749 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003750 const char *errors)
3751{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003752 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003753}
3754
3755PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3756{
3757 if (!PyUnicode_Check(unicode)) {
3758 PyErr_BadArgument();
3759 return NULL;
3760 }
3761 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3762 PyUnicode_GET_SIZE(unicode),
3763 NULL);
3764}
3765
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003766#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003767
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003768/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003769
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003770#if SIZEOF_INT < SIZEOF_SSIZE_T
3771#define NEED_RETRY
3772#endif
3773
3774/* XXX This code is limited to "true" double-byte encodings, as
3775 a) it assumes an incomplete character consists of a single byte, and
3776 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3777 encodings, see IsDBCSLeadByteEx documentation. */
3778
/* Decide whether the byte at s[offset] should be treated as a DBCS lead
   byte.

   Returns 0 when IsDBCSLeadByte() rejects the byte.  Otherwise the
   character preceding it (as located by CharPrev()) is inspected and 1 is
   returned when
     - CharPrev() returned the current position itself, or
     - the previous character does not start with a lead byte, or
     - the previous character is two bytes wide;
   in every other case the result is 0. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return (curr - prev == 2);
}
3789
3790/*
3791 * Decode MBCS string into unicode object. If 'final' is set, converts
3792 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3793 */
3794static int decode_mbcs(PyUnicodeObject **v,
3795 const char *s, /* MBCS string */
3796 int size, /* sizeof MBCS string */
3797 int final)
3798{
3799 Py_UNICODE *p;
3800 Py_ssize_t n = 0;
3801 int usize = 0;
3802
3803 assert(size >= 0);
3804
3805 /* Skip trailing lead-byte unless 'final' is set */
3806 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3807 --size;
3808
3809 /* First get the size of the result */
3810 if (size > 0) {
3811 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3812 if (usize == 0) {
3813 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3814 return -1;
3815 }
3816 }
3817
3818 if (*v == NULL) {
3819 /* Create unicode object */
3820 *v = _PyUnicode_New(usize);
3821 if (*v == NULL)
3822 return -1;
3823 }
3824 else {
3825 /* Extend unicode object */
3826 n = PyUnicode_GET_SIZE(*v);
3827 if (_PyUnicode_Resize(v, n + usize) < 0)
3828 return -1;
3829 }
3830
3831 /* Do the conversion */
3832 if (size > 0) {
3833 p = PyUnicode_AS_UNICODE(*v) + n;
3834 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3835 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3836 return -1;
3837 }
3838 }
3839
3840 return size;
3841}
3842
3843PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3844 Py_ssize_t size,
3845 const char *errors,
3846 Py_ssize_t *consumed)
3847{
3848 PyUnicodeObject *v = NULL;
3849 int done;
3850
3851 if (consumed)
3852 *consumed = 0;
3853
3854#ifdef NEED_RETRY
3855 retry:
3856 if (size > INT_MAX)
3857 done = decode_mbcs(&v, s, INT_MAX, 0);
3858 else
3859#endif
3860 done = decode_mbcs(&v, s, (int)size, !consumed);
3861
3862 if (done < 0) {
3863 Py_XDECREF(v);
3864 return NULL;
3865 }
3866
3867 if (consumed)
3868 *consumed += done;
3869
3870#ifdef NEED_RETRY
3871 if (size > INT_MAX) {
3872 s += done;
3873 size -= done;
3874 goto retry;
3875 }
3876#endif
3877
3878 return (PyObject *)v;
3879}
3880
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003881PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003882 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003883 const char *errors)
3884{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003885 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3886}
3887
3888/*
3889 * Convert unicode into string object (MBCS).
3890 * Returns 0 if succeed, -1 otherwise.
3891 */
3892static int encode_mbcs(PyObject **repr,
3893 const Py_UNICODE *p, /* unicode */
3894 int size) /* size of unicode */
3895{
3896 int mbcssize = 0;
3897 Py_ssize_t n = 0;
3898
3899 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003900
3901 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003902 if (size > 0) {
3903 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3904 if (mbcssize == 0) {
3905 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3906 return -1;
3907 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003908 }
3909
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003910 if (*repr == NULL) {
3911 /* Create string object */
Guido van Rossum98297ee2007-11-06 21:34:58 +00003912 *repr = PyString_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003913 if (*repr == NULL)
3914 return -1;
3915 }
3916 else {
3917 /* Extend string object */
Guido van Rossum98297ee2007-11-06 21:34:58 +00003918 n = PyString_Size(*repr);
3919 if (_PyString_Resize(repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003920 return -1;
3921 }
3922
3923 /* Do the conversion */
3924 if (size > 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00003925 char *s = PyString_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003926 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3927 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3928 return -1;
3929 }
3930 }
3931
3932 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003933}
3934
3935PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003936 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003937 const char *errors)
3938{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003939 PyObject *repr = NULL;
3940 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003941
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003942#ifdef NEED_RETRY
3943 retry:
3944 if (size > INT_MAX)
3945 ret = encode_mbcs(&repr, p, INT_MAX);
3946 else
3947#endif
3948 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003949
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003950 if (ret < 0) {
3951 Py_XDECREF(repr);
3952 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003953 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003954
3955#ifdef NEED_RETRY
3956 if (size > INT_MAX) {
3957 p += INT_MAX;
3958 size -= INT_MAX;
3959 goto retry;
3960 }
3961#endif
3962
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003963 return repr;
3964}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003965
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003966PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3967{
3968 if (!PyUnicode_Check(unicode)) {
3969 PyErr_BadArgument();
3970 return NULL;
3971 }
3972 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3973 PyUnicode_GET_SIZE(unicode),
3974 NULL);
3975}
3976
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003977#undef NEED_RETRY
3978
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003979#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003980
Guido van Rossumd57fd912000-03-10 22:53:23 +00003981/* --- Character Mapping Codec -------------------------------------------- */
3982
Guido van Rossumd57fd912000-03-10 22:53:23 +00003983PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003984 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003985 PyObject *mapping,
3986 const char *errors)
3987{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003988 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003989 Py_ssize_t startinpos;
3990 Py_ssize_t endinpos;
3991 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003992 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003993 PyUnicodeObject *v;
3994 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003995 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003996 PyObject *errorHandler = NULL;
3997 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003998 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003999 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004000
Guido van Rossumd57fd912000-03-10 22:53:23 +00004001 /* Default to Latin-1 */
4002 if (mapping == NULL)
4003 return PyUnicode_DecodeLatin1(s, size, errors);
4004
4005 v = _PyUnicode_New(size);
4006 if (v == NULL)
4007 goto onError;
4008 if (size == 0)
4009 return (PyObject *)v;
4010 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004011 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004012 if (PyUnicode_CheckExact(mapping)) {
4013 mapstring = PyUnicode_AS_UNICODE(mapping);
4014 maplen = PyUnicode_GET_SIZE(mapping);
4015 while (s < e) {
4016 unsigned char ch = *s;
4017 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004018
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004019 if (ch < maplen)
4020 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004021
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004022 if (x == 0xfffe) {
4023 /* undefined mapping */
4024 outpos = p-PyUnicode_AS_UNICODE(v);
4025 startinpos = s-starts;
4026 endinpos = startinpos+1;
4027 if (unicode_decode_call_errorhandler(
4028 errors, &errorHandler,
4029 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00004030 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004031 (PyObject **)&v, &outpos, &p)) {
4032 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00004033 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004034 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00004035 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004036 *p++ = x;
4037 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004038 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004039 }
4040 else {
4041 while (s < e) {
4042 unsigned char ch = *s;
4043 PyObject *w, *x;
4044
4045 /* Get mapping (char ordinal -> integer, Unicode char or None) */
Christian Heimes217cfd12007-12-02 14:31:20 +00004046 w = PyLong_FromLong((long)ch);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004047 if (w == NULL)
4048 goto onError;
4049 x = PyObject_GetItem(mapping, w);
4050 Py_DECREF(w);
4051 if (x == NULL) {
4052 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4053 /* No mapping found means: mapping is undefined. */
4054 PyErr_Clear();
4055 x = Py_None;
4056 Py_INCREF(x);
4057 } else
4058 goto onError;
4059 }
4060
4061 /* Apply mapping */
Christian Heimes217cfd12007-12-02 14:31:20 +00004062 if (PyLong_Check(x)) {
4063 long value = PyLong_AS_LONG(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004064 if (value < 0 || value > 65535) {
4065 PyErr_SetString(PyExc_TypeError,
4066 "character mapping must be in range(65536)");
4067 Py_DECREF(x);
4068 goto onError;
4069 }
4070 *p++ = (Py_UNICODE)value;
4071 }
4072 else if (x == Py_None) {
4073 /* undefined mapping */
4074 outpos = p-PyUnicode_AS_UNICODE(v);
4075 startinpos = s-starts;
4076 endinpos = startinpos+1;
4077 if (unicode_decode_call_errorhandler(
4078 errors, &errorHandler,
4079 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00004080 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004081 (PyObject **)&v, &outpos, &p)) {
4082 Py_DECREF(x);
4083 goto onError;
4084 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00004085 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004086 continue;
4087 }
4088 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004089 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004090
4091 if (targetsize == 1)
4092 /* 1-1 mapping */
4093 *p++ = *PyUnicode_AS_UNICODE(x);
4094
4095 else if (targetsize > 1) {
4096 /* 1-n mapping */
4097 if (targetsize > extrachars) {
4098 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004099 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4100 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004101 (targetsize << 2);
4102 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00004103 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004104 if (_PyUnicode_Resize(&v,
4105 PyUnicode_GET_SIZE(v) + needed) < 0) {
4106 Py_DECREF(x);
4107 goto onError;
4108 }
4109 p = PyUnicode_AS_UNICODE(v) + oldpos;
4110 }
4111 Py_UNICODE_COPY(p,
4112 PyUnicode_AS_UNICODE(x),
4113 targetsize);
4114 p += targetsize;
4115 extrachars -= targetsize;
4116 }
4117 /* 1-0 mapping: skip the character */
4118 }
4119 else {
4120 /* wrong return value */
4121 PyErr_SetString(PyExc_TypeError,
4122 "character mapping must return integer, None or unicode");
4123 Py_DECREF(x);
4124 goto onError;
4125 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004126 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004127 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004128 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004129 }
4130 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004131 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004132 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004133 Py_XDECREF(errorHandler);
4134 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004135 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004136
Guido van Rossumd57fd912000-03-10 22:53:23 +00004137 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004138 Py_XDECREF(errorHandler);
4139 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004140 Py_XDECREF(v);
4141 return NULL;
4142}
4143
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004144/* Charmap encoding: the lookup table */
4145
4146struct encoding_map{
4147 PyObject_HEAD
4148 unsigned char level1[32];
4149 int count2, count3;
4150 unsigned char level23[1];
4151};
4152
4153static PyObject*
4154encoding_map_size(PyObject *obj, PyObject* args)
4155{
4156 struct encoding_map *map = (struct encoding_map*)obj;
Christian Heimes217cfd12007-12-02 14:31:20 +00004157 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004158 128*map->count3);
4159}
4160
4161static PyMethodDef encoding_map_methods[] = {
4162 {"size", encoding_map_size, METH_NOARGS,
4163 PyDoc_STR("Return the size (in bytes) of this object") },
4164 { 0 }
4165};
4166
4167static void
4168encoding_map_dealloc(PyObject* o)
4169{
4170 PyObject_FREE(o);
4171}
4172
4173static PyTypeObject EncodingMapType = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004174 PyVarObject_HEAD_INIT(NULL, 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004175 "EncodingMap", /*tp_name*/
4176 sizeof(struct encoding_map), /*tp_basicsize*/
4177 0, /*tp_itemsize*/
4178 /* methods */
4179 encoding_map_dealloc, /*tp_dealloc*/
4180 0, /*tp_print*/
4181 0, /*tp_getattr*/
4182 0, /*tp_setattr*/
4183 0, /*tp_compare*/
4184 0, /*tp_repr*/
4185 0, /*tp_as_number*/
4186 0, /*tp_as_sequence*/
4187 0, /*tp_as_mapping*/
4188 0, /*tp_hash*/
4189 0, /*tp_call*/
4190 0, /*tp_str*/
4191 0, /*tp_getattro*/
4192 0, /*tp_setattro*/
4193 0, /*tp_as_buffer*/
4194 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4195 0, /*tp_doc*/
4196 0, /*tp_traverse*/
4197 0, /*tp_clear*/
4198 0, /*tp_richcompare*/
4199 0, /*tp_weaklistoffset*/
4200 0, /*tp_iter*/
4201 0, /*tp_iternext*/
4202 encoding_map_methods, /*tp_methods*/
4203 0, /*tp_members*/
4204 0, /*tp_getset*/
4205 0, /*tp_base*/
4206 0, /*tp_dict*/
4207 0, /*tp_descr_get*/
4208 0, /*tp_descr_set*/
4209 0, /*tp_dictoffset*/
4210 0, /*tp_init*/
4211 0, /*tp_alloc*/
4212 0, /*tp_new*/
4213 0, /*tp_free*/
4214 0, /*tp_is_gc*/
4215};
4216
4217PyObject*
4218PyUnicode_BuildEncodingMap(PyObject* string)
4219{
4220 Py_UNICODE *decode;
4221 PyObject *result;
4222 struct encoding_map *mresult;
4223 int i;
4224 int need_dict = 0;
4225 unsigned char level1[32];
4226 unsigned char level2[512];
4227 unsigned char *mlevel1, *mlevel2, *mlevel3;
4228 int count2 = 0, count3 = 0;
4229
4230 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4231 PyErr_BadArgument();
4232 return NULL;
4233 }
4234 decode = PyUnicode_AS_UNICODE(string);
4235 memset(level1, 0xFF, sizeof level1);
4236 memset(level2, 0xFF, sizeof level2);
4237
4238 /* If there isn't a one-to-one mapping of NULL to \0,
4239 or if there are non-BMP characters, we need to use
4240 a mapping dictionary. */
4241 if (decode[0] != 0)
4242 need_dict = 1;
4243 for (i = 1; i < 256; i++) {
4244 int l1, l2;
4245 if (decode[i] == 0
4246 #ifdef Py_UNICODE_WIDE
4247 || decode[i] > 0xFFFF
4248 #endif
4249 ) {
4250 need_dict = 1;
4251 break;
4252 }
4253 if (decode[i] == 0xFFFE)
4254 /* unmapped character */
4255 continue;
4256 l1 = decode[i] >> 11;
4257 l2 = decode[i] >> 7;
4258 if (level1[l1] == 0xFF)
4259 level1[l1] = count2++;
4260 if (level2[l2] == 0xFF)
4261 level2[l2] = count3++;
4262 }
4263
4264 if (count2 >= 0xFF || count3 >= 0xFF)
4265 need_dict = 1;
4266
4267 if (need_dict) {
4268 PyObject *result = PyDict_New();
4269 PyObject *key, *value;
4270 if (!result)
4271 return NULL;
4272 for (i = 0; i < 256; i++) {
4273 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00004274 key = PyLong_FromLong(decode[i]);
4275 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004276 if (!key || !value)
4277 goto failed1;
4278 if (PyDict_SetItem(result, key, value) == -1)
4279 goto failed1;
4280 Py_DECREF(key);
4281 Py_DECREF(value);
4282 }
4283 return result;
4284 failed1:
4285 Py_XDECREF(key);
4286 Py_XDECREF(value);
4287 Py_DECREF(result);
4288 return NULL;
4289 }
4290
4291 /* Create a three-level trie */
4292 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4293 16*count2 + 128*count3 - 1);
4294 if (!result)
4295 return PyErr_NoMemory();
4296 PyObject_Init(result, &EncodingMapType);
4297 mresult = (struct encoding_map*)result;
4298 mresult->count2 = count2;
4299 mresult->count3 = count3;
4300 mlevel1 = mresult->level1;
4301 mlevel2 = mresult->level23;
4302 mlevel3 = mresult->level23 + 16*count2;
4303 memcpy(mlevel1, level1, 32);
4304 memset(mlevel2, 0xFF, 16*count2);
4305 memset(mlevel3, 0, 128*count3);
4306 count3 = 0;
4307 for (i = 1; i < 256; i++) {
4308 int o1, o2, o3, i2, i3;
4309 if (decode[i] == 0xFFFE)
4310 /* unmapped character */
4311 continue;
4312 o1 = decode[i]>>11;
4313 o2 = (decode[i]>>7) & 0xF;
4314 i2 = 16*mlevel1[o1] + o2;
4315 if (mlevel2[i2] == 0xFF)
4316 mlevel2[i2] = count3++;
4317 o3 = decode[i] & 0x7F;
4318 i3 = 128*mlevel2[i2] + o3;
4319 mlevel3[i3] = i;
4320 }
4321 return result;
4322}
4323
4324static int
4325encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4326{
4327 struct encoding_map *map = (struct encoding_map*)mapping;
4328 int l1 = c>>11;
4329 int l2 = (c>>7) & 0xF;
4330 int l3 = c & 0x7F;
4331 int i;
4332
4333#ifdef Py_UNICODE_WIDE
4334 if (c > 0xFFFF) {
4335 return -1;
4336 }
4337#endif
4338 if (c == 0)
4339 return 0;
4340 /* level 1*/
4341 i = map->level1[l1];
4342 if (i == 0xFF) {
4343 return -1;
4344 }
4345 /* level 2*/
4346 i = map->level23[16*i+l2];
4347 if (i == 0xFF) {
4348 return -1;
4349 }
4350 /* level 3 */
4351 i = map->level23[16*map->count2 + 128*i + l3];
4352 if (i == 0) {
4353 return -1;
4354 }
4355 return i;
4356}
4357
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004358/* Lookup the character ch in the mapping. If the character
4359 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004360 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004361static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004362{
Christian Heimes217cfd12007-12-02 14:31:20 +00004363 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004364 PyObject *x;
4365
4366 if (w == NULL)
4367 return NULL;
4368 x = PyObject_GetItem(mapping, w);
4369 Py_DECREF(w);
4370 if (x == NULL) {
4371 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4372 /* No mapping found means: mapping is undefined. */
4373 PyErr_Clear();
4374 x = Py_None;
4375 Py_INCREF(x);
4376 return x;
4377 } else
4378 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004379 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004380 else if (x == Py_None)
4381 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00004382 else if (PyLong_Check(x)) {
4383 long value = PyLong_AS_LONG(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004384 if (value < 0 || value > 255) {
4385 PyErr_SetString(PyExc_TypeError,
4386 "character mapping must be in range(256)");
4387 Py_DECREF(x);
4388 return NULL;
4389 }
4390 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004391 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004392 else if (PyString_Check(x))
4393 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004394 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004395 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00004396 PyErr_Format(PyExc_TypeError,
Christian Heimesf3863112007-11-22 07:46:41 +00004397 "character mapping must return integer, bytes or None, not %.400s",
Walter Dörwald580ceed2007-05-09 10:39:19 +00004398 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004399 Py_DECREF(x);
4400 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004401 }
4402}
4403
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004404static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00004405charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004406{
Guido van Rossum98297ee2007-11-06 21:34:58 +00004407 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004408 /* exponentially overallocate to minimize reallocations */
4409 if (requiredsize < 2*outsize)
4410 requiredsize = 2*outsize;
Guido van Rossum98297ee2007-11-06 21:34:58 +00004411 if (_PyString_Resize(outobj, requiredsize))
Walter Dörwald827b0552007-05-12 13:23:53 +00004412 return -1;
Walter Dörwald827b0552007-05-12 13:23:53 +00004413 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004414}
4415
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was encoded and written */
    enc_FAILED,         /* the mapping has no entry for the character */
    enc_EXCEPTION       /* an error occurred during lookup or resize */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004419/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00004420 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004421 space is available. Return a new reference to the object that
4422 was put in the output buffer, or Py_None, if the mapping was undefined
4423 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004424 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004425static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004426charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004427 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004428{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004429 PyObject *rep;
4430 char *outstart;
Guido van Rossum98297ee2007-11-06 21:34:58 +00004431 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004432
Christian Heimes90aa7642007-12-19 02:45:37 +00004433 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004434 int res = encoding_map_lookup(c, mapping);
4435 Py_ssize_t requiredsize = *outpos+1;
4436 if (res == -1)
4437 return enc_FAILED;
Guido van Rossum98297ee2007-11-06 21:34:58 +00004438 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004439 if (charmapencode_resize(outobj, outpos, requiredsize))
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004440 return enc_EXCEPTION;
Guido van Rossum98297ee2007-11-06 21:34:58 +00004441 outstart = PyString_AS_STRING(*outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004442 outstart[(*outpos)++] = (char)res;
4443 return enc_SUCCESS;
4444 }
4445
4446 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004447 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004448 return enc_EXCEPTION;
4449 else if (rep==Py_None) {
4450 Py_DECREF(rep);
4451 return enc_FAILED;
4452 } else {
Christian Heimes217cfd12007-12-02 14:31:20 +00004453 if (PyLong_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004454 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004455 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004456 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004457 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004458 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004459 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004460 outstart = PyString_AS_STRING(*outobj);
Christian Heimes217cfd12007-12-02 14:31:20 +00004461 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004462 }
4463 else {
4464 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004465 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4466 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004467 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004468 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004469 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004470 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004471 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004472 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004473 memcpy(outstart + *outpos, repchars, repsize);
4474 *outpos += repsize;
4475 }
4476 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004477 Py_DECREF(rep);
4478 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004479}
4480
4481/* handle an error in PyUnicode_EncodeCharmap
4482 Return 0 on success, -1 on error */
4483static
4484int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004485 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004486 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004487 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004488 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004489{
4490 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004491 Py_ssize_t repsize;
4492 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004493 Py_UNICODE *uni2;
4494 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004495 Py_ssize_t collstartpos = *inpos;
4496 Py_ssize_t collendpos = *inpos+1;
4497 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004498 char *encoding = "charmap";
4499 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004500 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004501
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004502 /* find all unencodable characters */
4503 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004504 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00004505 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004506 int res = encoding_map_lookup(p[collendpos], mapping);
4507 if (res != -1)
4508 break;
4509 ++collendpos;
4510 continue;
4511 }
4512
4513 rep = charmapencode_lookup(p[collendpos], mapping);
4514 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004515 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004516 else if (rep!=Py_None) {
4517 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004518 break;
4519 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004520 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004521 ++collendpos;
4522 }
4523 /* cache callback name lookup
4524 * (if not done yet, i.e. it's the first error) */
4525 if (*known_errorHandler==-1) {
4526 if ((errors==NULL) || (!strcmp(errors, "strict")))
4527 *known_errorHandler = 1;
4528 else if (!strcmp(errors, "replace"))
4529 *known_errorHandler = 2;
4530 else if (!strcmp(errors, "ignore"))
4531 *known_errorHandler = 3;
4532 else if (!strcmp(errors, "xmlcharrefreplace"))
4533 *known_errorHandler = 4;
4534 else
4535 *known_errorHandler = 0;
4536 }
4537 switch (*known_errorHandler) {
4538 case 1: /* strict */
4539 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4540 return -1;
4541 case 2: /* replace */
4542 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4543 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004544 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004545 return -1;
4546 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004547 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004548 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4549 return -1;
4550 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004551 }
4552 /* fall through */
4553 case 3: /* ignore */
4554 *inpos = collendpos;
4555 break;
4556 case 4: /* xmlcharrefreplace */
4557 /* generate replacement (temporarily (mis)uses p) */
4558 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4559 char buffer[2+29+1+1];
4560 char *cp;
4561 sprintf(buffer, "&#%d;", (int)p[collpos]);
4562 for (cp = buffer; *cp; ++cp) {
4563 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004564 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004565 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004566 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004567 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4568 return -1;
4569 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004570 }
4571 }
4572 *inpos = collendpos;
4573 break;
4574 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004575 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004576 encoding, reason, p, size, exceptionObject,
4577 collstartpos, collendpos, &newpos);
4578 if (repunicode == NULL)
4579 return -1;
4580 /* generate replacement */
4581 repsize = PyUnicode_GET_SIZE(repunicode);
4582 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4583 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004584 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004585 return -1;
4586 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004587 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004588 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004589 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4590 return -1;
4591 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004592 }
4593 *inpos = newpos;
4594 Py_DECREF(repunicode);
4595 }
4596 return 0;
4597}
4598
Guido van Rossumd57fd912000-03-10 22:53:23 +00004599PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004600 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004601 PyObject *mapping,
4602 const char *errors)
4603{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004604 /* output object */
4605 PyObject *res = NULL;
4606 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004607 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004608 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004609 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004610 PyObject *errorHandler = NULL;
4611 PyObject *exc = NULL;
4612 /* the following variable is used for caching string comparisons
4613 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4614 * 3=ignore, 4=xmlcharrefreplace */
4615 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004616
4617 /* Default to Latin-1 */
4618 if (mapping == NULL)
4619 return PyUnicode_EncodeLatin1(p, size, errors);
4620
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004621 /* allocate enough for a simple encoding without
4622 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004623 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004624 if (res == NULL)
4625 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004626 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004627 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004628
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004629 while (inpos<size) {
4630 /* try to encode it */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004631 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004632 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004633 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004634 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004635 if (charmap_encoding_error(p, size, &inpos, mapping,
4636 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004637 &known_errorHandler, &errorHandler, errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004638 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004639 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004640 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004641 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004642 else
4643 /* done with this character => adjust input position */
4644 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004645 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004646
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004647 /* Resize if we allocated to much */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004648 if (respos<PyString_GET_SIZE(res))
4649 _PyString_Resize(&res, respos);
4650
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004651 Py_XDECREF(exc);
4652 Py_XDECREF(errorHandler);
4653 return res;
4654
4655 onError:
4656 Py_XDECREF(res);
4657 Py_XDECREF(exc);
4658 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004659 return NULL;
4660}
4661
4662PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4663 PyObject *mapping)
4664{
4665 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4666 PyErr_BadArgument();
4667 return NULL;
4668 }
4669 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4670 PyUnicode_GET_SIZE(unicode),
4671 mapping,
4672 NULL);
4673}
4674
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004675/* create or adjust a UnicodeTranslateError */
4676static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004677 const Py_UNICODE *unicode, Py_ssize_t size,
4678 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004679 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004680{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004681 if (*exceptionObject == NULL) {
4682 *exceptionObject = PyUnicodeTranslateError_Create(
4683 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004684 }
4685 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004686 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4687 goto onError;
4688 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4689 goto onError;
4690 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4691 goto onError;
4692 return;
4693 onError:
4694 Py_DECREF(*exceptionObject);
4695 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004696 }
4697}
4698
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004699/* raises a UnicodeTranslateError */
4700static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004701 const Py_UNICODE *unicode, Py_ssize_t size,
4702 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004703 const char *reason)
4704{
4705 make_translate_exception(exceptionObject,
4706 unicode, size, startpos, endpos, reason);
4707 if (*exceptionObject != NULL)
4708 PyCodec_StrictErrors(*exceptionObject);
4709}
4710
4711/* error handling callback helper:
4712 build arguments, call the callback and check the arguments,
4713 put the result into newpos and return the replacement string, which
4714 has to be freed by the caller */
4715static PyObject *unicode_translate_call_errorhandler(const char *errors,
4716 PyObject **errorHandler,
4717 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004718 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4719 Py_ssize_t startpos, Py_ssize_t endpos,
4720 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004721{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004722 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004723
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004724 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004725 PyObject *restuple;
4726 PyObject *resunicode;
4727
4728 if (*errorHandler == NULL) {
4729 *errorHandler = PyCodec_LookupError(errors);
4730 if (*errorHandler == NULL)
4731 return NULL;
4732 }
4733
4734 make_translate_exception(exceptionObject,
4735 unicode, size, startpos, endpos, reason);
4736 if (*exceptionObject == NULL)
4737 return NULL;
4738
4739 restuple = PyObject_CallFunctionObjArgs(
4740 *errorHandler, *exceptionObject, NULL);
4741 if (restuple == NULL)
4742 return NULL;
4743 if (!PyTuple_Check(restuple)) {
4744 PyErr_Format(PyExc_TypeError, &argparse[4]);
4745 Py_DECREF(restuple);
4746 return NULL;
4747 }
4748 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004749 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004750 Py_DECREF(restuple);
4751 return NULL;
4752 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004753 if (i_newpos<0)
4754 *newpos = size+i_newpos;
4755 else
4756 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004757 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004758 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004759 Py_DECREF(restuple);
4760 return NULL;
4761 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004762 Py_INCREF(resunicode);
4763 Py_DECREF(restuple);
4764 return resunicode;
4765}
4766
4767/* Lookup the character ch in the mapping and put the result in result,
4768 which must be decrefed by the caller.
4769 Return 0 on success, -1 on error */
4770static
4771int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4772{
Christian Heimes217cfd12007-12-02 14:31:20 +00004773 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004774 PyObject *x;
4775
4776 if (w == NULL)
4777 return -1;
4778 x = PyObject_GetItem(mapping, w);
4779 Py_DECREF(w);
4780 if (x == NULL) {
4781 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4782 /* No mapping found means: use 1:1 mapping. */
4783 PyErr_Clear();
4784 *result = NULL;
4785 return 0;
4786 } else
4787 return -1;
4788 }
4789 else if (x == Py_None) {
4790 *result = x;
4791 return 0;
4792 }
Christian Heimes217cfd12007-12-02 14:31:20 +00004793 else if (PyLong_Check(x)) {
4794 long value = PyLong_AS_LONG(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004795 long max = PyUnicode_GetMax();
4796 if (value < 0 || value > max) {
4797 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00004798 "character mapping must be in range(0x%x)", max+1);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004799 Py_DECREF(x);
4800 return -1;
4801 }
4802 *result = x;
4803 return 0;
4804 }
4805 else if (PyUnicode_Check(x)) {
4806 *result = x;
4807 return 0;
4808 }
4809 else {
4810 /* wrong return value */
4811 PyErr_SetString(PyExc_TypeError,
4812 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004813 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004814 return -1;
4815 }
4816}
4817/* ensure that *outobj is at least requiredsize characters long,
4818if not reallocate and adjust various state variables.
4819Return 0 on success, -1 on error */
4820static
Walter Dörwald4894c302003-10-24 14:25:28 +00004821int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004822 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004823{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004824 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004825 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004826 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004827 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004828 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004829 if (requiredsize < 2 * oldsize)
4830 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004831 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004832 return -1;
4833 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004834 }
4835 return 0;
4836}
4837/* lookup the character, put the result in the output string and adjust
4838 various state variables. Return a new reference to the object that
4839 was put in the output buffer in *result, or Py_None, if the mapping was
4840 undefined (in which case no character was written).
4841 The called must decref result.
4842 Return 0 on success, -1 on error. */
4843static
Walter Dörwald4894c302003-10-24 14:25:28 +00004844int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004845 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004846 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004847{
Walter Dörwald4894c302003-10-24 14:25:28 +00004848 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004849 return -1;
4850 if (*res==NULL) {
4851 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004852 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004853 }
4854 else if (*res==Py_None)
4855 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00004856 else if (PyLong_Check(*res)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004857 /* no overflow check, because we know that the space is enough */
Christian Heimes217cfd12007-12-02 14:31:20 +00004858 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004859 }
4860 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004861 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004862 if (repsize==1) {
4863 /* no overflow check, because we know that the space is enough */
4864 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4865 }
4866 else if (repsize!=0) {
4867 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004868 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004869 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004870 repsize - 1;
4871 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004872 return -1;
4873 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4874 *outp += repsize;
4875 }
4876 }
4877 else
4878 return -1;
4879 return 0;
4880}
4881
4882PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004883 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004884 PyObject *mapping,
4885 const char *errors)
4886{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004887 /* output object */
4888 PyObject *res = NULL;
4889 /* pointers to the beginning and end+1 of input */
4890 const Py_UNICODE *startp = p;
4891 const Py_UNICODE *endp = p + size;
4892 /* pointer into the output */
4893 Py_UNICODE *str;
4894 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004895 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004896 char *reason = "character maps to <undefined>";
4897 PyObject *errorHandler = NULL;
4898 PyObject *exc = NULL;
4899 /* the following variable is used for caching string comparisons
4900 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4901 * 3=ignore, 4=xmlcharrefreplace */
4902 int known_errorHandler = -1;
4903
Guido van Rossumd57fd912000-03-10 22:53:23 +00004904 if (mapping == NULL) {
4905 PyErr_BadArgument();
4906 return NULL;
4907 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004908
4909 /* allocate enough for a simple 1:1 translation without
4910 replacements, if we need more, we'll resize */
4911 res = PyUnicode_FromUnicode(NULL, size);
4912 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004913 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004914 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004915 return res;
4916 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004917
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004918 while (p<endp) {
4919 /* try to encode it */
4920 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004921 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004922 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004923 goto onError;
4924 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004925 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004926 if (x!=Py_None) /* it worked => adjust input pointer */
4927 ++p;
4928 else { /* untranslatable character */
4929 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004930 Py_ssize_t repsize;
4931 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004932 Py_UNICODE *uni2;
4933 /* startpos for collecting untranslatable chars */
4934 const Py_UNICODE *collstart = p;
4935 const Py_UNICODE *collend = p+1;
4936 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004937
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004938 /* find all untranslatable characters */
4939 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004940 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004941 goto onError;
4942 Py_XDECREF(x);
4943 if (x!=Py_None)
4944 break;
4945 ++collend;
4946 }
4947 /* cache callback name lookup
4948 * (if not done yet, i.e. it's the first error) */
4949 if (known_errorHandler==-1) {
4950 if ((errors==NULL) || (!strcmp(errors, "strict")))
4951 known_errorHandler = 1;
4952 else if (!strcmp(errors, "replace"))
4953 known_errorHandler = 2;
4954 else if (!strcmp(errors, "ignore"))
4955 known_errorHandler = 3;
4956 else if (!strcmp(errors, "xmlcharrefreplace"))
4957 known_errorHandler = 4;
4958 else
4959 known_errorHandler = 0;
4960 }
4961 switch (known_errorHandler) {
4962 case 1: /* strict */
4963 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4964 goto onError;
4965 case 2: /* replace */
4966 /* No need to check for space, this is a 1:1 replacement */
4967 for (coll = collstart; coll<collend; ++coll)
4968 *str++ = '?';
4969 /* fall through */
4970 case 3: /* ignore */
4971 p = collend;
4972 break;
4973 case 4: /* xmlcharrefreplace */
4974 /* generate replacement (temporarily (mis)uses p) */
4975 for (p = collstart; p < collend; ++p) {
4976 char buffer[2+29+1+1];
4977 char *cp;
4978 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004979 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004980 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4981 goto onError;
4982 for (cp = buffer; *cp; ++cp)
4983 *str++ = *cp;
4984 }
4985 p = collend;
4986 break;
4987 default:
4988 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4989 reason, startp, size, &exc,
4990 collstart-startp, collend-startp, &newpos);
4991 if (repunicode == NULL)
4992 goto onError;
4993 /* generate replacement */
4994 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004995 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004996 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4997 Py_DECREF(repunicode);
4998 goto onError;
4999 }
5000 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5001 *str++ = *uni2;
5002 p = startp + newpos;
5003 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005004 }
5005 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005006 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005007 /* Resize if we allocated to much */
5008 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005009 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00005010 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005011 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005012 }
5013 Py_XDECREF(exc);
5014 Py_XDECREF(errorHandler);
5015 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005016
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005017 onError:
5018 Py_XDECREF(res);
5019 Py_XDECREF(exc);
5020 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005021 return NULL;
5022}
5023
5024PyObject *PyUnicode_Translate(PyObject *str,
5025 PyObject *mapping,
5026 const char *errors)
5027{
5028 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005029
Guido van Rossumd57fd912000-03-10 22:53:23 +00005030 str = PyUnicode_FromObject(str);
5031 if (str == NULL)
5032 goto onError;
5033 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
5034 PyUnicode_GET_SIZE(str),
5035 mapping,
5036 errors);
5037 Py_DECREF(str);
5038 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005039
Guido van Rossumd57fd912000-03-10 22:53:23 +00005040 onError:
5041 Py_XDECREF(str);
5042 return NULL;
5043}
Tim Petersced69f82003-09-16 20:30:58 +00005044
Guido van Rossum9e896b32000-04-05 20:11:21 +00005045/* --- Decimal Encoder ---------------------------------------------------- */
5046
5047int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005048 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00005049 char *output,
5050 const char *errors)
5051{
5052 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005053 PyObject *errorHandler = NULL;
5054 PyObject *exc = NULL;
5055 const char *encoding = "decimal";
5056 const char *reason = "invalid decimal Unicode string";
5057 /* the following variable is used for caching string comparisons
5058 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5059 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005060
5061 if (output == NULL) {
5062 PyErr_BadArgument();
5063 return -1;
5064 }
5065
5066 p = s;
5067 end = s + length;
5068 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005069 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005070 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005071 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005072 Py_ssize_t repsize;
5073 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005074 Py_UNICODE *uni2;
5075 Py_UNICODE *collstart;
5076 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005077
Guido van Rossum9e896b32000-04-05 20:11:21 +00005078 if (Py_UNICODE_ISSPACE(ch)) {
5079 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005080 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005081 continue;
5082 }
5083 decimal = Py_UNICODE_TODECIMAL(ch);
5084 if (decimal >= 0) {
5085 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005086 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005087 continue;
5088 }
Guido van Rossumba477042000-04-06 18:18:10 +00005089 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00005090 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005091 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005092 continue;
5093 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005094 /* All other characters are considered unencodable */
5095 collstart = p;
5096 collend = p+1;
5097 while (collend < end) {
5098 if ((0 < *collend && *collend < 256) ||
5099 !Py_UNICODE_ISSPACE(*collend) ||
5100 Py_UNICODE_TODECIMAL(*collend))
5101 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005102 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005103 /* cache callback name lookup
5104 * (if not done yet, i.e. it's the first error) */
5105 if (known_errorHandler==-1) {
5106 if ((errors==NULL) || (!strcmp(errors, "strict")))
5107 known_errorHandler = 1;
5108 else if (!strcmp(errors, "replace"))
5109 known_errorHandler = 2;
5110 else if (!strcmp(errors, "ignore"))
5111 known_errorHandler = 3;
5112 else if (!strcmp(errors, "xmlcharrefreplace"))
5113 known_errorHandler = 4;
5114 else
5115 known_errorHandler = 0;
5116 }
5117 switch (known_errorHandler) {
5118 case 1: /* strict */
5119 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5120 goto onError;
5121 case 2: /* replace */
5122 for (p = collstart; p < collend; ++p)
5123 *output++ = '?';
5124 /* fall through */
5125 case 3: /* ignore */
5126 p = collend;
5127 break;
5128 case 4: /* xmlcharrefreplace */
5129 /* generate replacement (temporarily (mis)uses p) */
5130 for (p = collstart; p < collend; ++p)
5131 output += sprintf(output, "&#%d;", (int)*p);
5132 p = collend;
5133 break;
5134 default:
5135 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5136 encoding, reason, s, length, &exc,
5137 collstart-s, collend-s, &newpos);
5138 if (repunicode == NULL)
5139 goto onError;
5140 /* generate replacement */
5141 repsize = PyUnicode_GET_SIZE(repunicode);
5142 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5143 Py_UNICODE ch = *uni2;
5144 if (Py_UNICODE_ISSPACE(ch))
5145 *output++ = ' ';
5146 else {
5147 decimal = Py_UNICODE_TODECIMAL(ch);
5148 if (decimal >= 0)
5149 *output++ = '0' + decimal;
5150 else if (0 < ch && ch < 256)
5151 *output++ = (char)ch;
5152 else {
5153 Py_DECREF(repunicode);
5154 raise_encode_exception(&exc, encoding,
5155 s, length, collstart-s, collend-s, reason);
5156 goto onError;
5157 }
5158 }
5159 }
5160 p = s + newpos;
5161 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005162 }
5163 }
5164 /* 0-terminate the output string */
5165 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005166 Py_XDECREF(exc);
5167 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005168 return 0;
5169
5170 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005171 Py_XDECREF(exc);
5172 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005173 return -1;
5174}
5175
Guido van Rossumd57fd912000-03-10 22:53:23 +00005176/* --- Helpers ------------------------------------------------------------ */
5177
Eric Smith8c663262007-08-25 02:26:07 +00005178#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005179#include "stringlib/fastsearch.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005180#include "stringlib/count.h"
Christian Heimes9cd17752007-11-18 19:35:23 +00005181/* Include _ParseTupleFinds from find.h */
5182#define FROM_UNICODE
Thomas Wouters477c8d52006-05-27 19:21:47 +00005183#include "stringlib/find.h"
5184#include "stringlib/partition.h"
5185
/* Helper macro: clamp the local slice bounds `start` and `end` against
   (obj)->length.  Negative values are first offset by the length and
   then floored at 0; `end` is additionally capped at the length.
   Expects variables named start and end in the expanding scope. */
#define FIX_START_END(obj)                      \
    if (start < 0) {                            \
        start += (obj)->length;                 \
        if (start < 0)                          \
            start = 0;                          \
    }                                           \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    if (end < 0) {                              \
        end += (obj)->length;                   \
        if (end < 0)                            \
            end = 0;                            \
    }
5198
Martin v. Löwis18e16552006-02-15 17:27:45 +00005199Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005200 PyObject *substr,
5201 Py_ssize_t start,
5202 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005203{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005204 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005205 PyUnicodeObject* str_obj;
5206 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005207
Thomas Wouters477c8d52006-05-27 19:21:47 +00005208 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5209 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005210 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005211 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5212 if (!sub_obj) {
5213 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005214 return -1;
5215 }
Tim Petersced69f82003-09-16 20:30:58 +00005216
Thomas Wouters477c8d52006-05-27 19:21:47 +00005217 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005218
Thomas Wouters477c8d52006-05-27 19:21:47 +00005219 result = stringlib_count(
5220 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5221 );
5222
5223 Py_DECREF(sub_obj);
5224 Py_DECREF(str_obj);
5225
Guido van Rossumd57fd912000-03-10 22:53:23 +00005226 return result;
5227}
5228
Martin v. Löwis18e16552006-02-15 17:27:45 +00005229Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005230 PyObject *sub,
5231 Py_ssize_t start,
5232 Py_ssize_t end,
5233 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005234{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005235 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005236
Guido van Rossumd57fd912000-03-10 22:53:23 +00005237 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005238 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005239 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005240 sub = PyUnicode_FromObject(sub);
5241 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00005242 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005243 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005244 }
Tim Petersced69f82003-09-16 20:30:58 +00005245
Thomas Wouters477c8d52006-05-27 19:21:47 +00005246 if (direction > 0)
5247 result = stringlib_find_slice(
5248 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5249 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5250 start, end
5251 );
5252 else
5253 result = stringlib_rfind_slice(
5254 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5255 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5256 start, end
5257 );
5258
Guido van Rossumd57fd912000-03-10 22:53:23 +00005259 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005260 Py_DECREF(sub);
5261
Guido van Rossumd57fd912000-03-10 22:53:23 +00005262 return result;
5263}
5264
Tim Petersced69f82003-09-16 20:30:58 +00005265static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005266int tailmatch(PyUnicodeObject *self,
5267 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005268 Py_ssize_t start,
5269 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005270 int direction)
5271{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005272 if (substring->length == 0)
5273 return 1;
5274
Thomas Wouters477c8d52006-05-27 19:21:47 +00005275 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005276
5277 end -= substring->length;
5278 if (end < start)
5279 return 0;
5280
5281 if (direction > 0) {
5282 if (Py_UNICODE_MATCH(self, end, substring))
5283 return 1;
5284 } else {
5285 if (Py_UNICODE_MATCH(self, start, substring))
5286 return 1;
5287 }
5288
5289 return 0;
5290}
5291
Martin v. Löwis18e16552006-02-15 17:27:45 +00005292Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005293 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005294 Py_ssize_t start,
5295 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005296 int direction)
5297{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005298 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005299
Guido van Rossumd57fd912000-03-10 22:53:23 +00005300 str = PyUnicode_FromObject(str);
5301 if (str == NULL)
5302 return -1;
5303 substr = PyUnicode_FromObject(substr);
5304 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005305 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005306 return -1;
5307 }
Tim Petersced69f82003-09-16 20:30:58 +00005308
Guido van Rossumd57fd912000-03-10 22:53:23 +00005309 result = tailmatch((PyUnicodeObject *)str,
5310 (PyUnicodeObject *)substr,
5311 start, end, direction);
5312 Py_DECREF(str);
5313 Py_DECREF(substr);
5314 return result;
5315}
5316
Guido van Rossumd57fd912000-03-10 22:53:23 +00005317/* Apply fixfct filter to the Unicode object self and return a
5318 reference to the modified object */
5319
Tim Petersced69f82003-09-16 20:30:58 +00005320static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005321PyObject *fixup(PyUnicodeObject *self,
5322 int (*fixfct)(PyUnicodeObject *s))
5323{
5324
5325 PyUnicodeObject *u;
5326
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005327 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005328 if (u == NULL)
5329 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005330
5331 Py_UNICODE_COPY(u->str, self->str, self->length);
5332
Tim Peters7a29bd52001-09-12 03:03:31 +00005333 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005334 /* fixfct should return TRUE if it modified the buffer. If
5335 FALSE, return a reference to the original buffer instead
5336 (to save space, not time) */
5337 Py_INCREF(self);
5338 Py_DECREF(u);
5339 return (PyObject*) self;
5340 }
5341 return (PyObject*) u;
5342}
5343
Tim Petersced69f82003-09-16 20:30:58 +00005344static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005345int fixupper(PyUnicodeObject *self)
5346{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005347 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005348 Py_UNICODE *s = self->str;
5349 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005350
Guido van Rossumd57fd912000-03-10 22:53:23 +00005351 while (len-- > 0) {
5352 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005353
Guido van Rossumd57fd912000-03-10 22:53:23 +00005354 ch = Py_UNICODE_TOUPPER(*s);
5355 if (ch != *s) {
5356 status = 1;
5357 *s = ch;
5358 }
5359 s++;
5360 }
5361
5362 return status;
5363}
5364
Tim Petersced69f82003-09-16 20:30:58 +00005365static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005366int fixlower(PyUnicodeObject *self)
5367{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005368 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005369 Py_UNICODE *s = self->str;
5370 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005371
Guido van Rossumd57fd912000-03-10 22:53:23 +00005372 while (len-- > 0) {
5373 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005374
Guido van Rossumd57fd912000-03-10 22:53:23 +00005375 ch = Py_UNICODE_TOLOWER(*s);
5376 if (ch != *s) {
5377 status = 1;
5378 *s = ch;
5379 }
5380 s++;
5381 }
5382
5383 return status;
5384}
5385
Tim Petersced69f82003-09-16 20:30:58 +00005386static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005387int fixswapcase(PyUnicodeObject *self)
5388{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005389 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005390 Py_UNICODE *s = self->str;
5391 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005392
Guido van Rossumd57fd912000-03-10 22:53:23 +00005393 while (len-- > 0) {
5394 if (Py_UNICODE_ISUPPER(*s)) {
5395 *s = Py_UNICODE_TOLOWER(*s);
5396 status = 1;
5397 } else if (Py_UNICODE_ISLOWER(*s)) {
5398 *s = Py_UNICODE_TOUPPER(*s);
5399 status = 1;
5400 }
5401 s++;
5402 }
5403
5404 return status;
5405}
5406
Tim Petersced69f82003-09-16 20:30:58 +00005407static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005408int fixcapitalize(PyUnicodeObject *self)
5409{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005410 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005411 Py_UNICODE *s = self->str;
5412 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005413
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005414 if (len == 0)
5415 return 0;
5416 if (Py_UNICODE_ISLOWER(*s)) {
5417 *s = Py_UNICODE_TOUPPER(*s);
5418 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005419 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005420 s++;
5421 while (--len > 0) {
5422 if (Py_UNICODE_ISUPPER(*s)) {
5423 *s = Py_UNICODE_TOLOWER(*s);
5424 status = 1;
5425 }
5426 s++;
5427 }
5428 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005429}
5430
5431static
5432int fixtitle(PyUnicodeObject *self)
5433{
5434 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5435 register Py_UNICODE *e;
5436 int previous_is_cased;
5437
5438 /* Shortcut for single character strings */
5439 if (PyUnicode_GET_SIZE(self) == 1) {
5440 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5441 if (*p != ch) {
5442 *p = ch;
5443 return 1;
5444 }
5445 else
5446 return 0;
5447 }
Tim Petersced69f82003-09-16 20:30:58 +00005448
Guido van Rossumd57fd912000-03-10 22:53:23 +00005449 e = p + PyUnicode_GET_SIZE(self);
5450 previous_is_cased = 0;
5451 for (; p < e; p++) {
5452 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005453
Guido van Rossumd57fd912000-03-10 22:53:23 +00005454 if (previous_is_cased)
5455 *p = Py_UNICODE_TOLOWER(ch);
5456 else
5457 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005458
5459 if (Py_UNICODE_ISLOWER(ch) ||
5460 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005461 Py_UNICODE_ISTITLE(ch))
5462 previous_is_cased = 1;
5463 else
5464 previous_is_cased = 0;
5465 }
5466 return 1;
5467}
5468
Tim Peters8ce9f162004-08-27 01:49:32 +00005469PyObject *
5470PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005471{
Tim Peters8ce9f162004-08-27 01:49:32 +00005472 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005473 const Py_UNICODE blank = ' ';
5474 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005475 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005476 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005477 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5478 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005479 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5480 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005481 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005482 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005483 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005484
Tim Peters05eba1f2004-08-27 21:32:02 +00005485 fseq = PySequence_Fast(seq, "");
5486 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005487 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005488 }
5489
Tim Peters91879ab2004-08-27 22:35:44 +00005490 /* Grrrr. A codec may be invoked to convert str objects to
5491 * Unicode, and so it's possible to call back into Python code
5492 * during PyUnicode_FromObject(), and so it's possible for a sick
5493 * codec to change the size of fseq (if seq is a list). Therefore
5494 * we have to keep refetching the size -- can't assume seqlen
5495 * is invariant.
5496 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005497 seqlen = PySequence_Fast_GET_SIZE(fseq);
5498 /* If empty sequence, return u"". */
5499 if (seqlen == 0) {
5500 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5501 goto Done;
5502 }
5503 /* If singleton sequence with an exact Unicode, return that. */
5504 if (seqlen == 1) {
5505 item = PySequence_Fast_GET_ITEM(fseq, 0);
5506 if (PyUnicode_CheckExact(item)) {
5507 Py_INCREF(item);
5508 res = (PyUnicodeObject *)item;
5509 goto Done;
5510 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005511 }
5512
Tim Peters05eba1f2004-08-27 21:32:02 +00005513 /* At least two items to join, or one that isn't exact Unicode. */
5514 if (seqlen > 1) {
5515 /* Set up sep and seplen -- they're needed. */
5516 if (separator == NULL) {
5517 sep = &blank;
5518 seplen = 1;
5519 }
5520 else {
5521 internal_separator = PyUnicode_FromObject(separator);
5522 if (internal_separator == NULL)
5523 goto onError;
5524 sep = PyUnicode_AS_UNICODE(internal_separator);
5525 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00005526 /* In case PyUnicode_FromObject() mutated seq. */
5527 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005528 }
5529 }
5530
5531 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005532 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005533 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005534 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005535 res_p = PyUnicode_AS_UNICODE(res);
5536 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005537
Tim Peters05eba1f2004-08-27 21:32:02 +00005538 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005539 Py_ssize_t itemlen;
5540 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005541
5542 item = PySequence_Fast_GET_ITEM(fseq, i);
5543 /* Convert item to Unicode. */
Guido van Rossum98297ee2007-11-06 21:34:58 +00005544 if (!PyUnicode_Check(item)) {
5545 PyErr_Format(PyExc_TypeError,
5546 "sequence item %zd: expected str instance,"
5547 " %.80s found",
Christian Heimes90aa7642007-12-19 02:45:37 +00005548 i, Py_TYPE(item)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005549 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005550 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005551 item = PyUnicode_FromObject(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005552 if (item == NULL)
5553 goto onError;
5554 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005555
Tim Peters91879ab2004-08-27 22:35:44 +00005556 /* In case PyUnicode_FromObject() mutated seq. */
5557 seqlen = PySequence_Fast_GET_SIZE(fseq);
5558
Tim Peters8ce9f162004-08-27 01:49:32 +00005559 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005560 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005561 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005562 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005563 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005564 if (i < seqlen - 1) {
5565 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005566 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00005567 goto Overflow;
5568 }
5569 if (new_res_used > res_alloc) {
5570 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00005571 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00005572 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005573 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005574 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005575 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005576 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005577 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005578 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005579 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005580 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005581 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005582
5583 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005584 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005585 res_p += itemlen;
5586 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005587 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005588 res_p += seplen;
5589 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005590 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005591 res_used = new_res_used;
5592 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005593
Tim Peters05eba1f2004-08-27 21:32:02 +00005594 /* Shrink res to match the used area; this probably can't fail,
5595 * but it's cheap to check.
5596 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005597 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005598 goto onError;
5599
5600 Done:
5601 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005602 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005603 return (PyObject *)res;
5604
Tim Peters8ce9f162004-08-27 01:49:32 +00005605 Overflow:
5606 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005607 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005608 Py_DECREF(item);
5609 /* fall through */
5610
Guido van Rossumd57fd912000-03-10 22:53:23 +00005611 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005612 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005613 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005614 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005615 return NULL;
5616}
5617
Tim Petersced69f82003-09-16 20:30:58 +00005618static
5619PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005620 Py_ssize_t left,
5621 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005622 Py_UNICODE fill)
5623{
5624 PyUnicodeObject *u;
5625
5626 if (left < 0)
5627 left = 0;
5628 if (right < 0)
5629 right = 0;
5630
Tim Peters7a29bd52001-09-12 03:03:31 +00005631 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005632 Py_INCREF(self);
5633 return self;
5634 }
5635
5636 u = _PyUnicode_New(left + self->length + right);
5637 if (u) {
5638 if (left)
5639 Py_UNICODE_FILL(u->str, fill, left);
5640 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5641 if (right)
5642 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5643 }
5644
5645 return u;
5646}
5647
/* Append the slice data[left:right] to `list` as a new unicode object.

   Relies on the expanding function providing a `PyObject *str`
   scratch variable, a `PyObject *list` and an `onError` label, which is
   jumped to if the slice cannot be created or appended.  Wrapped in
   do { } while (0) so that it behaves as a single statement; invoke it
   with a trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
5658
5659static
5660PyObject *split_whitespace(PyUnicodeObject *self,
5661 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005662 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005663{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005664 register Py_ssize_t i;
5665 register Py_ssize_t j;
5666 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005667 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005668 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005669
5670 for (i = j = 0; i < len; ) {
5671 /* find a token */
Christian Heimes190d79e2008-01-30 11:58:22 +00005672 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005673 i++;
5674 j = i;
Christian Heimes190d79e2008-01-30 11:58:22 +00005675 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005676 i++;
5677 if (j < i) {
5678 if (maxcount-- <= 0)
5679 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005680 SPLIT_APPEND(buf, j, i);
5681 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005682 i++;
5683 j = i;
5684 }
5685 }
5686 if (j < len) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005687 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005688 }
5689 return list;
5690
5691 onError:
5692 Py_DECREF(list);
5693 return NULL;
5694}
5695
5696PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005697 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005698{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005699 register Py_ssize_t i;
5700 register Py_ssize_t j;
5701 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005702 PyObject *list;
5703 PyObject *str;
5704 Py_UNICODE *data;
5705
5706 string = PyUnicode_FromObject(string);
5707 if (string == NULL)
5708 return NULL;
5709 data = PyUnicode_AS_UNICODE(string);
5710 len = PyUnicode_GET_SIZE(string);
5711
Guido van Rossumd57fd912000-03-10 22:53:23 +00005712 list = PyList_New(0);
5713 if (!list)
5714 goto onError;
5715
5716 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005717 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005718
Guido van Rossumd57fd912000-03-10 22:53:23 +00005719 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005720 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005721 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005722
5723 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005724 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005725 if (i < len) {
5726 if (data[i] == '\r' && i + 1 < len &&
5727 data[i+1] == '\n')
5728 i += 2;
5729 else
5730 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005731 if (keepends)
5732 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005733 }
Guido van Rossum86662912000-04-11 15:38:46 +00005734 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005735 j = i;
5736 }
5737 if (j < len) {
5738 SPLIT_APPEND(data, j, len);
5739 }
5740
5741 Py_DECREF(string);
5742 return list;
5743
5744 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005745 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005746 Py_DECREF(string);
5747 return NULL;
5748}
5749
Tim Petersced69f82003-09-16 20:30:58 +00005750static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005751PyObject *split_char(PyUnicodeObject *self,
5752 PyObject *list,
5753 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005754 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005755{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005756 register Py_ssize_t i;
5757 register Py_ssize_t j;
5758 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005759 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005760 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005761
5762 for (i = j = 0; i < len; ) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005763 if (buf[i] == ch) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005764 if (maxcount-- <= 0)
5765 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005766 SPLIT_APPEND(buf, j, i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005767 i = j = i + 1;
5768 } else
5769 i++;
5770 }
5771 if (j <= len) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005772 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005773 }
5774 return list;
5775
5776 onError:
5777 Py_DECREF(list);
5778 return NULL;
5779}
5780
Tim Petersced69f82003-09-16 20:30:58 +00005781static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005782PyObject *split_substring(PyUnicodeObject *self,
5783 PyObject *list,
5784 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005785 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005786{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005787 register Py_ssize_t i;
5788 register Py_ssize_t j;
5789 Py_ssize_t len = self->length;
5790 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005791 PyObject *str;
5792
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005793 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005794 if (Py_UNICODE_MATCH(self, i, substring)) {
5795 if (maxcount-- <= 0)
5796 break;
5797 SPLIT_APPEND(self->str, j, i);
5798 i = j = i + sublen;
5799 } else
5800 i++;
5801 }
5802 if (j <= len) {
5803 SPLIT_APPEND(self->str, j, len);
5804 }
5805 return list;
5806
5807 onError:
5808 Py_DECREF(list);
5809 return NULL;
5810}
5811
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005812static
5813PyObject *rsplit_whitespace(PyUnicodeObject *self,
5814 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005815 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005816{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005817 register Py_ssize_t i;
5818 register Py_ssize_t j;
5819 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005820 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005821 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005822
5823 for (i = j = len - 1; i >= 0; ) {
5824 /* find a token */
Christian Heimes190d79e2008-01-30 11:58:22 +00005825 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005826 i--;
5827 j = i;
Christian Heimes190d79e2008-01-30 11:58:22 +00005828 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005829 i--;
5830 if (j > i) {
5831 if (maxcount-- <= 0)
5832 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005833 SPLIT_APPEND(buf, i + 1, j + 1);
5834 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005835 i--;
5836 j = i;
5837 }
5838 }
5839 if (j >= 0) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005840 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005841 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005842 if (PyList_Reverse(list) < 0)
5843 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005844 return list;
5845
5846 onError:
5847 Py_DECREF(list);
5848 return NULL;
5849}
5850
5851static
5852PyObject *rsplit_char(PyUnicodeObject *self,
5853 PyObject *list,
5854 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005855 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005856{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005857 register Py_ssize_t i;
5858 register Py_ssize_t j;
5859 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005860 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005861 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005862
5863 for (i = j = len - 1; i >= 0; ) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005864 if (buf[i] == ch) {
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005865 if (maxcount-- <= 0)
5866 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005867 SPLIT_APPEND(buf, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005868 j = i = i - 1;
5869 } else
5870 i--;
5871 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005872 if (j >= -1) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005873 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005874 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005875 if (PyList_Reverse(list) < 0)
5876 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005877 return list;
5878
5879 onError:
5880 Py_DECREF(list);
5881 return NULL;
5882}
5883
5884static
5885PyObject *rsplit_substring(PyUnicodeObject *self,
5886 PyObject *list,
5887 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005888 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005889{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005890 register Py_ssize_t i;
5891 register Py_ssize_t j;
5892 Py_ssize_t len = self->length;
5893 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005894 PyObject *str;
5895
5896 for (i = len - sublen, j = len; i >= 0; ) {
5897 if (Py_UNICODE_MATCH(self, i, substring)) {
5898 if (maxcount-- <= 0)
5899 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005900 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005901 j = i;
5902 i -= sublen;
5903 } else
5904 i--;
5905 }
5906 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005907 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005908 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005909 if (PyList_Reverse(list) < 0)
5910 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005911 return list;
5912
5913 onError:
5914 Py_DECREF(list);
5915 return NULL;
5916}
5917
Guido van Rossumd57fd912000-03-10 22:53:23 +00005918#undef SPLIT_APPEND
5919
5920static
5921PyObject *split(PyUnicodeObject *self,
5922 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005923 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005924{
5925 PyObject *list;
5926
5927 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005928 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005929
5930 list = PyList_New(0);
5931 if (!list)
5932 return NULL;
5933
5934 if (substring == NULL)
5935 return split_whitespace(self,list,maxcount);
5936
5937 else if (substring->length == 1)
5938 return split_char(self,list,substring->str[0],maxcount);
5939
5940 else if (substring->length == 0) {
5941 Py_DECREF(list);
5942 PyErr_SetString(PyExc_ValueError, "empty separator");
5943 return NULL;
5944 }
5945 else
5946 return split_substring(self,list,substring,maxcount);
5947}
5948
Tim Petersced69f82003-09-16 20:30:58 +00005949static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005950PyObject *rsplit(PyUnicodeObject *self,
5951 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005952 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005953{
5954 PyObject *list;
5955
5956 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005957 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005958
5959 list = PyList_New(0);
5960 if (!list)
5961 return NULL;
5962
5963 if (substring == NULL)
5964 return rsplit_whitespace(self,list,maxcount);
5965
5966 else if (substring->length == 1)
5967 return rsplit_char(self,list,substring->str[0],maxcount);
5968
5969 else if (substring->length == 0) {
5970 Py_DECREF(list);
5971 PyErr_SetString(PyExc_ValueError, "empty separator");
5972 return NULL;
5973 }
5974 else
5975 return rsplit_substring(self,list,substring,maxcount);
5976}
5977
5978static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005979PyObject *replace(PyUnicodeObject *self,
5980 PyUnicodeObject *str1,
5981 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005982 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005983{
5984 PyUnicodeObject *u;
5985
5986 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005987 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005988
Thomas Wouters477c8d52006-05-27 19:21:47 +00005989 if (str1->length == str2->length) {
5990 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005991 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005992 if (str1->length == 1) {
5993 /* replace characters */
5994 Py_UNICODE u1, u2;
5995 if (!findchar(self->str, self->length, str1->str[0]))
5996 goto nothing;
5997 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5998 if (!u)
5999 return NULL;
6000 Py_UNICODE_COPY(u->str, self->str, self->length);
6001 u1 = str1->str[0];
6002 u2 = str2->str[0];
6003 for (i = 0; i < u->length; i++)
6004 if (u->str[i] == u1) {
6005 if (--maxcount < 0)
6006 break;
6007 u->str[i] = u2;
6008 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006009 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006010 i = fastsearch(
6011 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00006012 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006013 if (i < 0)
6014 goto nothing;
6015 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6016 if (!u)
6017 return NULL;
6018 Py_UNICODE_COPY(u->str, self->str, self->length);
6019 while (i <= self->length - str1->length)
6020 if (Py_UNICODE_MATCH(self, i, str1)) {
6021 if (--maxcount < 0)
6022 break;
6023 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6024 i += str1->length;
6025 } else
6026 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006027 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006029
6030 Py_ssize_t n, i, j, e;
6031 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006032 Py_UNICODE *p;
6033
6034 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006035 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006036 if (n > maxcount)
6037 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006038 if (n == 0)
6039 goto nothing;
6040 /* new_size = self->length + n * (str2->length - str1->length)); */
6041 delta = (str2->length - str1->length);
6042 if (delta == 0) {
6043 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006044 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006045 product = n * (str2->length - str1->length);
6046 if ((product / (str2->length - str1->length)) != n) {
6047 PyErr_SetString(PyExc_OverflowError,
6048 "replace string is too long");
6049 return NULL;
6050 }
6051 new_size = self->length + product;
6052 if (new_size < 0) {
6053 PyErr_SetString(PyExc_OverflowError,
6054 "replace string is too long");
6055 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006056 }
6057 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006058 u = _PyUnicode_New(new_size);
6059 if (!u)
6060 return NULL;
6061 i = 0;
6062 p = u->str;
6063 e = self->length - str1->length;
6064 if (str1->length > 0) {
6065 while (n-- > 0) {
6066 /* look for next match */
6067 j = i;
6068 while (j <= e) {
6069 if (Py_UNICODE_MATCH(self, j, str1))
6070 break;
6071 j++;
6072 }
6073 if (j > i) {
6074 if (j > e)
6075 break;
6076 /* copy unchanged part [i:j] */
6077 Py_UNICODE_COPY(p, self->str+i, j-i);
6078 p += j - i;
6079 }
6080 /* copy substitution string */
6081 if (str2->length > 0) {
6082 Py_UNICODE_COPY(p, str2->str, str2->length);
6083 p += str2->length;
6084 }
6085 i = j + str1->length;
6086 }
6087 if (i < self->length)
6088 /* copy tail [i:] */
6089 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6090 } else {
6091 /* interleave */
6092 while (n > 0) {
6093 Py_UNICODE_COPY(p, str2->str, str2->length);
6094 p += str2->length;
6095 if (--n <= 0)
6096 break;
6097 *p++ = self->str[i++];
6098 }
6099 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6100 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006101 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006102 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006103
6104nothing:
6105 /* nothing to replace; return original string (when possible) */
6106 if (PyUnicode_CheckExact(self)) {
6107 Py_INCREF(self);
6108 return (PyObject *) self;
6109 }
6110 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006111}
6112
6113/* --- Unicode Object Methods --------------------------------------------- */
6114
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006115PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006116"S.title() -> unicode\n\
6117\n\
6118Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006119characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006120
6121static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006122unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006123{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006124 return fixup(self, fixtitle);
6125}
6126
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006127PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006128"S.capitalize() -> unicode\n\
6129\n\
6130Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006131have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006132
6133static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006134unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006135{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006136 return fixup(self, fixcapitalize);
6137}
6138
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out): split on whitespace, capitalize each word
   with fixup()/fixcapitalize, then re-join with single blanks. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list slots in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                            fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);

    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
6176
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006177/* Argument converter. Coerces to a single unicode character */
6178
6179static int
6180convert_uc(PyObject *obj, void *addr)
6181{
6182 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6183 PyObject *uniobj;
6184 Py_UNICODE *unistr;
6185
6186 uniobj = PyUnicode_FromObject(obj);
6187 if (uniobj == NULL) {
6188 PyErr_SetString(PyExc_TypeError,
6189 "The fill character cannot be converted to Unicode");
6190 return 0;
6191 }
6192 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6193 PyErr_SetString(PyExc_TypeError,
6194 "The fill character must be exactly one character long");
6195 Py_DECREF(uniobj);
6196 return 0;
6197 }
6198 unistr = PyUnicode_AS_UNICODE(uniobj);
6199 *fillcharloc = unistr[0];
6200 Py_DECREF(uniobj);
6201 return 1;
6202}
6203
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006204PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006205"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006206\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006207Return S centered in a Unicode string of length width. Padding is\n\
6208done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006209
6210static PyObject *
6211unicode_center(PyUnicodeObject *self, PyObject *args)
6212{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006213 Py_ssize_t marg, left;
6214 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006215 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006216
Thomas Woutersde017742006-02-16 19:34:37 +00006217 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006218 return NULL;
6219
Tim Peters7a29bd52001-09-12 03:03:31 +00006220 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006221 Py_INCREF(self);
6222 return (PyObject*) self;
6223 }
6224
6225 marg = width - self->length;
6226 left = marg / 2 + (marg & width & 1);
6227
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006228 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006229}
6230
Marc-André Lemburge5034372000-08-08 08:04:29 +00006231#if 0
6232
6233/* This code should go into some future Unicode collation support
6234 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006235 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006236
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006237/* speedy UTF-16 code point order comparison */
6238/* gleaned from: */
6239/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6240
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006241static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006242{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006243 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006244 0, 0, 0, 0, 0, 0, 0, 0,
6245 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006246 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006247};
6248
Guido van Rossumd57fd912000-03-10 22:53:23 +00006249static int
6250unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6251{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006252 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006253
Guido van Rossumd57fd912000-03-10 22:53:23 +00006254 Py_UNICODE *s1 = str1->str;
6255 Py_UNICODE *s2 = str2->str;
6256
6257 len1 = str1->length;
6258 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006259
Guido van Rossumd57fd912000-03-10 22:53:23 +00006260 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006261 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006262
6263 c1 = *s1++;
6264 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006265
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006266 if (c1 > (1<<11) * 26)
6267 c1 += utf16Fixup[c1>>11];
6268 if (c2 > (1<<11) * 26)
6269 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006270 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006271
6272 if (c1 != c2)
6273 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006274
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006275 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006276 }
6277
6278 return (len1 < len2) ? -1 : (len1 != len2);
6279}
6280
Marc-André Lemburge5034372000-08-08 08:04:29 +00006281#else
6282
6283static int
6284unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6285{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006286 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006287
6288 Py_UNICODE *s1 = str1->str;
6289 Py_UNICODE *s2 = str2->str;
6290
6291 len1 = str1->length;
6292 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006293
Marc-André Lemburge5034372000-08-08 08:04:29 +00006294 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006295 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006296
Fredrik Lundh45714e92001-06-26 16:39:36 +00006297 c1 = *s1++;
6298 c2 = *s2++;
6299
6300 if (c1 != c2)
6301 return (c1 < c2) ? -1 : 1;
6302
Marc-André Lemburge5034372000-08-08 08:04:29 +00006303 len1--; len2--;
6304 }
6305
6306 return (len1 < len2) ? -1 : (len1 != len2);
6307}
6308
6309#endif
6310
Guido van Rossumd57fd912000-03-10 22:53:23 +00006311int PyUnicode_Compare(PyObject *left,
6312 PyObject *right)
6313{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006314 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6315 return unicode_compare((PyUnicodeObject *)left,
6316 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006317 PyErr_Format(PyExc_TypeError,
6318 "Can't compare %.100s and %.100s",
6319 left->ob_type->tp_name,
6320 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006321 return -1;
6322}
6323
Martin v. Löwis5b222132007-06-10 09:51:05 +00006324int
6325PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6326{
6327 int i;
6328 Py_UNICODE *id;
6329 assert(PyUnicode_Check(uni));
6330 id = PyUnicode_AS_UNICODE(uni);
6331 /* Compare Unicode string and source character set string */
6332 for (i = 0; id[i] && str[i]; i++)
6333 if (id[i] != str[i])
6334 return ((int)id[i] < (int)str[i]) ? -1 : 1;
6335 if (id[i])
6336 return 1; /* uni is longer */
6337 if (str[i])
6338 return -1; /* str is longer */
6339 return 0;
6340}
6341
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006342PyObject *PyUnicode_RichCompare(PyObject *left,
6343 PyObject *right,
6344 int op)
6345{
6346 int result;
6347
6348 result = PyUnicode_Compare(left, right);
6349 if (result == -1 && PyErr_Occurred())
6350 goto onError;
6351
6352 /* Convert the return value to a Boolean */
6353 switch (op) {
6354 case Py_EQ:
6355 result = (result == 0);
6356 break;
6357 case Py_NE:
6358 result = (result != 0);
6359 break;
6360 case Py_LE:
6361 result = (result <= 0);
6362 break;
6363 case Py_GE:
6364 result = (result >= 0);
6365 break;
6366 case Py_LT:
6367 result = (result == -1);
6368 break;
6369 case Py_GT:
6370 result = (result == 1);
6371 break;
6372 }
6373 return PyBool_FromLong(result);
6374
6375 onError:
6376
6377 /* Standard case
6378
6379 Type errors mean that PyUnicode_FromObject() could not convert
6380 one of the arguments (usually the right hand side) to Unicode,
6381 ie. we can't handle the comparison request. However, it is
6382 possible that the other object knows a comparison method, which
6383 is why we return Py_NotImplemented to give the other object a
6384 chance.
6385
6386 */
6387 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6388 PyErr_Clear();
6389 Py_INCREF(Py_NotImplemented);
6390 return Py_NotImplemented;
6391 }
6392 if (op != Py_EQ && op != Py_NE)
6393 return NULL;
6394
6395 /* Equality comparison.
6396
6397 This is a special case: we silence any PyExc_UnicodeDecodeError
6398 and instead turn it into a PyErr_UnicodeWarning.
6399
6400 */
6401 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6402 return NULL;
6403 PyErr_Clear();
Skip Montanaro46fc3372007-08-12 11:44:53 +00006404 if (PyErr_WarnEx(PyExc_UnicodeWarning,
6405 (op == Py_EQ) ?
6406 "Unicode equal comparison "
6407 "failed to convert both arguments to Unicode - "
6408 "interpreting them as being unequal"
6409 :
6410 "Unicode unequal comparison "
6411 "failed to convert both arguments to Unicode - "
6412 "interpreting them as being unequal",
6413 1) < 0)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006414 return NULL;
6415 result = (op == Py_NE);
6416 return PyBool_FromLong(result);
6417}
6418
Guido van Rossum403d68b2000-03-13 15:55:09 +00006419int PyUnicode_Contains(PyObject *container,
6420 PyObject *element)
6421{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006422 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006423 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006424
6425 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006426 sub = PyUnicode_FromObject(element);
6427 if (!sub) {
Walter Dörwald26e0f512007-06-12 16:51:31 +00006428 PyErr_Format(PyExc_TypeError,
6429 "'in <string>' requires string as left operand, not %s",
6430 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006431 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006432 }
6433
Thomas Wouters477c8d52006-05-27 19:21:47 +00006434 str = PyUnicode_FromObject(container);
6435 if (!str) {
6436 Py_DECREF(sub);
6437 return -1;
6438 }
6439
6440 result = stringlib_contains_obj(str, sub);
6441
6442 Py_DECREF(str);
6443 Py_DECREF(sub);
6444
Guido van Rossum403d68b2000-03-13 15:55:09 +00006445 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006446}
6447
Guido van Rossumd57fd912000-03-10 22:53:23 +00006448/* Concat to string or Unicode object giving a new Unicode object. */
6449
6450PyObject *PyUnicode_Concat(PyObject *left,
6451 PyObject *right)
6452{
6453 PyUnicodeObject *u = NULL, *v = NULL, *w;
6454
6455 /* Coerce the two arguments */
6456 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6457 if (u == NULL)
6458 goto onError;
6459 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6460 if (v == NULL)
6461 goto onError;
6462
6463 /* Shortcuts */
6464 if (v == unicode_empty) {
6465 Py_DECREF(v);
6466 return (PyObject *)u;
6467 }
6468 if (u == unicode_empty) {
6469 Py_DECREF(u);
6470 return (PyObject *)v;
6471 }
6472
6473 /* Concat the two Unicode strings */
6474 w = _PyUnicode_New(u->length + v->length);
6475 if (w == NULL)
6476 goto onError;
6477 Py_UNICODE_COPY(w->str, u->str, u->length);
6478 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6479
6480 Py_DECREF(u);
6481 Py_DECREF(v);
6482 return (PyObject *)w;
6483
6484onError:
6485 Py_XDECREF(u);
6486 Py_XDECREF(v);
6487 return NULL;
6488}
6489
Walter Dörwald1ab83302007-05-18 17:15:44 +00006490void
6491PyUnicode_Append(PyObject **pleft, PyObject *right)
6492{
6493 PyObject *new;
6494 if (*pleft == NULL)
6495 return;
6496 if (right == NULL || !PyUnicode_Check(*pleft)) {
6497 Py_DECREF(*pleft);
6498 *pleft = NULL;
6499 return;
6500 }
6501 new = PyUnicode_Concat(*pleft, right);
6502 Py_DECREF(*pleft);
6503 *pleft = new;
6504}
6505
6506void
6507PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6508{
6509 PyUnicode_Append(pleft, right);
6510 Py_XDECREF(right);
6511}
6512
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006513PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006514"S.count(sub[, start[, end]]) -> int\n\
6515\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006516Return the number of non-overlapping occurrences of substring sub in\n\
6517Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006518interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006519
6520static PyObject *
6521unicode_count(PyUnicodeObject *self, PyObject *args)
6522{
6523 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006524 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006525 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006526 PyObject *result;
6527
Guido van Rossumb8872e62000-05-09 14:14:27 +00006528 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6529 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006530 return NULL;
6531
6532 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006533 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006534 if (substring == NULL)
6535 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006536
Thomas Wouters477c8d52006-05-27 19:21:47 +00006537 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006538
Christian Heimes217cfd12007-12-02 14:31:20 +00006539 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006540 stringlib_count(self->str + start, end - start,
6541 substring->str, substring->length)
6542 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006543
6544 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006545
Guido van Rossumd57fd912000-03-10 22:53:23 +00006546 return result;
6547}
6548
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006549PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006550"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006551\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006552Encodes S using the codec registered for encoding. encoding defaults\n\
6553to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006554handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006555a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6556'xmlcharrefreplace' as well as any other name registered with\n\
6557codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006558
6559static PyObject *
6560unicode_encode(PyUnicodeObject *self, PyObject *args)
6561{
6562 char *encoding = NULL;
6563 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006564 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00006565
Guido van Rossumd57fd912000-03-10 22:53:23 +00006566 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6567 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006568 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006569 if (v == NULL)
6570 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00006571 if (!PyString_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006572 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006573 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006574 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00006575 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006576 Py_DECREF(v);
6577 return NULL;
6578 }
6579 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006580
6581 onError:
6582 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006583}
6584
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006585PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006586"S.expandtabs([tabsize]) -> unicode\n\
6587\n\
6588Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006589If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006590
6591static PyObject*
6592unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6593{
6594 Py_UNICODE *e;
6595 Py_UNICODE *p;
6596 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006597 Py_UNICODE *qe;
6598 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006599 PyUnicodeObject *u;
6600 int tabsize = 8;
6601
6602 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6603 return NULL;
6604
Thomas Wouters7e474022000-07-16 12:04:32 +00006605 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006606 i = 0; /* chars up to and including most recent \n or \r */
6607 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6608 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006609 for (p = self->str; p < e; p++)
6610 if (*p == '\t') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006611 if (tabsize > 0) {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006612 incr = tabsize - (j % tabsize); /* cannot overflow */
6613 if (j > PY_SSIZE_T_MAX - incr)
6614 goto overflow1;
6615 j += incr;
6616 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006617 }
6618 else {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006619 if (j > PY_SSIZE_T_MAX - 1)
6620 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006621 j++;
6622 if (*p == '\n' || *p == '\r') {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006623 if (i > PY_SSIZE_T_MAX - j)
6624 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006625 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006626 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006627 }
6628 }
6629
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006630 if (i > PY_SSIZE_T_MAX - j)
6631 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006632
Guido van Rossumd57fd912000-03-10 22:53:23 +00006633 /* Second pass: create output string and fill it */
6634 u = _PyUnicode_New(i + j);
6635 if (!u)
6636 return NULL;
6637
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006638 j = 0; /* same as in first pass */
6639 q = u->str; /* next output char */
6640 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006641
6642 for (p = self->str; p < e; p++)
6643 if (*p == '\t') {
6644 if (tabsize > 0) {
6645 i = tabsize - (j % tabsize);
6646 j += i;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006647 while (i--) {
6648 if (q >= qe)
6649 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006650 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006651 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006652 }
6653 }
6654 else {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006655 if (q >= qe)
6656 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006657 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006658 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006659 if (*p == '\n' || *p == '\r')
6660 j = 0;
6661 }
6662
6663 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006664
6665 overflow2:
6666 Py_DECREF(u);
6667 overflow1:
6668 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6669 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006670}
6671
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006672PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006673"S.find(sub [,start [,end]]) -> int\n\
6674\n\
6675Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00006676such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006677arguments start and end are interpreted as in slice notation.\n\
6678\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006679Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006680
6681static PyObject *
6682unicode_find(PyUnicodeObject *self, PyObject *args)
6683{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006684 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00006685 Py_ssize_t start;
6686 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006687 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006688
Christian Heimes9cd17752007-11-18 19:35:23 +00006689 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006690 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006691
Thomas Wouters477c8d52006-05-27 19:21:47 +00006692 result = stringlib_find_slice(
6693 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6694 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6695 start, end
6696 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006697
6698 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006699
Christian Heimes217cfd12007-12-02 14:31:20 +00006700 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006701}
6702
6703static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006704unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006705{
6706 if (index < 0 || index >= self->length) {
6707 PyErr_SetString(PyExc_IndexError, "string index out of range");
6708 return NULL;
6709 }
6710
6711 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6712}
6713
Guido van Rossumc2504932007-09-18 19:42:40 +00006714/* Believe it or not, this produces the same value for ASCII strings
6715 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006716static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00006717unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006718{
Guido van Rossumc2504932007-09-18 19:42:40 +00006719 Py_ssize_t len;
6720 Py_UNICODE *p;
6721 long x;
6722
6723 if (self->hash != -1)
6724 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00006725 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00006726 p = self->str;
6727 x = *p << 7;
6728 while (--len >= 0)
6729 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00006730 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00006731 if (x == -1)
6732 x = -2;
6733 self->hash = x;
6734 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006735}
6736
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006737PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006738"S.index(sub [,start [,end]]) -> int\n\
6739\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006740Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006741
6742static PyObject *
6743unicode_index(PyUnicodeObject *self, PyObject *args)
6744{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006745 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006746 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00006747 Py_ssize_t start;
6748 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006749
Christian Heimes9cd17752007-11-18 19:35:23 +00006750 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006751 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006752
Thomas Wouters477c8d52006-05-27 19:21:47 +00006753 result = stringlib_find_slice(
6754 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6755 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6756 start, end
6757 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006758
6759 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006760
Guido van Rossumd57fd912000-03-10 22:53:23 +00006761 if (result < 0) {
6762 PyErr_SetString(PyExc_ValueError, "substring not found");
6763 return NULL;
6764 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006765
Christian Heimes217cfd12007-12-02 14:31:20 +00006766 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006767}
6768
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006769PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006770"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006771\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006772Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006773at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006774
6775static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006776unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006777{
6778 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6779 register const Py_UNICODE *e;
6780 int cased;
6781
Guido van Rossumd57fd912000-03-10 22:53:23 +00006782 /* Shortcut for single character strings */
6783 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006784 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006785
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006786 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006787 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006788 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006789
Guido van Rossumd57fd912000-03-10 22:53:23 +00006790 e = p + PyUnicode_GET_SIZE(self);
6791 cased = 0;
6792 for (; p < e; p++) {
6793 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006794
Guido van Rossumd57fd912000-03-10 22:53:23 +00006795 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006796 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006797 else if (!cased && Py_UNICODE_ISLOWER(ch))
6798 cased = 1;
6799 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006800 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006801}
6802
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006803PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006804"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006805\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006806Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006807at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006808
6809static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006810unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006811{
6812 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6813 register const Py_UNICODE *e;
6814 int cased;
6815
Guido van Rossumd57fd912000-03-10 22:53:23 +00006816 /* Shortcut for single character strings */
6817 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006818 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006819
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006820 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006821 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006822 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006823
Guido van Rossumd57fd912000-03-10 22:53:23 +00006824 e = p + PyUnicode_GET_SIZE(self);
6825 cased = 0;
6826 for (; p < e; p++) {
6827 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006828
Guido van Rossumd57fd912000-03-10 22:53:23 +00006829 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006830 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006831 else if (!cased && Py_UNICODE_ISUPPER(ch))
6832 cased = 1;
6833 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006834 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006835}
6836
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006837PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006838"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006839\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006840Return True if S is a titlecased string and there is at least one\n\
6841character in S, i.e. upper- and titlecase characters may only\n\
6842follow uncased characters and lowercase characters only cased ones.\n\
6843Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006844
6845static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006846unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006847{
6848 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6849 register const Py_UNICODE *e;
6850 int cased, previous_is_cased;
6851
Guido van Rossumd57fd912000-03-10 22:53:23 +00006852 /* Shortcut for single character strings */
6853 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006854 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6855 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006856
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006857 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006858 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006859 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006860
Guido van Rossumd57fd912000-03-10 22:53:23 +00006861 e = p + PyUnicode_GET_SIZE(self);
6862 cased = 0;
6863 previous_is_cased = 0;
6864 for (; p < e; p++) {
6865 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006866
Guido van Rossumd57fd912000-03-10 22:53:23 +00006867 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6868 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006869 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006870 previous_is_cased = 1;
6871 cased = 1;
6872 }
6873 else if (Py_UNICODE_ISLOWER(ch)) {
6874 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006875 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006876 previous_is_cased = 1;
6877 cased = 1;
6878 }
6879 else
6880 previous_is_cased = 0;
6881 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006882 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006883}
6884
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006885PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006886"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006887\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006888Return True if all characters in S are whitespace\n\
6889and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006890
6891static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006892unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006893{
6894 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6895 register const Py_UNICODE *e;
6896
Guido van Rossumd57fd912000-03-10 22:53:23 +00006897 /* Shortcut for single character strings */
6898 if (PyUnicode_GET_SIZE(self) == 1 &&
6899 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006900 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006901
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006902 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006903 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006904 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006905
Guido van Rossumd57fd912000-03-10 22:53:23 +00006906 e = p + PyUnicode_GET_SIZE(self);
6907 for (; p < e; p++) {
6908 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006909 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006910 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006911 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006912}
6913
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006914PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006915"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006916\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006917Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006918and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006919
6920static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006921unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006922{
6923 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6924 register const Py_UNICODE *e;
6925
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006926 /* Shortcut for single character strings */
6927 if (PyUnicode_GET_SIZE(self) == 1 &&
6928 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006929 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006930
6931 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006932 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006933 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006934
6935 e = p + PyUnicode_GET_SIZE(self);
6936 for (; p < e; p++) {
6937 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006938 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006939 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006940 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006941}
6942
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006943PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006944"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006945\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006946Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006947and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006948
6949static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006950unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006951{
6952 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6953 register const Py_UNICODE *e;
6954
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006955 /* Shortcut for single character strings */
6956 if (PyUnicode_GET_SIZE(self) == 1 &&
6957 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006958 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006959
6960 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006961 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006962 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006963
6964 e = p + PyUnicode_GET_SIZE(self);
6965 for (; p < e; p++) {
6966 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006967 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006968 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006969 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006970}
6971
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006972PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006973"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006974\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006975Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006976False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006977
6978static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006979unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006980{
6981 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6982 register const Py_UNICODE *e;
6983
Guido van Rossumd57fd912000-03-10 22:53:23 +00006984 /* Shortcut for single character strings */
6985 if (PyUnicode_GET_SIZE(self) == 1 &&
6986 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006987 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006988
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006989 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006990 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006991 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006992
Guido van Rossumd57fd912000-03-10 22:53:23 +00006993 e = p + PyUnicode_GET_SIZE(self);
6994 for (; p < e; p++) {
6995 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006996 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006997 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006998 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006999}
7000
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007001PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007002"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007003\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007004Return True if all characters in S are digits\n\
7005and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007006
7007static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007008unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007009{
7010 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7011 register const Py_UNICODE *e;
7012
Guido van Rossumd57fd912000-03-10 22:53:23 +00007013 /* Shortcut for single character strings */
7014 if (PyUnicode_GET_SIZE(self) == 1 &&
7015 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007016 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007017
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007018 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007019 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007020 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007021
Guido van Rossumd57fd912000-03-10 22:53:23 +00007022 e = p + PyUnicode_GET_SIZE(self);
7023 for (; p < e; p++) {
7024 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007025 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007026 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007027 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007028}
7029
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007030PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007031"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007032\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007033Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007034False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007035
7036static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007037unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007038{
7039 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7040 register const Py_UNICODE *e;
7041
Guido van Rossumd57fd912000-03-10 22:53:23 +00007042 /* Shortcut for single character strings */
7043 if (PyUnicode_GET_SIZE(self) == 1 &&
7044 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007045 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007046
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007047 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007048 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007049 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007050
Guido van Rossumd57fd912000-03-10 22:53:23 +00007051 e = p + PyUnicode_GET_SIZE(self);
7052 for (; p < e; p++) {
7053 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007054 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007055 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007056 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007057}
7058
Martin v. Löwis47383402007-08-15 07:32:56 +00007059int
7060PyUnicode_IsIdentifier(PyObject *self)
7061{
7062 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7063 register const Py_UNICODE *e;
7064
7065 /* Special case for empty strings */
7066 if (PyUnicode_GET_SIZE(self) == 0)
7067 return 0;
7068
7069 /* PEP 3131 says that the first character must be in
7070 XID_Start and subsequent characters in XID_Continue,
7071 and for the ASCII range, the 2.x rules apply (i.e
7072 start with letters and underscore, continue with
7073 letters, digits, underscore). However, given the current
7074 definition of XID_Start and XID_Continue, it is sufficient
7075 to check just for these, except that _ must be allowed
7076 as starting an identifier. */
7077 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7078 return 0;
7079
7080 e = p + PyUnicode_GET_SIZE(self);
7081 for (p++; p < e; p++) {
7082 if (!_PyUnicode_IsXidContinue(*p))
7083 return 0;
7084 }
7085 return 1;
7086}
7087
7088PyDoc_STRVAR(isidentifier__doc__,
7089"S.isidentifier() -> bool\n\
7090\n\
7091Return True if S is a valid identifier according\n\
7092to the language definition.");
7093
7094static PyObject*
7095unicode_isidentifier(PyObject *self)
7096{
7097 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7098}
7099
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007100PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007101"S.join(sequence) -> unicode\n\
7102\n\
7103Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007104sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007105
7106static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007107unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007108{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007109 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007110}
7111
Martin v. Löwis18e16552006-02-15 17:27:45 +00007112static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007113unicode_length(PyUnicodeObject *self)
7114{
7115 return self->length;
7116}
7117
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007118PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00007119"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007120\n\
7121Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007122done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007123
7124static PyObject *
7125unicode_ljust(PyUnicodeObject *self, PyObject *args)
7126{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007127 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007128 Py_UNICODE fillchar = ' ';
7129
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007130 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007131 return NULL;
7132
Tim Peters7a29bd52001-09-12 03:03:31 +00007133 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007134 Py_INCREF(self);
7135 return (PyObject*) self;
7136 }
7137
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007138 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007139}
7140
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007141PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007142"S.lower() -> unicode\n\
7143\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007144Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007145
7146static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007147unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007148{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007149 return fixup(self, fixlower);
7150}
7151
/* Strip modes.  They are passed as the striptype argument of the
   strip helpers below and double as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
/* PyArg_ParseTuple format strings, one per strip mode. */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: the format string with its leading
   three characters ("|O:") skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
7160
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007161/* externally visible for str.strip(unicode) */
7162PyObject *
7163_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7164{
7165 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007166 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007167 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007168 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7169 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007170
Thomas Wouters477c8d52006-05-27 19:21:47 +00007171 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7172
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007173 i = 0;
7174 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007175 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7176 i++;
7177 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007178 }
7179
7180 j = len;
7181 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007182 do {
7183 j--;
7184 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7185 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007186 }
7187
7188 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007189 Py_INCREF(self);
7190 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007191 }
7192 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00007193 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007194}
7195
Guido van Rossumd57fd912000-03-10 22:53:23 +00007196
7197static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007198do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007199{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007200 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007201 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007202
7203 i = 0;
7204 if (striptype != RIGHTSTRIP) {
7205 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7206 i++;
7207 }
7208 }
7209
7210 j = len;
7211 if (striptype != LEFTSTRIP) {
7212 do {
7213 j--;
7214 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7215 j++;
7216 }
7217
7218 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7219 Py_INCREF(self);
7220 return (PyObject*)self;
7221 }
7222 else
7223 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007224}
7225
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007226
7227static PyObject *
7228do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7229{
7230 PyObject *sep = NULL;
7231
7232 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7233 return NULL;
7234
7235 if (sep != NULL && sep != Py_None) {
7236 if (PyUnicode_Check(sep))
7237 return _PyUnicode_XStrip(self, striptype, sep);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007238 else {
7239 PyErr_Format(PyExc_TypeError,
7240 "%s arg must be None, unicode or str",
7241 STRIPNAME(striptype));
7242 return NULL;
7243 }
7244 }
7245
7246 return do_strip(self, striptype);
7247}
7248
7249
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007250PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007251"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007252\n\
7253Return a copy of the string S with leading and trailing\n\
7254whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007255If chars is given and not None, remove characters in chars instead.\n\
7256If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007257
7258static PyObject *
7259unicode_strip(PyUnicodeObject *self, PyObject *args)
7260{
7261 if (PyTuple_GET_SIZE(args) == 0)
7262 return do_strip(self, BOTHSTRIP); /* Common case */
7263 else
7264 return do_argstrip(self, BOTHSTRIP, args);
7265}
7266
7267
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007268PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007269"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007270\n\
7271Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007272If chars is given and not None, remove characters in chars instead.\n\
7273If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007274
7275static PyObject *
7276unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7277{
7278 if (PyTuple_GET_SIZE(args) == 0)
7279 return do_strip(self, LEFTSTRIP); /* Common case */
7280 else
7281 return do_argstrip(self, LEFTSTRIP, args);
7282}
7283
7284
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007285PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007286"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007287\n\
7288Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007289If chars is given and not None, remove characters in chars instead.\n\
7290If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007291
7292static PyObject *
7293unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7294{
7295 if (PyTuple_GET_SIZE(args) == 0)
7296 return do_strip(self, RIGHTSTRIP); /* Common case */
7297 else
7298 return do_argstrip(self, RIGHTSTRIP, args);
7299}
7300
7301
Guido van Rossumd57fd912000-03-10 22:53:23 +00007302static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007303unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007304{
7305 PyUnicodeObject *u;
7306 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007307 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007308 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007309
7310 if (len < 0)
7311 len = 0;
7312
Tim Peters7a29bd52001-09-12 03:03:31 +00007313 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007314 /* no repeat, return original string */
7315 Py_INCREF(str);
7316 return (PyObject*) str;
7317 }
Tim Peters8f422462000-09-09 06:13:41 +00007318
7319 /* ensure # of chars needed doesn't overflow int and # of bytes
7320 * needed doesn't overflow size_t
7321 */
7322 nchars = len * str->length;
7323 if (len && nchars / len != str->length) {
7324 PyErr_SetString(PyExc_OverflowError,
7325 "repeated string is too long");
7326 return NULL;
7327 }
7328 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7329 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7330 PyErr_SetString(PyExc_OverflowError,
7331 "repeated string is too long");
7332 return NULL;
7333 }
7334 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007335 if (!u)
7336 return NULL;
7337
7338 p = u->str;
7339
Thomas Wouters477c8d52006-05-27 19:21:47 +00007340 if (str->length == 1 && len > 0) {
7341 Py_UNICODE_FILL(p, str->str[0], len);
7342 } else {
7343 Py_ssize_t done = 0; /* number of characters copied this far */
7344 if (done < nchars) {
7345 Py_UNICODE_COPY(p, str->str, str->length);
7346 done = str->length;
7347 }
7348 while (done < nchars) {
7349 int n = (done <= nchars-done) ? done : nchars-done;
7350 Py_UNICODE_COPY(p+done, p, n);
7351 done += n;
7352 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007353 }
7354
7355 return (PyObject*) u;
7356}
7357
7358PyObject *PyUnicode_Replace(PyObject *obj,
7359 PyObject *subobj,
7360 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007361 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007362{
7363 PyObject *self;
7364 PyObject *str1;
7365 PyObject *str2;
7366 PyObject *result;
7367
7368 self = PyUnicode_FromObject(obj);
7369 if (self == NULL)
7370 return NULL;
7371 str1 = PyUnicode_FromObject(subobj);
7372 if (str1 == NULL) {
7373 Py_DECREF(self);
7374 return NULL;
7375 }
7376 str2 = PyUnicode_FromObject(replobj);
7377 if (str2 == NULL) {
7378 Py_DECREF(self);
7379 Py_DECREF(str1);
7380 return NULL;
7381 }
Tim Petersced69f82003-09-16 20:30:58 +00007382 result = replace((PyUnicodeObject *)self,
7383 (PyUnicodeObject *)str1,
7384 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007385 maxcount);
7386 Py_DECREF(self);
7387 Py_DECREF(str1);
7388 Py_DECREF(str2);
7389 return result;
7390}
7391
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007392PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007393"S.replace (old, new[, maxsplit]) -> unicode\n\
7394\n\
7395Return a copy of S with all occurrences of substring\n\
7396old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007397given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007398
7399static PyObject*
7400unicode_replace(PyUnicodeObject *self, PyObject *args)
7401{
7402 PyUnicodeObject *str1;
7403 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007404 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007405 PyObject *result;
7406
Martin v. Löwis18e16552006-02-15 17:27:45 +00007407 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007408 return NULL;
7409 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7410 if (str1 == NULL)
7411 return NULL;
7412 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007413 if (str2 == NULL) {
7414 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007415 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007416 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007417
7418 result = replace(self, str1, str2, maxcount);
7419
7420 Py_DECREF(str1);
7421 Py_DECREF(str2);
7422 return result;
7423}
7424
/* Build the repr() of a unicode object: the contents wrapped in
   quotes, with quotes, backslashes, control characters and everything
   outside printable ASCII written as escape sequences.

   Returns a new unicode object, or NULL if the output buffer cannot
   be allocated. */
static
PyObject *unicode_repr(PyObject *unicode)
{
    PyObject *repr;
    Py_UNICODE *p;      /* write cursor into the output buffer */
    Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
    Py_ssize_t size = PyUnicode_GET_SIZE(unicode);

    /* XXX(nnorwitz): rather than over-allocating, it would be
       better to choose a different scheme.  Perhaps scan the
       first N-chars of the string and allocate based on that size.
    */
    /* Initial allocation is based on the longest-possible unichr
       escape.

       In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
       unichr, so in this case it's the longest unichr escape.  In
       narrow (UTF-16) builds this is five chars per source unichr
       since there are two unichrs in the surrogate pair, so in narrow
       (UTF-16) builds it's not the longest unichr escape.

       In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
       so in the narrow (UTF-16) build case it's the longest unichr
       escape.
    */

    /* NOTE(review): the size expression below is not checked for
       overflow. */
    repr = PyUnicode_FromUnicode(NULL,
        2 /* quotes */
#ifdef Py_UNICODE_WIDE
        + 10*size
#else
        + 6*size
#endif
        + 1);
    if (repr == NULL)
        return NULL;

    p = PyUnicode_AS_UNICODE(repr);

    /* Add quote */
    /* A double quote is chosen only when both findchar() tests agree
       (presumably: the string contains ' but no "); otherwise a single
       quote is used.  The choice is stored as the first output
       character and read back from there below. */
    *p++ = (findchar(s, size, '\'') &&
            !findchar(s, size, '"')) ? '"' : '\'';
    while (size-- > 0) {
        Py_UNICODE ch = *s++;

        /* Escape quotes and backslashes */
        /* Only the quote character actually in use is escaped. */
        if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
            *p++ = '\\';
            *p++ = ch;
            continue;
        }

#ifdef Py_UNICODE_WIDE
        /* Map 21-bit characters to '\U00xxxxxx' */
        else if (ch >= 0x10000) {
            *p++ = '\\';
            *p++ = 'U';
            *p++ = hexdigits[(ch >> 28) & 0x0000000F];
            *p++ = hexdigits[(ch >> 24) & 0x0000000F];
            *p++ = hexdigits[(ch >> 20) & 0x0000000F];
            *p++ = hexdigits[(ch >> 16) & 0x0000000F];
            *p++ = hexdigits[(ch >> 12) & 0x0000000F];
            *p++ = hexdigits[(ch >> 8) & 0x0000000F];
            *p++ = hexdigits[(ch >> 4) & 0x0000000F];
            *p++ = hexdigits[ch & 0x0000000F];
            continue;
        }
#else
        /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
        else if (ch >= 0xD800 && ch < 0xDC00) {
            Py_UNICODE ch2;
            Py_UCS4 ucs;

            /* Tentatively consume the next code unit as the low
               surrogate. */
            ch2 = *s++;
            size--;
            if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
                /* Combine both halves into one code point. */
                ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
                *p++ = '\\';
                *p++ = 'U';
                *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
                *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
                *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
                *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
                *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
                *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
                *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
                *p++ = hexdigits[ucs & 0x0000000F];
                continue;
            }
            /* Not a valid pair: un-read ch2 and fall through.  The
               isolated high surrogate is >= 256, so it is written by
               the '\uxxxx' case below. */
            s--;
            size++;
        }
#endif

        /* Map 16-bit characters to '\uxxxx' */
        if (ch >= 256) {
            *p++ = '\\';
            *p++ = 'u';
            *p++ = hexdigits[(ch >> 12) & 0x000F];
            *p++ = hexdigits[(ch >> 8) & 0x000F];
            *p++ = hexdigits[(ch >> 4) & 0x000F];
            *p++ = hexdigits[ch & 0x000F];
        }

        /* Map special whitespace to '\t', '\n', '\r' */
        else if (ch == '\t') {
            *p++ = '\\';
            *p++ = 't';
        }
        else if (ch == '\n') {
            *p++ = '\\';
            *p++ = 'n';
        }
        else if (ch == '\r') {
            *p++ = '\\';
            *p++ = 'r';
        }

        /* Map non-printable US ASCII to '\xhh' */
        /* At this point ch < 256, so this also covers 0x7F..0xFF. */
        else if (ch < ' ' || ch >= 0x7F) {
            *p++ = '\\';
            *p++ = 'x';
            *p++ = hexdigits[(ch >> 4) & 0x000F];
            *p++ = hexdigits[ch & 0x000F];
        }

        /* Copy everything else as-is */
        else
            *p++ = (char) ch;
    }
    /* Add quote */
    *p++ = PyUnicode_AS_UNICODE(repr)[0];

    *p = '\0';
    /* Trim the over-allocated buffer to the length actually written.
       NOTE(review): the return value of _PyUnicode_Resize() is not
       checked here. */
    _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
    return repr;
}
7563
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007564PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007565"S.rfind(sub [,start [,end]]) -> int\n\
7566\n\
7567Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007568such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007569arguments start and end are interpreted as in slice notation.\n\
7570\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007571Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007572
7573static PyObject *
7574unicode_rfind(PyUnicodeObject *self, PyObject *args)
7575{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007576 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007577 Py_ssize_t start;
7578 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007579 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007580
Christian Heimes9cd17752007-11-18 19:35:23 +00007581 if (!_ParseTupleFinds(args, &substring, &start, &end))
7582 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007583
Thomas Wouters477c8d52006-05-27 19:21:47 +00007584 result = stringlib_rfind_slice(
7585 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7586 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7587 start, end
7588 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007589
7590 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007591
Christian Heimes217cfd12007-12-02 14:31:20 +00007592 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007593}
7594
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007595PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007596"S.rindex(sub [,start [,end]]) -> int\n\
7597\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007598Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007599
7600static PyObject *
7601unicode_rindex(PyUnicodeObject *self, PyObject *args)
7602{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007603 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007604 Py_ssize_t start;
7605 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007606 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007607
Christian Heimes9cd17752007-11-18 19:35:23 +00007608 if (!_ParseTupleFinds(args, &substring, &start, &end))
7609 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007610
Thomas Wouters477c8d52006-05-27 19:21:47 +00007611 result = stringlib_rfind_slice(
7612 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7613 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7614 start, end
7615 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007616
7617 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007618
Guido van Rossumd57fd912000-03-10 22:53:23 +00007619 if (result < 0) {
7620 PyErr_SetString(PyExc_ValueError, "substring not found");
7621 return NULL;
7622 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007623 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007624}
7625
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007626PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007627"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007628\n\
7629Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007630done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007631
7632static PyObject *
7633unicode_rjust(PyUnicodeObject *self, PyObject *args)
7634{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007635 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007636 Py_UNICODE fillchar = ' ';
7637
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007638 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007639 return NULL;
7640
Tim Peters7a29bd52001-09-12 03:03:31 +00007641 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007642 Py_INCREF(self);
7643 return (PyObject*) self;
7644 }
7645
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007646 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007647}
7648
Guido van Rossumd57fd912000-03-10 22:53:23 +00007649PyObject *PyUnicode_Split(PyObject *s,
7650 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007651 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007652{
7653 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007654
Guido van Rossumd57fd912000-03-10 22:53:23 +00007655 s = PyUnicode_FromObject(s);
7656 if (s == NULL)
7657 return NULL;
7658 if (sep != NULL) {
7659 sep = PyUnicode_FromObject(sep);
7660 if (sep == NULL) {
7661 Py_DECREF(s);
7662 return NULL;
7663 }
7664 }
7665
7666 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7667
7668 Py_DECREF(s);
7669 Py_XDECREF(sep);
7670 return result;
7671}
7672
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007673PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007674"S.split([sep [,maxsplit]]) -> list of strings\n\
7675\n\
7676Return a list of the words in S, using sep as the\n\
7677delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00007678splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00007679any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007680
7681static PyObject*
7682unicode_split(PyUnicodeObject *self, PyObject *args)
7683{
7684 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007685 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007686
Martin v. Löwis18e16552006-02-15 17:27:45 +00007687 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007688 return NULL;
7689
7690 if (substring == Py_None)
7691 return split(self, NULL, maxcount);
7692 else if (PyUnicode_Check(substring))
7693 return split(self, (PyUnicodeObject *)substring, maxcount);
7694 else
7695 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7696}
7697
Thomas Wouters477c8d52006-05-27 19:21:47 +00007698PyObject *
7699PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7700{
7701 PyObject* str_obj;
7702 PyObject* sep_obj;
7703 PyObject* out;
7704
7705 str_obj = PyUnicode_FromObject(str_in);
7706 if (!str_obj)
7707 return NULL;
7708 sep_obj = PyUnicode_FromObject(sep_in);
7709 if (!sep_obj) {
7710 Py_DECREF(str_obj);
7711 return NULL;
7712 }
7713
7714 out = stringlib_partition(
7715 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7716 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7717 );
7718
7719 Py_DECREF(sep_obj);
7720 Py_DECREF(str_obj);
7721
7722 return out;
7723}
7724
7725
7726PyObject *
7727PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7728{
7729 PyObject* str_obj;
7730 PyObject* sep_obj;
7731 PyObject* out;
7732
7733 str_obj = PyUnicode_FromObject(str_in);
7734 if (!str_obj)
7735 return NULL;
7736 sep_obj = PyUnicode_FromObject(sep_in);
7737 if (!sep_obj) {
7738 Py_DECREF(str_obj);
7739 return NULL;
7740 }
7741
7742 out = stringlib_rpartition(
7743 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7744 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7745 );
7746
7747 Py_DECREF(sep_obj);
7748 Py_DECREF(str_obj);
7749
7750 return out;
7751}
7752
7753PyDoc_STRVAR(partition__doc__,
7754"S.partition(sep) -> (head, sep, tail)\n\
7755\n\
7756Searches for the separator sep in S, and returns the part before it,\n\
7757the separator itself, and the part after it. If the separator is not\n\
7758found, returns S and two empty strings.");
7759
7760static PyObject*
7761unicode_partition(PyUnicodeObject *self, PyObject *separator)
7762{
7763 return PyUnicode_Partition((PyObject *)self, separator);
7764}
7765
7766PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007767"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007768\n\
7769Searches for the separator sep in S, starting at the end of S, and returns\n\
7770the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00007771separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007772
7773static PyObject*
7774unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7775{
7776 return PyUnicode_RPartition((PyObject *)self, separator);
7777}
7778
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007779PyObject *PyUnicode_RSplit(PyObject *s,
7780 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007781 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007782{
7783 PyObject *result;
7784
7785 s = PyUnicode_FromObject(s);
7786 if (s == NULL)
7787 return NULL;
7788 if (sep != NULL) {
7789 sep = PyUnicode_FromObject(sep);
7790 if (sep == NULL) {
7791 Py_DECREF(s);
7792 return NULL;
7793 }
7794 }
7795
7796 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7797
7798 Py_DECREF(s);
7799 Py_XDECREF(sep);
7800 return result;
7801}
7802
7803PyDoc_STRVAR(rsplit__doc__,
7804"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7805\n\
7806Return a list of the words in S, using sep as the\n\
7807delimiter string, starting at the end of the string and\n\
7808working to the front. If maxsplit is given, at most maxsplit\n\
7809splits are done. If sep is not specified, any whitespace string\n\
7810is a separator.");
7811
7812static PyObject*
7813unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7814{
7815 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007816 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007817
Martin v. Löwis18e16552006-02-15 17:27:45 +00007818 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007819 return NULL;
7820
7821 if (substring == Py_None)
7822 return rsplit(self, NULL, maxcount);
7823 else if (PyUnicode_Check(substring))
7824 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7825 else
7826 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7827}
7828
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007829PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007830"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007831\n\
7832Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007833Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007834is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007835
7836static PyObject*
7837unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7838{
Guido van Rossum86662912000-04-11 15:38:46 +00007839 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007840
Guido van Rossum86662912000-04-11 15:38:46 +00007841 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007842 return NULL;
7843
Guido van Rossum86662912000-04-11 15:38:46 +00007844 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007845}
7846
7847static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007848PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007849{
Walter Dörwald346737f2007-05-31 10:44:43 +00007850 if (PyUnicode_CheckExact(self)) {
7851 Py_INCREF(self);
7852 return self;
7853 } else
7854 /* Subtype -- return genuine unicode string with the same value. */
7855 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
7856 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007857}
7858
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007859PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007860"S.swapcase() -> unicode\n\
7861\n\
7862Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007863and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007864
7865static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007866unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007867{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007868 return fixup(self, fixswapcase);
7869}
7870
Georg Brandlceee0772007-11-27 23:48:05 +00007871PyDoc_STRVAR(maketrans__doc__,
7872"str.maketrans(x[, y[, z]]) -> dict (static method)\n\
7873\n\
7874Return a translation table usable for str.translate().\n\
7875If there is only one argument, it must be a dictionary mapping Unicode\n\
7876ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
7877Character keys will then be converted to ordinals.\n\
7878If there are two arguments, they must be strings of equal length, and\n\
7879in the resulting dictionary, each character in x will be mapped to the\n\
7880character at the same position in y. If there is a third argument, it\n\
7881must be a string, whose characters will be mapped to None in the result.");
7882
7883static PyObject*
7884unicode_maketrans(PyUnicodeObject *null, PyObject *args)
7885{
7886 PyObject *x, *y = NULL, *z = NULL;
7887 PyObject *new = NULL, *key, *value;
7888 Py_ssize_t i = 0;
7889 int res;
7890
7891 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
7892 return NULL;
7893 new = PyDict_New();
7894 if (!new)
7895 return NULL;
7896 if (y != NULL) {
7897 /* x must be a string too, of equal length */
7898 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
7899 if (!PyUnicode_Check(x)) {
7900 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
7901 "be a string if there is a second argument");
7902 goto err;
7903 }
7904 if (PyUnicode_GET_SIZE(x) != ylen) {
7905 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
7906 "arguments must have equal length");
7907 goto err;
7908 }
7909 /* create entries for translating chars in x to those in y */
7910 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00007911 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
7912 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00007913 if (!key || !value)
7914 goto err;
7915 res = PyDict_SetItem(new, key, value);
7916 Py_DECREF(key);
7917 Py_DECREF(value);
7918 if (res < 0)
7919 goto err;
7920 }
7921 /* create entries for deleting chars in z */
7922 if (z != NULL) {
7923 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00007924 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00007925 if (!key)
7926 goto err;
7927 res = PyDict_SetItem(new, key, Py_None);
7928 Py_DECREF(key);
7929 if (res < 0)
7930 goto err;
7931 }
7932 }
7933 } else {
7934 /* x must be a dict */
7935 if (!PyDict_Check(x)) {
7936 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
7937 "to maketrans it must be a dict");
7938 goto err;
7939 }
7940 /* copy entries into the new dict, converting string keys to int keys */
7941 while (PyDict_Next(x, &i, &key, &value)) {
7942 if (PyUnicode_Check(key)) {
7943 /* convert string keys to integer keys */
7944 PyObject *newkey;
7945 if (PyUnicode_GET_SIZE(key) != 1) {
7946 PyErr_SetString(PyExc_ValueError, "string keys in translate "
7947 "table must be of length 1");
7948 goto err;
7949 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007950 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00007951 if (!newkey)
7952 goto err;
7953 res = PyDict_SetItem(new, newkey, value);
7954 Py_DECREF(newkey);
7955 if (res < 0)
7956 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00007957 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00007958 /* just keep integer keys */
7959 if (PyDict_SetItem(new, key, value) < 0)
7960 goto err;
7961 } else {
7962 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
7963 "be strings or integers");
7964 goto err;
7965 }
7966 }
7967 }
7968 return new;
7969 err:
7970 Py_DECREF(new);
7971 return NULL;
7972}
7973
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007974PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007975"S.translate(table) -> unicode\n\
7976\n\
7977Return a copy of the string S, where all characters have been mapped\n\
7978through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007979Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7980Unmapped characters are left untouched. Characters mapped to None\n\
7981are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007982
7983static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007984unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007985{
Georg Brandlceee0772007-11-27 23:48:05 +00007986 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007987}
7988
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007989PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007990"S.upper() -> unicode\n\
7991\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007992Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007993
7994static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007995unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007996{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007997 return fixup(self, fixupper);
7998}
7999
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008000PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008001"S.zfill(width) -> unicode\n\
8002\n\
8003Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008004of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008005
8006static PyObject *
8007unicode_zfill(PyUnicodeObject *self, PyObject *args)
8008{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008009 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008010 PyUnicodeObject *u;
8011
Martin v. Löwis18e16552006-02-15 17:27:45 +00008012 Py_ssize_t width;
8013 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008014 return NULL;
8015
8016 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008017 if (PyUnicode_CheckExact(self)) {
8018 Py_INCREF(self);
8019 return (PyObject*) self;
8020 }
8021 else
8022 return PyUnicode_FromUnicode(
8023 PyUnicode_AS_UNICODE(self),
8024 PyUnicode_GET_SIZE(self)
8025 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008026 }
8027
8028 fill = width - self->length;
8029
8030 u = pad(self, fill, 0, '0');
8031
Walter Dörwald068325e2002-04-15 13:36:47 +00008032 if (u == NULL)
8033 return NULL;
8034
Guido van Rossumd57fd912000-03-10 22:53:23 +00008035 if (u->str[fill] == '+' || u->str[fill] == '-') {
8036 /* move sign to beginning of string */
8037 u->str[0] = u->str[fill];
8038 u->str[fill] = '0';
8039 }
8040
8041 return (PyObject*) u;
8042}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008043
#if 0
/* Compiled out.  Returns the value of numfree as an int object. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyLong_FromLong(count);
}
#endif
8051
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008052PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00008053"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008054\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008055Return True if S starts with the specified prefix, False otherwise.\n\
8056With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008057With optional end, stop comparing S at that position.\n\
8058prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008059
8060static PyObject *
8061unicode_startswith(PyUnicodeObject *self,
8062 PyObject *args)
8063{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008064 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008065 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008066 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008067 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008068 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008069
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008070 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00008071 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008072 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008073 if (PyTuple_Check(subobj)) {
8074 Py_ssize_t i;
8075 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8076 substring = (PyUnicodeObject *)PyUnicode_FromObject(
8077 PyTuple_GET_ITEM(subobj, i));
8078 if (substring == NULL)
8079 return NULL;
8080 result = tailmatch(self, substring, start, end, -1);
8081 Py_DECREF(substring);
8082 if (result) {
8083 Py_RETURN_TRUE;
8084 }
8085 }
8086 /* nothing matched */
8087 Py_RETURN_FALSE;
8088 }
8089 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008090 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008091 return NULL;
8092 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008093 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008094 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008095}
8096
8097
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008098PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00008099"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008100\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008101Return True if S ends with the specified suffix, False otherwise.\n\
8102With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008103With optional end, stop comparing S at that position.\n\
8104suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008105
8106static PyObject *
8107unicode_endswith(PyUnicodeObject *self,
8108 PyObject *args)
8109{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008110 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008111 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008112 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008113 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008114 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008115
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008116 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
8117 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008118 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008119 if (PyTuple_Check(subobj)) {
8120 Py_ssize_t i;
8121 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8122 substring = (PyUnicodeObject *)PyUnicode_FromObject(
8123 PyTuple_GET_ITEM(subobj, i));
8124 if (substring == NULL)
8125 return NULL;
8126 result = tailmatch(self, substring, start, end, +1);
8127 Py_DECREF(substring);
8128 if (result) {
8129 Py_RETURN_TRUE;
8130 }
8131 }
8132 Py_RETURN_FALSE;
8133 }
8134 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008135 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008136 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008137
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008138 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008139 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008140 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008141}
8142
Eric Smith8c663262007-08-25 02:26:07 +00008143#include "stringlib/string_format.h"
8144
8145PyDoc_STRVAR(format__doc__,
8146"S.format(*args, **kwargs) -> unicode\n\
8147\n\
8148");
8149
Eric Smith8c663262007-08-25 02:26:07 +00008150PyDoc_STRVAR(p_format__doc__,
8151"S.__format__(format_spec) -> unicode\n\
8152\n\
8153");
8154
8155static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008156unicode_getnewargs(PyUnicodeObject *v)
8157{
8158 return Py_BuildValue("(u#)", v->str, v->length);
8159}
8160
8161
Guido van Rossumd57fd912000-03-10 22:53:23 +00008162static PyMethodDef unicode_methods[] = {
8163
8164 /* Order is according to common usage: often used methods should
8165 appear first, since lookup is done sequentially. */
8166
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008167 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
8168 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8169 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008170 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008171 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8172 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8173 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8174 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8175 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8176 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8177 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008178 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008179 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8180 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8181 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008182 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008183 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8184 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8185 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008186 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008187 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008188 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008189 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008190 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8191 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8192 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8193 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8194 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8195 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8196 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8197 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8198 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8199 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8200 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8201 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8202 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8203 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008204 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008205 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008206 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
8207 {"__format__", (PyCFunction) unicode_unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008208 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8209 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Georg Brandlceee0772007-11-27 23:48:05 +00008210 {"maketrans", (PyCFunction) unicode_maketrans,
8211 METH_VARARGS | METH_STATIC, maketrans__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008212#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008213 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008214#endif
8215
8216#if 0
8217 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008218 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008219#endif
8220
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008221 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008222 {NULL, NULL}
8223};
8224
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008225static PyObject *
8226unicode_mod(PyObject *v, PyObject *w)
8227{
8228 if (!PyUnicode_Check(v)) {
8229 Py_INCREF(Py_NotImplemented);
8230 return Py_NotImplemented;
8231 }
8232 return PyUnicode_Format(v, w);
8233}
8234
8235static PyNumberMethods unicode_as_number = {
8236 0, /*nb_add*/
8237 0, /*nb_subtract*/
8238 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008239 unicode_mod, /*nb_remainder*/
8240};
8241
Guido van Rossumd57fd912000-03-10 22:53:23 +00008242static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008243 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008244 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008245 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8246 (ssizeargfunc) unicode_getitem, /* sq_item */
Thomas Woutersd2cf20e2007-08-30 22:57:53 +00008247 0, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008248 0, /* sq_ass_item */
8249 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008250 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008251};
8252
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008253static PyObject*
8254unicode_subscript(PyUnicodeObject* self, PyObject* item)
8255{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008256 if (PyIndex_Check(item)) {
8257 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008258 if (i == -1 && PyErr_Occurred())
8259 return NULL;
8260 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008261 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008262 return unicode_getitem(self, i);
8263 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008264 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008265 Py_UNICODE* source_buf;
8266 Py_UNICODE* result_buf;
8267 PyObject* result;
8268
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008269 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008270 &start, &stop, &step, &slicelength) < 0) {
8271 return NULL;
8272 }
8273
8274 if (slicelength <= 0) {
8275 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008276 } else if (start == 0 && step == 1 && slicelength == self->length &&
8277 PyUnicode_CheckExact(self)) {
8278 Py_INCREF(self);
8279 return (PyObject *)self;
8280 } else if (step == 1) {
8281 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008282 } else {
8283 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00008284 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8285 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008286
8287 if (result_buf == NULL)
8288 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008289
8290 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8291 result_buf[i] = source_buf[cur];
8292 }
Tim Petersced69f82003-09-16 20:30:58 +00008293
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008294 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00008295 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008296 return result;
8297 }
8298 } else {
8299 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8300 return NULL;
8301 }
8302}
8303
8304static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008305 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008306 (binaryfunc)unicode_subscript, /* mp_subscript */
8307 (objobjargproc)0, /* mp_ass_subscript */
8308};
8309
Guido van Rossumd57fd912000-03-10 22:53:23 +00008310
Guido van Rossumd57fd912000-03-10 22:53:23 +00008311/* Helpers for PyUnicode_Format() */
8312
8313static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008314getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008315{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008316 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008317 if (argidx < arglen) {
8318 (*p_argidx)++;
8319 if (arglen < 0)
8320 return args;
8321 else
8322 return PyTuple_GetItem(args, argidx);
8323 }
8324 PyErr_SetString(PyExc_TypeError,
8325 "not enough arguments for format string");
8326 return NULL;
8327}
8328
Martin v. Löwis18e16552006-02-15 17:27:45 +00008329static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008330strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008331{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008332 register Py_ssize_t i;
8333 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008334 for (i = len - 1; i >= 0; i--)
8335 buffer[i] = (Py_UNICODE) charbuffer[i];
8336
Guido van Rossumd57fd912000-03-10 22:53:23 +00008337 return len;
8338}
8339
Neal Norwitzfc76d632006-01-10 06:03:13 +00008340static int
8341doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8342{
Tim Peters15231542006-02-16 01:08:01 +00008343 Py_ssize_t result;
8344
Neal Norwitzfc76d632006-01-10 06:03:13 +00008345 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008346 result = strtounicode(buffer, (char *)buffer);
8347 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008348}
8349
8350static int
8351longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8352{
Tim Peters15231542006-02-16 01:08:01 +00008353 Py_ssize_t result;
8354
Neal Norwitzfc76d632006-01-10 06:03:13 +00008355 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008356 result = strtounicode(buffer, (char *)buffer);
8357 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008358}
8359
Guido van Rossum078151d2002-08-11 04:24:12 +00008360/* XXX To save some code duplication, formatfloat/long/int could have been
8361 shared with stringobject.c, converting from 8-bit to Unicode after the
8362 formatting is done. */
8363
Guido van Rossumd57fd912000-03-10 22:53:23 +00008364static int
8365formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008366 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008367 int flags,
8368 int prec,
8369 int type,
8370 PyObject *v)
8371{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008372 /* fmt = '%#.' + `prec` + `type`
8373 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00008374 char fmt[20];
8375 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008376
Guido van Rossumd57fd912000-03-10 22:53:23 +00008377 x = PyFloat_AsDouble(v);
8378 if (x == -1.0 && PyErr_Occurred())
8379 return -1;
8380 if (prec < 0)
8381 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008382 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8383 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008384 /* Worst case length calc to ensure no buffer overrun:
8385
8386 'g' formats:
8387 fmt = %#.<prec>g
8388 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8389 for any double rep.)
8390 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8391
8392 'f' formats:
8393 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8394 len = 1 + 50 + 1 + prec = 52 + prec
8395
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008396 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008397 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008398
8399 */
Guido van Rossumb5a755e2007-07-18 18:15:48 +00008400 if (((type == 'g' || type == 'G') &&
8401 buflen <= (size_t)10 + (size_t)prec) ||
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008402 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008403 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008404 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008405 return -1;
8406 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008407 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8408 (flags&F_ALT) ? "#" : "",
8409 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008410 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008411}
8412
Tim Peters38fd5b62000-09-21 05:43:11 +00008413static PyObject*
8414formatlong(PyObject *val, int flags, int prec, int type)
8415{
8416 char *buf;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008417 int len;
Tim Peters38fd5b62000-09-21 05:43:11 +00008418 PyObject *str; /* temporary string object. */
Walter Dörwald63a28be2007-06-20 15:11:12 +00008419 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008420
8421 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8422 if (!str)
8423 return NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008424 result = PyUnicode_FromStringAndSize(buf, len);
Tim Peters38fd5b62000-09-21 05:43:11 +00008425 Py_DECREF(str);
Walter Dörwald63a28be2007-06-20 15:11:12 +00008426 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008427}
8428
Guido van Rossumd57fd912000-03-10 22:53:23 +00008429static int
8430formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008431 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008432 int flags,
8433 int prec,
8434 int type,
8435 PyObject *v)
8436{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008437 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008438 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8439 * + 1 + 1
8440 * = 24
8441 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008442 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008443 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008444 long x;
8445
Christian Heimes217cfd12007-12-02 14:31:20 +00008446 x = PyLong_AsLong(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008447 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008448 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008449 if (x < 0 && type == 'u') {
8450 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008451 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008452 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8453 sign = "-";
8454 else
8455 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008456 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008457 prec = 1;
8458
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008459 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8460 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008461 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008462 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008463 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008464 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008465 return -1;
8466 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008467
8468 if ((flags & F_ALT) &&
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008469 (type == 'x' || type == 'X' || type == 'o')) {
8470 /* When converting under %#o, %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008471 * of issues that cause pain:
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008472 * - for %#o, we want a different base marker than C
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008473 * - when 0 is being converted, the C standard leaves off
8474 * the '0x' or '0X', which is inconsistent with other
8475 * %#x/%#X conversions and inconsistent with Python's
8476 * hex() function
8477 * - there are platforms that violate the standard and
8478 * convert 0 with the '0x' or '0X'
8479 * (Metrowerks, Compaq Tru64)
8480 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008481 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008482 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008483 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008484 * We can achieve the desired consistency by inserting our
8485 * own '0x' or '0X' prefix, and substituting %x/%X in place
8486 * of %#x/%#X.
8487 *
8488 * Note that this is the same approach as used in
8489 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008490 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008491 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8492 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008493 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008494 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008495 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8496 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008497 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008498 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008499 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008500 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008501 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008502 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008503}
8504
8505static int
8506formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008507 size_t buflen,
8508 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008509{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008510 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008511 if (PyUnicode_Check(v)) {
8512 if (PyUnicode_GET_SIZE(v) != 1)
8513 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008514 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008515 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008516 else {
8517 /* Integer input truncated to a character */
8518 long x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008519 x = PyLong_AsLong(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008520 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008521 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008522#ifdef Py_UNICODE_WIDE
8523 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008524 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008525 "%c arg not in range(0x110000) "
8526 "(wide Python build)");
8527 return -1;
8528 }
8529#else
8530 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008531 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008532 "%c arg not in range(0x10000) "
8533 "(narrow Python build)");
8534 return -1;
8535 }
8536#endif
8537 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008538 }
8539 buf[1] = '\0';
8540 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008541
8542 onError:
8543 PyErr_SetString(PyExc_TypeError,
8544 "%c requires int or char");
8545 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008546}
8547
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so the macro behaves as a single
   operand wherever it is used (e.g. after a unary operator).
*/
#define FORMATBUFLEN ((size_t)120)
8557
Guido van Rossumd57fd912000-03-10 22:53:23 +00008558PyObject *PyUnicode_Format(PyObject *format,
8559 PyObject *args)
8560{
8561 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008562 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008563 int args_owned = 0;
8564 PyUnicodeObject *result = NULL;
8565 PyObject *dict = NULL;
8566 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008567
Guido van Rossumd57fd912000-03-10 22:53:23 +00008568 if (format == NULL || args == NULL) {
8569 PyErr_BadInternalCall();
8570 return NULL;
8571 }
8572 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008573 if (uformat == NULL)
8574 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008575 fmt = PyUnicode_AS_UNICODE(uformat);
8576 fmtcnt = PyUnicode_GET_SIZE(uformat);
8577
8578 reslen = rescnt = fmtcnt + 100;
8579 result = _PyUnicode_New(reslen);
8580 if (result == NULL)
8581 goto onError;
8582 res = PyUnicode_AS_UNICODE(result);
8583
8584 if (PyTuple_Check(args)) {
8585 arglen = PyTuple_Size(args);
8586 argidx = 0;
8587 }
8588 else {
8589 arglen = -1;
8590 argidx = -2;
8591 }
Christian Heimes90aa7642007-12-19 02:45:37 +00008592 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00008593 !PyUnicode_Check(args))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008594 dict = args;
8595
8596 while (--fmtcnt >= 0) {
8597 if (*fmt != '%') {
8598 if (--rescnt < 0) {
8599 rescnt = fmtcnt + 100;
8600 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008601 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008602 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008603 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8604 --rescnt;
8605 }
8606 *res++ = *fmt++;
8607 }
8608 else {
8609 /* Got a format specifier */
8610 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008611 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008612 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008613 Py_UNICODE c = '\0';
8614 Py_UNICODE fill;
Christian Heimesa612dc02008-02-24 13:08:18 +00008615 int isnumok;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008616 PyObject *v = NULL;
8617 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008618 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008619 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008620 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008621 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008622
8623 fmt++;
8624 if (*fmt == '(') {
8625 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008626 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008627 PyObject *key;
8628 int pcount = 1;
8629
8630 if (dict == NULL) {
8631 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008632 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008633 goto onError;
8634 }
8635 ++fmt;
8636 --fmtcnt;
8637 keystart = fmt;
8638 /* Skip over balanced parentheses */
8639 while (pcount > 0 && --fmtcnt >= 0) {
8640 if (*fmt == ')')
8641 --pcount;
8642 else if (*fmt == '(')
8643 ++pcount;
8644 fmt++;
8645 }
8646 keylen = fmt - keystart - 1;
8647 if (fmtcnt < 0 || pcount > 0) {
8648 PyErr_SetString(PyExc_ValueError,
8649 "incomplete format key");
8650 goto onError;
8651 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008652#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008653 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008654 then looked up since Python uses strings to hold
8655 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008656 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008657 key = PyUnicode_EncodeUTF8(keystart,
8658 keylen,
8659 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008660#else
8661 key = PyUnicode_FromUnicode(keystart, keylen);
8662#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008663 if (key == NULL)
8664 goto onError;
8665 if (args_owned) {
8666 Py_DECREF(args);
8667 args_owned = 0;
8668 }
8669 args = PyObject_GetItem(dict, key);
8670 Py_DECREF(key);
8671 if (args == NULL) {
8672 goto onError;
8673 }
8674 args_owned = 1;
8675 arglen = -1;
8676 argidx = -2;
8677 }
8678 while (--fmtcnt >= 0) {
8679 switch (c = *fmt++) {
8680 case '-': flags |= F_LJUST; continue;
8681 case '+': flags |= F_SIGN; continue;
8682 case ' ': flags |= F_BLANK; continue;
8683 case '#': flags |= F_ALT; continue;
8684 case '0': flags |= F_ZERO; continue;
8685 }
8686 break;
8687 }
8688 if (c == '*') {
8689 v = getnextarg(args, arglen, &argidx);
8690 if (v == NULL)
8691 goto onError;
Christian Heimes217cfd12007-12-02 14:31:20 +00008692 if (!PyLong_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008693 PyErr_SetString(PyExc_TypeError,
8694 "* wants int");
8695 goto onError;
8696 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008697 width = PyLong_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008698 if (width == -1 && PyErr_Occurred())
8699 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008700 if (width < 0) {
8701 flags |= F_LJUST;
8702 width = -width;
8703 }
8704 if (--fmtcnt >= 0)
8705 c = *fmt++;
8706 }
8707 else if (c >= '0' && c <= '9') {
8708 width = c - '0';
8709 while (--fmtcnt >= 0) {
8710 c = *fmt++;
8711 if (c < '0' || c > '9')
8712 break;
8713 if ((width*10) / 10 != width) {
8714 PyErr_SetString(PyExc_ValueError,
8715 "width too big");
8716 goto onError;
8717 }
8718 width = width*10 + (c - '0');
8719 }
8720 }
8721 if (c == '.') {
8722 prec = 0;
8723 if (--fmtcnt >= 0)
8724 c = *fmt++;
8725 if (c == '*') {
8726 v = getnextarg(args, arglen, &argidx);
8727 if (v == NULL)
8728 goto onError;
Christian Heimes217cfd12007-12-02 14:31:20 +00008729 if (!PyLong_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008730 PyErr_SetString(PyExc_TypeError,
8731 "* wants int");
8732 goto onError;
8733 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008734 prec = PyLong_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008735 if (prec == -1 && PyErr_Occurred())
8736 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008737 if (prec < 0)
8738 prec = 0;
8739 if (--fmtcnt >= 0)
8740 c = *fmt++;
8741 }
8742 else if (c >= '0' && c <= '9') {
8743 prec = c - '0';
8744 while (--fmtcnt >= 0) {
8745 c = Py_CHARMASK(*fmt++);
8746 if (c < '0' || c > '9')
8747 break;
8748 if ((prec*10) / 10 != prec) {
8749 PyErr_SetString(PyExc_ValueError,
8750 "prec too big");
8751 goto onError;
8752 }
8753 prec = prec*10 + (c - '0');
8754 }
8755 }
8756 } /* prec */
8757 if (fmtcnt >= 0) {
8758 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008759 if (--fmtcnt >= 0)
8760 c = *fmt++;
8761 }
8762 }
8763 if (fmtcnt < 0) {
8764 PyErr_SetString(PyExc_ValueError,
8765 "incomplete format");
8766 goto onError;
8767 }
8768 if (c != '%') {
8769 v = getnextarg(args, arglen, &argidx);
8770 if (v == NULL)
8771 goto onError;
8772 }
8773 sign = 0;
8774 fill = ' ';
8775 switch (c) {
8776
8777 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008778 pbuf = formatbuf;
8779 /* presume that buffer length is at least 1 */
8780 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008781 len = 1;
8782 break;
8783
8784 case 's':
8785 case 'r':
8786 if (PyUnicode_Check(v) && c == 's') {
8787 temp = v;
8788 Py_INCREF(temp);
8789 }
8790 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008791 if (c == 's')
Thomas Heller519a0422007-11-15 20:48:54 +00008792 temp = PyObject_Str(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008793 else
8794 temp = PyObject_Repr(v);
8795 if (temp == NULL)
8796 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008797 if (PyUnicode_Check(temp))
8798 /* nothing to do */;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008799 else {
8800 Py_DECREF(temp);
8801 PyErr_SetString(PyExc_TypeError,
8802 "%s argument has non-string str()");
8803 goto onError;
8804 }
8805 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008806 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008807 len = PyUnicode_GET_SIZE(temp);
8808 if (prec >= 0 && len > prec)
8809 len = prec;
8810 break;
8811
8812 case 'i':
8813 case 'd':
8814 case 'u':
8815 case 'o':
8816 case 'x':
8817 case 'X':
8818 if (c == 'i')
8819 c = 'd';
Christian Heimesa612dc02008-02-24 13:08:18 +00008820 isnumok = 0;
8821 if (PyNumber_Check(v)) {
8822 PyObject *iobj=NULL;
8823
8824 if (PyLong_Check(v)) {
8825 iobj = v;
8826 Py_INCREF(iobj);
8827 }
8828 else {
8829 iobj = PyNumber_Long(v);
8830 }
8831 if (iobj!=NULL) {
8832 if (PyLong_Check(iobj)) {
8833 isnumok = 1;
8834 temp = formatlong(iobj, flags, prec, c);
8835 Py_DECREF(iobj);
8836 if (!temp)
8837 goto onError;
8838 pbuf = PyUnicode_AS_UNICODE(temp);
8839 len = PyUnicode_GET_SIZE(temp);
8840 sign = 1;
8841 }
8842 else {
8843 Py_DECREF(iobj);
8844 }
8845 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008846 }
Christian Heimesa612dc02008-02-24 13:08:18 +00008847 if (!isnumok) {
8848 PyErr_Format(PyExc_TypeError,
8849 "%%%c format: a number is required, "
8850 "not %.200s", c, Py_TYPE(v)->tp_name);
Tim Peters38fd5b62000-09-21 05:43:11 +00008851 goto onError;
Tim Peters38fd5b62000-09-21 05:43:11 +00008852 }
8853 if (flags & F_ZERO)
8854 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008855 break;
8856
8857 case 'e':
8858 case 'E':
8859 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008860 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008861 case 'g':
8862 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008863 if (c == 'F')
8864 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008865 pbuf = formatbuf;
8866 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8867 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008868 if (len < 0)
8869 goto onError;
8870 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008871 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008872 fill = '0';
8873 break;
8874
8875 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008876 pbuf = formatbuf;
8877 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008878 if (len < 0)
8879 goto onError;
8880 break;
8881
8882 default:
8883 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008884 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00008885 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008886 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008887 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00008888 (Py_ssize_t)(fmt - 1 -
8889 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008890 goto onError;
8891 }
8892 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008893 if (*pbuf == '-' || *pbuf == '+') {
8894 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008895 len--;
8896 }
8897 else if (flags & F_SIGN)
8898 sign = '+';
8899 else if (flags & F_BLANK)
8900 sign = ' ';
8901 else
8902 sign = 0;
8903 }
8904 if (width < len)
8905 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008906 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008907 reslen -= rescnt;
8908 rescnt = width + fmtcnt + 100;
8909 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008910 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008911 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008912 PyErr_NoMemory();
8913 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008914 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008915 if (_PyUnicode_Resize(&result, reslen) < 0) {
8916 Py_XDECREF(temp);
8917 goto onError;
8918 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008919 res = PyUnicode_AS_UNICODE(result)
8920 + reslen - rescnt;
8921 }
8922 if (sign) {
8923 if (fill != ' ')
8924 *res++ = sign;
8925 rescnt--;
8926 if (width > len)
8927 width--;
8928 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008929 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008930 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008931 assert(pbuf[1] == c);
8932 if (fill != ' ') {
8933 *res++ = *pbuf++;
8934 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008935 }
Tim Petersfff53252001-04-12 18:38:48 +00008936 rescnt -= 2;
8937 width -= 2;
8938 if (width < 0)
8939 width = 0;
8940 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008941 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008942 if (width > len && !(flags & F_LJUST)) {
8943 do {
8944 --rescnt;
8945 *res++ = fill;
8946 } while (--width > len);
8947 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008948 if (fill == ' ') {
8949 if (sign)
8950 *res++ = sign;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008951 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008952 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008953 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008954 *res++ = *pbuf++;
8955 *res++ = *pbuf++;
8956 }
8957 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008958 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008959 res += len;
8960 rescnt -= len;
8961 while (--width >= len) {
8962 --rescnt;
8963 *res++ = ' ';
8964 }
8965 if (dict && (argidx < arglen) && c != '%') {
8966 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008967 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008968 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008969 goto onError;
8970 }
8971 Py_XDECREF(temp);
8972 } /* '%' */
8973 } /* until end */
8974 if (argidx < arglen && !dict) {
8975 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008976 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008977 goto onError;
8978 }
8979
Thomas Woutersa96affe2006-03-12 00:29:36 +00008980 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8981 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008982 if (args_owned) {
8983 Py_DECREF(args);
8984 }
8985 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008986 return (PyObject *)result;
8987
8988 onError:
8989 Py_XDECREF(result);
8990 Py_DECREF(uformat);
8991 if (args_owned) {
8992 Py_DECREF(args);
8993 }
8994 return NULL;
8995}
8996
Jeremy Hylton938ace62002-07-17 16:30:39 +00008997static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008998unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8999
Tim Peters6d6c1a32001-08-02 04:15:00 +00009000static PyObject *
9001unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9002{
9003 PyObject *x = NULL;
Guido van Rossum55b4a7b2007-07-11 09:28:11 +00009004 static char *kwlist[] = {"object", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00009005 char *encoding = NULL;
9006 char *errors = NULL;
9007
Guido van Rossume023fe02001-08-30 03:12:59 +00009008 if (type != &PyUnicode_Type)
9009 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009010 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
9011 kwlist, &x, &encoding, &errors))
9012 return NULL;
9013 if (x == NULL)
9014 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00009015 if (encoding == NULL && errors == NULL)
Thomas Heller519a0422007-11-15 20:48:54 +00009016 return PyObject_Str(x);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00009017 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00009018 return PyUnicode_FromEncodedObject(x, encoding, errors);
9019}
9020
Guido van Rossume023fe02001-08-30 03:12:59 +00009021static PyObject *
9022unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9023{
Tim Petersaf90b3e2001-09-12 05:18:58 +00009024 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009025 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009026
9027 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9028 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9029 if (tmp == NULL)
9030 return NULL;
9031 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00009032 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00009033 if (pnew == NULL) {
9034 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00009035 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00009036 }
Christian Heimesb186d002008-03-18 15:15:01 +00009037 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
Tim Petersaf90b3e2001-09-12 05:18:58 +00009038 if (pnew->str == NULL) {
9039 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00009040 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00009041 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00009042 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00009043 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00009044 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9045 pnew->length = n;
9046 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00009047 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00009048 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009049}
9050
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009051PyDoc_STRVAR(unicode_doc,
Collin Winterd474ce82007-08-07 19:42:11 +00009052"str(string [, encoding[, errors]]) -> object\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009053\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009054Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009055encoding defaults to the current default string encoding.\n\
9056errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009057
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009058static PyObject *unicode_iter(PyObject *seq);
9059
Guido van Rossumd57fd912000-03-10 22:53:23 +00009060PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009061 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossum84fc66d2007-05-03 17:18:26 +00009062 "str", /* tp_name */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009063 sizeof(PyUnicodeObject), /* tp_size */
9064 0, /* tp_itemsize */
9065 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00009066 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009067 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009068 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009069 0, /* tp_setattr */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009070 0, /* tp_compare */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009071 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009072 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009073 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009074 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009075 (hashfunc) unicode_hash, /* tp_hash*/
9076 0, /* tp_call*/
9077 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009078 PyObject_GenericGetAttr, /* tp_getattro */
9079 0, /* tp_setattro */
Alexandre Vassalotti70a23712007-10-14 02:05:51 +00009080 0, /* tp_as_buffer */
Thomas Wouters27d517b2007-02-25 20:39:11 +00009081 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
9082 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009083 unicode_doc, /* tp_doc */
9084 0, /* tp_traverse */
9085 0, /* tp_clear */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009086 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009087 0, /* tp_weaklistoffset */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009088 unicode_iter, /* tp_iter */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009089 0, /* tp_iternext */
9090 unicode_methods, /* tp_methods */
9091 0, /* tp_members */
9092 0, /* tp_getset */
Guido van Rossum3172c5d2007-10-16 18:12:55 +00009093 &PyBaseObject_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009094 0, /* tp_dict */
9095 0, /* tp_descr_get */
9096 0, /* tp_descr_set */
9097 0, /* tp_dictoffset */
9098 0, /* tp_init */
9099 0, /* tp_alloc */
9100 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00009101 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009102};
9103
9104/* Initialize the Unicode implementation */
9105
Thomas Wouters78890102000-07-22 19:25:51 +00009106void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009107{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009108 int i;
9109
Thomas Wouters477c8d52006-05-27 19:21:47 +00009110 /* XXX - move this array to unicodectype.c ? */
9111 Py_UNICODE linebreak[] = {
9112 0x000A, /* LINE FEED */
9113 0x000D, /* CARRIAGE RETURN */
9114 0x001C, /* FILE SEPARATOR */
9115 0x001D, /* GROUP SEPARATOR */
9116 0x001E, /* RECORD SEPARATOR */
9117 0x0085, /* NEXT LINE */
9118 0x2028, /* LINE SEPARATOR */
9119 0x2029, /* PARAGRAPH SEPARATOR */
9120 };
9121
Fred Drakee4315f52000-05-09 19:53:39 +00009122 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009123 free_list = NULL;
9124 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009125 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009126 if (!unicode_empty)
9127 return;
9128
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009129 for (i = 0; i < 256; i++)
9130 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009131 if (PyType_Ready(&PyUnicode_Type) < 0)
9132 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009133
9134 /* initialize the linebreak bloom filter */
9135 bloom_linebreak = make_bloom_mask(
9136 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9137 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009138
9139 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009140}
9141
9142/* Finalize the Unicode implementation */
9143
Christian Heimesa156e092008-02-16 07:38:31 +00009144int
9145PyUnicode_ClearFreeList(void)
9146{
9147 int freelist_size = numfree;
9148 PyUnicodeObject *u;
9149
9150 for (u = free_list; u != NULL;) {
9151 PyUnicodeObject *v = u;
9152 u = *(PyUnicodeObject **)u;
9153 if (v->str)
Christian Heimesb186d002008-03-18 15:15:01 +00009154 PyObject_DEL(v->str);
Christian Heimesa156e092008-02-16 07:38:31 +00009155 Py_XDECREF(v->defenc);
9156 PyObject_Del(v);
9157 numfree--;
9158 }
9159 free_list = NULL;
9160 assert(numfree == 0);
9161 return freelist_size;
9162}
9163
Guido van Rossumd57fd912000-03-10 22:53:23 +00009164void
Thomas Wouters78890102000-07-22 19:25:51 +00009165_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009166{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009167 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009168
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009169 Py_XDECREF(unicode_empty);
9170 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009171
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009172 for (i = 0; i < 256; i++) {
9173 if (unicode_latin1[i]) {
9174 Py_DECREF(unicode_latin1[i]);
9175 unicode_latin1[i] = NULL;
9176 }
9177 }
Christian Heimesa156e092008-02-16 07:38:31 +00009178 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009179}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009180
Walter Dörwald16807132007-05-25 13:52:07 +00009181void
9182PyUnicode_InternInPlace(PyObject **p)
9183{
9184 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9185 PyObject *t;
9186 if (s == NULL || !PyUnicode_Check(s))
9187 Py_FatalError(
9188 "PyUnicode_InternInPlace: unicode strings only please!");
9189 /* If it's a subclass, we don't really know what putting
9190 it in the interned dict might do. */
9191 if (!PyUnicode_CheckExact(s))
9192 return;
9193 if (PyUnicode_CHECK_INTERNED(s))
9194 return;
9195 if (interned == NULL) {
9196 interned = PyDict_New();
9197 if (interned == NULL) {
9198 PyErr_Clear(); /* Don't leave an exception */
9199 return;
9200 }
9201 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009202 /* It might be that the GetItem call fails even
9203 though the key is present in the dictionary,
9204 namely when this happens during a stack overflow. */
9205 Py_ALLOW_RECURSION
Walter Dörwald16807132007-05-25 13:52:07 +00009206 t = PyDict_GetItem(interned, (PyObject *)s);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009207 Py_END_ALLOW_RECURSION
9208
Walter Dörwald16807132007-05-25 13:52:07 +00009209 if (t) {
9210 Py_INCREF(t);
9211 Py_DECREF(*p);
9212 *p = t;
9213 return;
9214 }
9215
Martin v. Löwis5b222132007-06-10 09:51:05 +00009216 PyThreadState_GET()->recursion_critical = 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009217 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9218 PyErr_Clear();
Martin v. Löwis5b222132007-06-10 09:51:05 +00009219 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009220 return;
9221 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009222 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009223 /* The two references in interned are not counted by refcnt.
9224 The deallocator will take care of this */
Christian Heimes90aa7642007-12-19 02:45:37 +00009225 Py_REFCNT(s) -= 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009226 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
9227}
9228
9229void
9230PyUnicode_InternImmortal(PyObject **p)
9231{
9232 PyUnicode_InternInPlace(p);
9233 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9234 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9235 Py_INCREF(*p);
9236 }
9237}
9238
9239PyObject *
9240PyUnicode_InternFromString(const char *cp)
9241{
9242 PyObject *s = PyUnicode_FromString(cp);
9243 if (s == NULL)
9244 return NULL;
9245 PyUnicode_InternInPlace(&s);
9246 return s;
9247}
9248
9249void _Py_ReleaseInternedUnicodeStrings(void)
9250{
9251 PyObject *keys;
9252 PyUnicodeObject *s;
9253 Py_ssize_t i, n;
9254 Py_ssize_t immortal_size = 0, mortal_size = 0;
9255
9256 if (interned == NULL || !PyDict_Check(interned))
9257 return;
9258 keys = PyDict_Keys(interned);
9259 if (keys == NULL || !PyList_Check(keys)) {
9260 PyErr_Clear();
9261 return;
9262 }
9263
9264 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9265 detector, interned unicode strings are not forcibly deallocated;
9266 rather, we give them their stolen references back, and then clear
9267 and DECREF the interned dict. */
9268
9269 n = PyList_GET_SIZE(keys);
9270 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
9271 n);
9272 for (i = 0; i < n; i++) {
9273 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9274 switch (s->state) {
9275 case SSTATE_NOT_INTERNED:
9276 /* XXX Shouldn't happen */
9277 break;
9278 case SSTATE_INTERNED_IMMORTAL:
Christian Heimes90aa7642007-12-19 02:45:37 +00009279 Py_REFCNT(s) += 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009280 immortal_size += s->length;
9281 break;
9282 case SSTATE_INTERNED_MORTAL:
Christian Heimes90aa7642007-12-19 02:45:37 +00009283 Py_REFCNT(s) += 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009284 mortal_size += s->length;
9285 break;
9286 default:
9287 Py_FatalError("Inconsistent interned string state.");
9288 }
9289 s->state = SSTATE_NOT_INTERNED;
9290 }
9291 fprintf(stderr, "total size of all interned strings: "
9292 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9293 "mortal/immortal\n", mortal_size, immortal_size);
9294 Py_DECREF(keys);
9295 PyDict_Clear(interned);
9296 Py_DECREF(interned);
9297 interned = NULL;
9298}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009299
9300
9301/********************* Unicode Iterator **************************/
9302
9303typedef struct {
9304 PyObject_HEAD
Guido van Rossum49d6b072006-08-17 21:11:47 +00009305 Py_ssize_t it_index;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009306 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
9307} unicodeiterobject;
9308
9309static void
9310unicodeiter_dealloc(unicodeiterobject *it)
9311{
9312 _PyObject_GC_UNTRACK(it);
9313 Py_XDECREF(it->it_seq);
9314 PyObject_GC_Del(it);
9315}
9316
9317static int
9318unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9319{
9320 Py_VISIT(it->it_seq);
9321 return 0;
9322}
9323
9324static PyObject *
9325unicodeiter_next(unicodeiterobject *it)
9326{
9327 PyUnicodeObject *seq;
9328 PyObject *item;
9329
9330 assert(it != NULL);
9331 seq = it->it_seq;
9332 if (seq == NULL)
9333 return NULL;
9334 assert(PyUnicode_Check(seq));
9335
9336 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009337 item = PyUnicode_FromUnicode(
9338 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009339 if (item != NULL)
9340 ++it->it_index;
9341 return item;
9342 }
9343
9344 Py_DECREF(seq);
9345 it->it_seq = NULL;
9346 return NULL;
9347}
9348
9349static PyObject *
9350unicodeiter_len(unicodeiterobject *it)
9351{
9352 Py_ssize_t len = 0;
9353 if (it->it_seq)
9354 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
Christian Heimes217cfd12007-12-02 14:31:20 +00009355 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009356}
9357
9358PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9359
9360static PyMethodDef unicodeiter_methods[] = {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009361 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
9362 length_hint_doc},
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009363 {NULL, NULL} /* sentinel */
9364};
9365
9366PyTypeObject PyUnicodeIter_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009367 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Christian Heimesf83be4e2007-11-28 09:44:38 +00009368 "str_iterator", /* tp_name */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009369 sizeof(unicodeiterobject), /* tp_basicsize */
9370 0, /* tp_itemsize */
9371 /* methods */
Guido van Rossum49d6b072006-08-17 21:11:47 +00009372 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009373 0, /* tp_print */
9374 0, /* tp_getattr */
9375 0, /* tp_setattr */
9376 0, /* tp_compare */
9377 0, /* tp_repr */
9378 0, /* tp_as_number */
9379 0, /* tp_as_sequence */
9380 0, /* tp_as_mapping */
9381 0, /* tp_hash */
9382 0, /* tp_call */
9383 0, /* tp_str */
9384 PyObject_GenericGetAttr, /* tp_getattro */
9385 0, /* tp_setattro */
9386 0, /* tp_as_buffer */
9387 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9388 0, /* tp_doc */
9389 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9390 0, /* tp_clear */
9391 0, /* tp_richcompare */
9392 0, /* tp_weaklistoffset */
9393 PyObject_SelfIter, /* tp_iter */
9394 (iternextfunc)unicodeiter_next, /* tp_iternext */
9395 unicodeiter_methods, /* tp_methods */
9396 0,
9397};
9398
9399static PyObject *
9400unicode_iter(PyObject *seq)
9401{
9402 unicodeiterobject *it;
9403
9404 if (!PyUnicode_Check(seq)) {
9405 PyErr_BadInternalCall();
9406 return NULL;
9407 }
9408 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9409 if (it == NULL)
9410 return NULL;
9411 it->it_index = 0;
9412 Py_INCREF(seq);
9413 it->it_seq = (PyUnicodeObject *)seq;
9414 _PyObject_GC_TRACK(it);
9415 return (PyObject *)it;
9416}
9417
Martin v. Löwis5b222132007-06-10 09:51:05 +00009418size_t
9419Py_UNICODE_strlen(const Py_UNICODE *u)
9420{
9421 int res = 0;
9422 while(*u++)
9423 res++;
9424 return res;
9425}
9426
9427Py_UNICODE*
9428Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9429{
9430 Py_UNICODE *u = s1;
9431 while ((*u++ = *s2++));
9432 return s1;
9433}
9434
9435Py_UNICODE*
9436Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9437{
9438 Py_UNICODE *u = s1;
9439 while ((*u++ = *s2++))
9440 if (n-- == 0)
9441 break;
9442 return s1;
9443}
9444
9445int
9446Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9447{
9448 while (*s1 && *s2 && *s1 == *s2)
9449 s1++, s2++;
9450 if (*s1 && *s2)
9451 return (*s1 < *s2) ? -1 : +1;
9452 if (*s1)
9453 return 1;
9454 if (*s2)
9455 return -1;
9456 return 0;
9457}
9458
9459Py_UNICODE*
9460Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9461{
9462 const Py_UNICODE *p;
9463 for (p = s; *p; p++)
9464 if (*p == c)
9465 return (Py_UNICODE*)p;
9466 return NULL;
9467}
9468
9469
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009470#ifdef __cplusplus
9471}
9472#endif
9473
9474
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009475/*
9476Local variables:
9477c-basic-offset: 4
9478indent-tabs-mode: nil
9479End:
9480*/