blob: fe49ec8dbe417588ff6d388f539b8747eafb4d82 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Eric Smith8c663262007-08-25 02:26:07 +000049#include "formatter_unicode.h"
50
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000051#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000052#include <windows.h>
53#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000054
Guido van Rossumd57fd912000-03-10 22:53:23 +000055/* Limit for the Unicode object free list */
56
Christian Heimes2202f872008-02-06 14:31:34 +000057#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000058
59/* Limit for the Unicode object free list stay alive optimization.
60
61 The implementation will keep allocated Unicode memory intact for
62 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000063 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000064
Christian Heimes2202f872008-02-06 14:31:34 +000065 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000066 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000067 malloc()-overhead) bytes of unused garbage.
68
69 Setting the limit to 0 effectively turns the feature off.
70
Guido van Rossumfd4b9572000-04-10 13:51:10 +000071 Note: This is an experimental feature ! If you get core dumps when
72 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000073
74*/
75
Guido van Rossumfd4b9572000-04-10 13:51:10 +000076#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000077
78/* Endianness switches; defaults to little endian */
79
80#ifdef WORDS_BIGENDIAN
81# define BYTEORDER_IS_BIG_ENDIAN
82#else
83# define BYTEORDER_IS_LITTLE_ENDIAN
84#endif
85
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000086/* --- Globals ------------------------------------------------------------
87
88 The globals are initialized by the _PyUnicode_Init() API and should
89 not be used before calling that API.
90
91*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000092
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000093
94#ifdef __cplusplus
95extern "C" {
96#endif
97
Walter Dörwald16807132007-05-25 13:52:07 +000098/* This dictionary holds all interned unicode strings. Note that references
99 to strings in this dictionary are *not* counted in the string's ob_refcnt.
100 When the interned string reaches a refcnt of 0 the string deallocation
101 function will delete the reference from this dictionary.
102
103 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000104 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000105*/
106static PyObject *interned;
107
Guido van Rossumd57fd912000-03-10 22:53:23 +0000108/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000109static PyUnicodeObject *free_list;
110static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000111
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000112/* The empty Unicode object is shared to improve performance. */
113static PyUnicodeObject *unicode_empty;
114
115/* Single character Unicode strings in the Latin-1 range are being
116 shared as well. */
117static PyUnicodeObject *unicode_latin1[256];
118
Fred Drakee4315f52000-05-09 19:53:39 +0000119/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000120 parameter; it is fixed to "utf-8". Always use the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000121 PyUnicode_GetDefaultEncoding() API to access this global.
122
Alexandre Vassalotti3d2fd7f2007-10-16 00:26:33 +0000123 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000124 hard coded default!
125*/
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000126static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000127
Christian Heimes190d79e2008-01-30 11:58:22 +0000128/* Fast detection of the most frequent whitespace characters */
/* Lookup table indexed by a character code in the range 0..127.  An entry
   is 1 for the whitespace characters named in the comments below and 0
   for every other code. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
    /* case 0x0009: HORIZONTAL TABULATION */
    /* case 0x000A: LINE FEED */
    /* case 0x000B: VERTICAL TABULATION */
    /* case 0x000C: FORM FEED */
    /* case 0x000D: CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    /* case 0x001C: FILE SEPARATOR */
    /* case 0x001D: GROUP SEPARATOR */
    /* case 0x001E: RECORD SEPARATOR */
    /* case 0x001F: UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* case 0x0020: SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
158
159/* Same for linebreaks */
/* Lookup table indexed by a character code in the range 0..127.  An entry
   is 1 for the line break characters named in the comments below and 0
   for every other code.  The table is only ever read, so it is const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x000A: LINE FEED */
    /* 0x000D: CARRIAGE RETURN */
    0, 0, 1, 0, 0, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x001C: FILE SEPARATOR */
    /* 0x001D: GROUP SEPARATOR */
    /* 0x001E: RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
184
185
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000186Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000187PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000188{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000189#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000190 return 0x10FFFF;
191#else
192 /* This is actually an illegal character, so it should
193 not be passed to unichr. */
194 return 0xFFFF;
195#endif
196}
197
Thomas Wouters477c8d52006-05-27 19:21:47 +0000198/* --- Bloom Filters ----------------------------------------------------- */
199
200/* stuff to implement simple "bloom filters" for Unicode characters.
201 to keep things simple, we use a single bitmask, using the least 5
202 bits from each unicode characters as the bit index. */
203
204/* the linebreak mask is set up by Unicode_Init below */
205
206#define BLOOM_MASK unsigned long
207
208static BLOOM_MASK bloom_linebreak;
209
/* Non-zero when the bit selected by the low 5 bits of ch is set in mask.
   The shifted constant is unsigned long so that a shift count of 31 is
   well defined (1 << 31 overflows a 32-bit int), and mask is parenthesized
   so that compound expressions can be passed safely. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))
211
Christian Heimes190d79e2008-01-30 11:58:22 +0000212#define BLOOM_LINEBREAK(ch) \
213 ((ch) < 128U ? ascii_linebreak[(ch)] : \
214 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000215
216Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
217{
218 /* calculate simple bloom-style bitmask for a given unicode string */
219
220 long mask;
221 Py_ssize_t i;
222
223 mask = 0;
224 for (i = 0; i < len; i++)
225 mask |= (1 << (ptr[i] & 0x1F));
226
227 return mask;
228}
229
230Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
231{
232 Py_ssize_t i;
233
234 for (i = 0; i < setlen; i++)
235 if (set[i] == chr)
236 return 1;
237
238 return 0;
239}
240
/* True when chr is a member of set: the bloom mask is consulted first as a
   cheap filter and the exact scan only runs when the mask bit is set.  The
   whole expansion is parenthesized so the macro behaves as a single
   operand next to operators such as ! or ||. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
243
Guido van Rossumd57fd912000-03-10 22:53:23 +0000244/* --- Unicode Object ----------------------------------------------------- */
245
246static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000247int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000248 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000249{
250 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000251
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000252 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000254 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000255
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000256 /* Resizing shared object (unicode_empty or single character
257 objects) in-place is not allowed. Use PyUnicode_Resize()
258 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000259
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000260 if (unicode == unicode_empty ||
261 (unicode->length == 1 &&
262 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000263 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000265 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 return -1;
267 }
268
Thomas Wouters477c8d52006-05-27 19:21:47 +0000269 /* We allocate one more byte to make sure the string is Ux0000 terminated.
270 The overallocation is also used by fastsearch, which assumes that it's
271 safe to look at str[length] (without making any assumptions about what
272 it contains). */
273
Guido van Rossumd57fd912000-03-10 22:53:23 +0000274 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000275 unicode->str = PyObject_REALLOC(unicode->str,
276 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000278 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000279 PyErr_NoMemory();
280 return -1;
281 }
282 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000283 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000285 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000287 if (unicode->defenc) {
288 Py_DECREF(unicode->defenc);
289 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290 }
291 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000292
Guido van Rossumd57fd912000-03-10 22:53:23 +0000293 return 0;
294}
295
296/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000297 Ux0000 terminated; some code (e.g. new_identifier)
298 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299
300 XXX This allocator could further be enhanced by assuring that the
301 free list never reduces its size below 1.
302
303*/
304
305static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000306PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000307{
308 register PyUnicodeObject *unicode;
309
Thomas Wouters477c8d52006-05-27 19:21:47 +0000310 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000311 if (length == 0 && unicode_empty != NULL) {
312 Py_INCREF(unicode_empty);
313 return unicode_empty;
314 }
315
316 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000317 if (free_list) {
318 unicode = free_list;
319 free_list = *(PyUnicodeObject **)unicode;
320 numfree--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000321 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000322 /* Keep-Alive optimization: we only upsize the buffer,
323 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000324 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000325 unicode_resize(unicode, length) < 0) {
Christian Heimesb186d002008-03-18 15:15:01 +0000326 PyObject_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000327 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000328 }
329 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000330 else {
Christian Heimesb186d002008-03-18 15:15:01 +0000331 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
332 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000333 }
334 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000335 }
336 else {
Christian Heimesb186d002008-03-18 15:15:01 +0000337 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000338 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000339 if (unicode == NULL)
340 return NULL;
Christian Heimesb186d002008-03-18 15:15:01 +0000341 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
342 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343 }
344
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000345 if (!unicode->str) {
346 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000347 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000348 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000349 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000350 * the caller fails before initializing str -- unicode_resize()
351 * reads str[0], and the Keep-Alive optimization can keep memory
352 * allocated for str alive across a call to unicode_dealloc(unicode).
353 * We don't want unicode_resize to read uninitialized memory in
354 * that case.
355 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000356 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000357 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000358 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000359 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000360 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000361 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000362 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000363
364 onError:
365 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000366 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000367 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000368}
369
370static
Guido van Rossum9475a232001-10-05 20:51:39 +0000371void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000372{
Walter Dörwald16807132007-05-25 13:52:07 +0000373 switch (PyUnicode_CHECK_INTERNED(unicode)) {
374 case SSTATE_NOT_INTERNED:
375 break;
376
377 case SSTATE_INTERNED_MORTAL:
378 /* revive dead object temporarily for DelItem */
Christian Heimes90aa7642007-12-19 02:45:37 +0000379 Py_REFCNT(unicode) = 3;
Walter Dörwald16807132007-05-25 13:52:07 +0000380 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
381 Py_FatalError(
382 "deletion of interned unicode string failed");
383 break;
384
385 case SSTATE_INTERNED_IMMORTAL:
386 Py_FatalError("Immortal interned unicode string died.");
387
388 default:
389 Py_FatalError("Inconsistent interned unicode string state.");
390 }
391
Guido van Rossum604ddf82001-12-06 20:03:56 +0000392 if (PyUnicode_CheckExact(unicode) &&
Christian Heimes2202f872008-02-06 14:31:34 +0000393 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000394 /* Keep-Alive optimization */
395 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Christian Heimesb186d002008-03-18 15:15:01 +0000396 PyObject_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000397 unicode->str = NULL;
398 unicode->length = 0;
399 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000400 if (unicode->defenc) {
401 Py_DECREF(unicode->defenc);
402 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000403 }
404 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000405 *(PyUnicodeObject **)unicode = free_list;
406 free_list = unicode;
407 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000408 }
409 else {
Christian Heimesb186d002008-03-18 15:15:01 +0000410 PyObject_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000411 Py_XDECREF(unicode->defenc);
Christian Heimes90aa7642007-12-19 02:45:37 +0000412 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000413 }
414}
415
Martin v. Löwis18e16552006-02-15 17:27:45 +0000416int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000417{
418 register PyUnicodeObject *v;
419
420 /* Argument checks */
421 if (unicode == NULL) {
422 PyErr_BadInternalCall();
423 return -1;
424 }
425 v = (PyUnicodeObject *)*unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000426 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000427 PyErr_BadInternalCall();
428 return -1;
429 }
430
431 /* Resizing unicode_empty and single character objects is not
432 possible since these are being shared. We simply return a fresh
433 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000434 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000435 (v == unicode_empty || v->length == 1)) {
436 PyUnicodeObject *w = _PyUnicode_New(length);
437 if (w == NULL)
438 return -1;
439 Py_UNICODE_COPY(w->str, v->str,
440 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000441 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000442 *unicode = (PyObject *)w;
443 return 0;
444 }
445
446 /* Note that we don't have to modify *unicode for unshared Unicode
447 objects, since we can modify them in-place. */
448 return unicode_resize(v, length);
449}
450
451/* Internal API for use in unicodeobject.c only ! */
452#define _PyUnicode_Resize(unicodevar, length) \
453 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
454
Guido van Rossumd57fd912000-03-10 22:53:23 +0000455PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000456 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000457{
458 PyUnicodeObject *unicode;
459
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000460 /* If the Unicode data is known at construction time, we can apply
461 some optimizations which share commonly used objects. */
462 if (u != NULL) {
463
464 /* Optimization for empty strings */
465 if (size == 0 && unicode_empty != NULL) {
466 Py_INCREF(unicode_empty);
467 return (PyObject *)unicode_empty;
468 }
469
470 /* Single character Unicode objects in the Latin-1 range are
471 shared when using this constructor */
472 if (size == 1 && *u < 256) {
473 unicode = unicode_latin1[*u];
474 if (!unicode) {
475 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000476 if (!unicode)
477 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000478 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000479 unicode_latin1[*u] = unicode;
480 }
481 Py_INCREF(unicode);
482 return (PyObject *)unicode;
483 }
484 }
Tim Petersced69f82003-09-16 20:30:58 +0000485
Guido van Rossumd57fd912000-03-10 22:53:23 +0000486 unicode = _PyUnicode_New(size);
487 if (!unicode)
488 return NULL;
489
490 /* Copy the Unicode data into the new object */
491 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000492 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000493
494 return (PyObject *)unicode;
495}
496
Walter Dörwaldd2034312007-05-18 16:29:38 +0000497PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000498{
499 PyUnicodeObject *unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000500 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000501 some optimizations which share commonly used objects.
502 Also, this means the input must be UTF-8, so fall back to the
503 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000504 if (u != NULL) {
505
506 /* Optimization for empty strings */
507 if (size == 0 && unicode_empty != NULL) {
508 Py_INCREF(unicode_empty);
509 return (PyObject *)unicode_empty;
510 }
511
Martin v. Löwis9c121062007-08-05 20:26:11 +0000512 /* Single characters are shared when using this constructor.
513 Restrict to ASCII, since the input must be UTF-8. */
514 if (size == 1 && Py_CHARMASK(*u) < 128) {
Christian Heimesbbe741d2008-03-28 10:53:29 +0000515 unicode = unicode_latin1[Py_CHARMASK(*u)];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000516 if (!unicode) {
517 unicode = _PyUnicode_New(1);
518 if (!unicode)
519 return NULL;
Guido van Rossum00058aa2007-07-19 18:21:28 +0000520 unicode->str[0] = Py_CHARMASK(*u);
Christian Heimesbbe741d2008-03-28 10:53:29 +0000521 unicode_latin1[Py_CHARMASK(*u)] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000522 }
523 Py_INCREF(unicode);
524 return (PyObject *)unicode;
525 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000526
527 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000528 }
529
Walter Dörwald55507312007-05-18 13:12:10 +0000530 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000531 if (!unicode)
532 return NULL;
533
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000534 return (PyObject *)unicode;
535}
536
Walter Dörwaldd2034312007-05-18 16:29:38 +0000537PyObject *PyUnicode_FromString(const char *u)
538{
539 size_t size = strlen(u);
540 if (size > PY_SSIZE_T_MAX) {
541 PyErr_SetString(PyExc_OverflowError, "input too long");
542 return NULL;
543 }
544
545 return PyUnicode_FromStringAndSize(u, size);
546}
547
Guido van Rossumd57fd912000-03-10 22:53:23 +0000548#ifdef HAVE_WCHAR_H
549
550PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000551 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000552{
553 PyUnicodeObject *unicode;
554
555 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000556 if (size == 0)
557 return PyUnicode_FromStringAndSize(NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000558 PyErr_BadInternalCall();
559 return NULL;
560 }
561
Martin v. Löwis790465f2008-04-05 20:41:37 +0000562 if (size == -1) {
563 size = wcslen(w);
564 }
565
Guido van Rossumd57fd912000-03-10 22:53:23 +0000566 unicode = _PyUnicode_New(size);
567 if (!unicode)
568 return NULL;
569
570 /* Copy the wchar_t data into the new object */
571#ifdef HAVE_USABLE_WCHAR_T
572 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000573#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000574 {
575 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000576 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000577 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000578 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000579 *u++ = *w++;
580 }
581#endif
582
583 return (PyObject *)unicode;
584}
585
Walter Dörwald346737f2007-05-31 10:44:43 +0000586static void
587makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
588{
589 *fmt++ = '%';
590 if (width) {
591 if (zeropad)
592 *fmt++ = '0';
593 fmt += sprintf(fmt, "%d", width);
594 }
595 if (precision)
596 fmt += sprintf(fmt, ".%d", precision);
597 if (longflag)
598 *fmt++ = 'l';
599 else if (size_tflag) {
600 char *f = PY_FORMAT_SIZE_T;
601 while (*f)
602 *fmt++ = *f++;
603 }
604 *fmt++ = c;
605 *fmt = '\0';
606}
607
Walter Dörwaldd2034312007-05-18 16:29:38 +0000608#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
609
610PyObject *
611PyUnicode_FromFormatV(const char *format, va_list vargs)
612{
613 va_list count;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000614 Py_ssize_t callcount = 0;
615 PyObject **callresults = NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +0000616 PyObject **callresult = NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000617 Py_ssize_t n = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000618 int width = 0;
619 int precision = 0;
620 int zeropad;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000621 const char* f;
622 Py_UNICODE *s;
623 PyObject *string;
624 /* used by sprintf */
625 char buffer[21];
Walter Dörwald346737f2007-05-31 10:44:43 +0000626 /* use abuffer instead of buffer, if we need more space
627 * (which can happen if there's a format specifier with width). */
628 char *abuffer = NULL;
629 char *realbuffer;
630 Py_ssize_t abuffersize = 0;
631 char fmt[60]; /* should be enough for %0width.precisionld */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000632 const char *copy;
633
634#ifdef VA_LIST_IS_ARRAY
635 Py_MEMCPY(count, vargs, sizeof(va_list));
636#else
637#ifdef __va_copy
638 __va_copy(count, vargs);
639#else
640 count = vargs;
641#endif
642#endif
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000643 /* step 1: count the number of %S/%R format specifications
Thomas Heller519a0422007-11-15 20:48:54 +0000644 * (we call PyObject_Str()/PyObject_Repr() for these objects
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000645 * once during step 3 and put the result in an array) */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000646 for (f = format; *f; f++) {
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000647 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000648 ++callcount;
649 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000650 /* step 2: allocate memory for the results of
Thomas Heller519a0422007-11-15 20:48:54 +0000651 * PyObject_Str()/PyObject_Repr() calls */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000652 if (callcount) {
Christian Heimesb186d002008-03-18 15:15:01 +0000653 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000654 if (!callresults) {
655 PyErr_NoMemory();
656 return NULL;
657 }
658 callresult = callresults;
659 }
660 /* step 3: figure out how large a buffer we need */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000661 for (f = format; *f; f++) {
662 if (*f == '%') {
663 const char* p = f;
Walter Dörwald346737f2007-05-31 10:44:43 +0000664 width = 0;
Christian Heimesfe337bf2008-03-23 21:54:12 +0000665 while (ISDIGIT((unsigned)*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000666 width = (width*10) + *f++ - '0';
Christian Heimesfe337bf2008-03-23 21:54:12 +0000667 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
Walter Dörwaldd2034312007-05-18 16:29:38 +0000668 ;
669
670 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
671 * they don't affect the amount of space we reserve.
672 */
673 if ((*f == 'l' || *f == 'z') &&
674 (f[1] == 'd' || f[1] == 'u'))
Eric Smithddd25822007-08-27 11:33:42 +0000675 ++f;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000676
677 switch (*f) {
678 case 'c':
679 (void)va_arg(count, int);
680 /* fall through... */
681 case '%':
682 n++;
683 break;
684 case 'd': case 'u': case 'i': case 'x':
685 (void) va_arg(count, int);
686 /* 20 bytes is enough to hold a 64-bit
687 integer. Decimal takes the most space.
Walter Dörwald346737f2007-05-31 10:44:43 +0000688 This isn't enough for octal.
689 If a width is specified we need more
690 (which we allocate later). */
691 if (width < 20)
692 width = 20;
693 n += width;
694 if (abuffersize < width)
695 abuffersize = width;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000696 break;
697 case 's':
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000698 {
699 /* UTF-8 */
700 unsigned char*s;
701 s = va_arg(count, unsigned char*);
702 while (*s) {
703 if (*s < 128) {
704 n++; s++;
705 } else if (*s < 0xc0) {
706 /* invalid UTF-8 */
707 n++; s++;
708 } else if (*s < 0xc0) {
709 n++;
710 s++; if(!*s)break;
711 s++;
712 } else if (*s < 0xe0) {
713 n++;
714 s++; if(!*s)break;
715 s++; if(!*s)break;
716 s++;
717 } else {
718 #ifdef Py_UNICODE_WIDE
719 n++;
720 #else
721 n+=2;
722 #endif
723 s++; if(!*s)break;
724 s++; if(!*s)break;
725 s++; if(!*s)break;
726 s++;
727 }
728 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000729 break;
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000730 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000731 case 'U':
732 {
733 PyObject *obj = va_arg(count, PyObject *);
734 assert(obj && PyUnicode_Check(obj));
735 n += PyUnicode_GET_SIZE(obj);
736 break;
737 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000738 case 'V':
739 {
740 PyObject *obj = va_arg(count, PyObject *);
741 const char *str = va_arg(count, const char *);
742 assert(obj || str);
743 assert(!obj || PyUnicode_Check(obj));
744 if (obj)
745 n += PyUnicode_GET_SIZE(obj);
746 else
747 n += strlen(str);
748 break;
749 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000750 case 'S':
751 {
752 PyObject *obj = va_arg(count, PyObject *);
753 PyObject *str;
754 assert(obj);
Thomas Heller519a0422007-11-15 20:48:54 +0000755 str = PyObject_Str(obj);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000756 if (!str)
757 goto fail;
758 n += PyUnicode_GET_SIZE(str);
759 /* Remember the str and switch to the next slot */
760 *callresult++ = str;
761 break;
762 }
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000763 case 'R':
764 {
765 PyObject *obj = va_arg(count, PyObject *);
766 PyObject *repr;
767 assert(obj);
768 repr = PyObject_Repr(obj);
769 if (!repr)
770 goto fail;
771 n += PyUnicode_GET_SIZE(repr);
772 /* Remember the repr and switch to the next slot */
773 *callresult++ = repr;
774 break;
775 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000776 case 'p':
777 (void) va_arg(count, int);
778 /* maximum 64-bit pointer representation:
779 * 0xffffffffffffffff
780 * so 19 characters is enough.
781 * XXX I count 18 -- what's the extra for?
782 */
783 n += 19;
784 break;
785 default:
786 /* if we stumble upon an unknown
787 formatting code, copy the rest of
788 the format string to the output
789 string. (we cannot just skip the
790 code, since there's no way to know
791 what's in the argument list) */
792 n += strlen(p);
793 goto expand;
794 }
795 } else
796 n++;
797 }
798 expand:
Walter Dörwald346737f2007-05-31 10:44:43 +0000799 if (abuffersize > 20) {
Christian Heimesb186d002008-03-18 15:15:01 +0000800 abuffer = PyObject_Malloc(abuffersize);
Walter Dörwald346737f2007-05-31 10:44:43 +0000801 if (!abuffer) {
802 PyErr_NoMemory();
803 goto fail;
804 }
805 realbuffer = abuffer;
806 }
807 else
808 realbuffer = buffer;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000809 /* step 4: fill the buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000810 /* Since we've analyzed how much space we need for the worst case,
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000811 we don't have to resize the string.
812 There can be no errors beyond this point. */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000813 string = PyUnicode_FromUnicode(NULL, n);
814 if (!string)
Walter Dörwald346737f2007-05-31 10:44:43 +0000815 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000816
817 s = PyUnicode_AS_UNICODE(string);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000818 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000819
820 for (f = format; *f; f++) {
821 if (*f == '%') {
822 const char* p = f++;
823 int longflag = 0;
824 int size_tflag = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000825 zeropad = (*f == '0');
826 /* parse the width.precision part */
827 width = 0;
Christian Heimesfe337bf2008-03-23 21:54:12 +0000828 while (ISDIGIT((unsigned)*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000829 width = (width*10) + *f++ - '0';
830 precision = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000831 if (*f == '.') {
832 f++;
Christian Heimesfe337bf2008-03-23 21:54:12 +0000833 while (ISDIGIT((unsigned)*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000834 precision = (precision*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000835 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000836 /* handle the long flag, but only for %ld and %lu.
837 others can be added when necessary. */
838 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
839 longflag = 1;
840 ++f;
841 }
842 /* handle the size_t flag. */
843 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
844 size_tflag = 1;
845 ++f;
846 }
847
848 switch (*f) {
849 case 'c':
850 *s++ = va_arg(vargs, int);
851 break;
852 case 'd':
Walter Dörwald346737f2007-05-31 10:44:43 +0000853 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000854 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000855 sprintf(realbuffer, fmt, va_arg(vargs, long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000856 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000857 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000858 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000859 sprintf(realbuffer, fmt, va_arg(vargs, int));
860 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000861 break;
862 case 'u':
Walter Dörwald346737f2007-05-31 10:44:43 +0000863 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000864 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000865 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000866 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000867 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000868 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000869 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
870 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000871 break;
872 case 'i':
Walter Dörwald346737f2007-05-31 10:44:43 +0000873 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
874 sprintf(realbuffer, fmt, va_arg(vargs, int));
875 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000876 break;
877 case 'x':
Walter Dörwald346737f2007-05-31 10:44:43 +0000878 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
879 sprintf(realbuffer, fmt, va_arg(vargs, int));
880 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000881 break;
882 case 's':
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000883 {
884 /* Parameter must be UTF-8 encoded.
885 In case of encoding errors, use
886 the replacement character. */
887 PyObject *u;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000888 p = va_arg(vargs, char*);
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000889 u = PyUnicode_DecodeUTF8(p, strlen(p),
890 "replace");
891 if (!u)
892 goto fail;
893 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
894 PyUnicode_GET_SIZE(u));
895 s += PyUnicode_GET_SIZE(u);
896 Py_DECREF(u);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000897 break;
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000898 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000899 case 'U':
900 {
901 PyObject *obj = va_arg(vargs, PyObject *);
Walter Dörwald5c2fab62007-05-24 19:51:02 +0000902 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
903 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
904 s += size;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000905 break;
906 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000907 case 'V':
908 {
909 PyObject *obj = va_arg(vargs, PyObject *);
910 const char *str = va_arg(vargs, const char *);
911 if (obj) {
912 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
913 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
914 s += size;
915 } else {
916 appendstring(str);
917 }
918 break;
919 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000920 case 'S':
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000921 case 'R':
922 {
Guido van Rossum755114a2007-06-13 01:04:27 +0000923 Py_UNICODE *ucopy;
924 Py_ssize_t usize;
925 Py_ssize_t upos;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000926 /* unused, since we already have the result */
927 (void) va_arg(vargs, PyObject *);
Guido van Rossum755114a2007-06-13 01:04:27 +0000928 ucopy = PyUnicode_AS_UNICODE(*callresult);
929 usize = PyUnicode_GET_SIZE(*callresult);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000930 for (upos = 0; upos<usize;)
931 *s++ = ucopy[upos++];
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000932 /* We're done with the unicode()/repr() => forget it */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000933 Py_DECREF(*callresult);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000934 /* switch to next unicode()/repr() result */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000935 ++callresult;
936 break;
937 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000938 case 'p':
939 sprintf(buffer, "%p", va_arg(vargs, void*));
940 /* %p is ill-defined: ensure leading 0x. */
941 if (buffer[1] == 'X')
942 buffer[1] = 'x';
943 else if (buffer[1] != 'x') {
944 memmove(buffer+2, buffer, strlen(buffer)+1);
945 buffer[0] = '0';
946 buffer[1] = 'x';
947 }
948 appendstring(buffer);
949 break;
950 case '%':
951 *s++ = '%';
952 break;
953 default:
954 appendstring(p);
955 goto end;
956 }
957 } else
958 *s++ = *f;
959 }
960
961 end:
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000962 if (callresults)
Christian Heimesb186d002008-03-18 15:15:01 +0000963 PyObject_Free(callresults);
Walter Dörwald346737f2007-05-31 10:44:43 +0000964 if (abuffer)
Christian Heimesb186d002008-03-18 15:15:01 +0000965 PyObject_Free(abuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000966 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
967 return string;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000968 fail:
969 if (callresults) {
970 PyObject **callresult2 = callresults;
Guido van Rossum307fa8c2007-07-16 20:46:27 +0000971 while (callresult2 < callresult) {
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000972 Py_DECREF(*callresult2);
973 ++callresult2;
974 }
Christian Heimesb186d002008-03-18 15:15:01 +0000975 PyObject_Free(callresults);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000976 }
Walter Dörwald346737f2007-05-31 10:44:43 +0000977 if (abuffer)
Christian Heimesb186d002008-03-18 15:15:01 +0000978 PyObject_Free(abuffer);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000979 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000980}
981
982#undef appendstring
983
984PyObject *
985PyUnicode_FromFormat(const char *format, ...)
986{
987 PyObject* ret;
988 va_list vargs;
989
990#ifdef HAVE_STDARG_PROTOTYPES
991 va_start(vargs, format);
992#else
993 va_start(vargs);
994#endif
995 ret = PyUnicode_FromFormatV(format, vargs);
996 va_end(vargs);
997 return ret;
998}
999
Martin v. Löwis18e16552006-02-15 17:27:45 +00001000Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
1001 wchar_t *w,
1002 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001003{
1004 if (unicode == NULL) {
1005 PyErr_BadInternalCall();
1006 return -1;
1007 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001008
1009 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001010 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001011 size = PyUnicode_GET_SIZE(unicode) + 1;
1012
Guido van Rossumd57fd912000-03-10 22:53:23 +00001013#ifdef HAVE_USABLE_WCHAR_T
1014 memcpy(w, unicode->str, size * sizeof(wchar_t));
1015#else
1016 {
1017 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001018 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001019 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +00001020 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001021 *w++ = *u++;
1022 }
1023#endif
1024
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001025 if (size > PyUnicode_GET_SIZE(unicode))
1026 return PyUnicode_GET_SIZE(unicode);
1027 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00001028 return size;
1029}
1030
1031#endif
1032
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001033PyObject *PyUnicode_FromOrdinal(int ordinal)
1034{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001035 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001036
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001037 if (ordinal < 0 || ordinal > 0x10ffff) {
1038 PyErr_SetString(PyExc_ValueError,
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001039 "chr() arg not in range(0x110000)");
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001040 return NULL;
1041 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001042
1043#ifndef Py_UNICODE_WIDE
1044 if (ordinal > 0xffff) {
1045 ordinal -= 0x10000;
1046 s[0] = 0xD800 | (ordinal >> 10);
1047 s[1] = 0xDC00 | (ordinal & 0x3FF);
1048 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001049 }
1050#endif
1051
Hye-Shik Chang40574832004-04-06 07:24:51 +00001052 s[0] = (Py_UNICODE)ordinal;
1053 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001054}
1055
Guido van Rossumd57fd912000-03-10 22:53:23 +00001056PyObject *PyUnicode_FromObject(register PyObject *obj)
1057{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001058 /* XXX Perhaps we should make this API an alias of
Thomas Heller519a0422007-11-15 20:48:54 +00001059 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001060 if (PyUnicode_CheckExact(obj)) {
1061 Py_INCREF(obj);
1062 return obj;
1063 }
1064 if (PyUnicode_Check(obj)) {
1065 /* For a Unicode subtype that's not a Unicode object,
1066 return a true Unicode object with the same data. */
1067 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1068 PyUnicode_GET_SIZE(obj));
1069 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001070 PyErr_Format(PyExc_TypeError,
1071 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001072 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001073 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001074}
1075
1076PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1077 const char *encoding,
1078 const char *errors)
1079{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001080 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001081 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001082 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001083
Guido van Rossumd57fd912000-03-10 22:53:23 +00001084 if (obj == NULL) {
1085 PyErr_BadInternalCall();
1086 return NULL;
1087 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001088
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001089 if (PyUnicode_Check(obj)) {
1090 PyErr_SetString(PyExc_TypeError,
1091 "decoding Unicode is not supported");
1092 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001093 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001094
1095 /* Coerce object */
1096 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001097 s = PyString_AS_STRING(obj);
1098 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001099 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001100 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1101 /* Overwrite the error message with something more useful in
1102 case of a TypeError. */
1103 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001104 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001105 "coercing to Unicode: need string or buffer, "
1106 "%.80s found",
Christian Heimes90aa7642007-12-19 02:45:37 +00001107 Py_TYPE(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001108 goto onError;
1109 }
Tim Petersced69f82003-09-16 20:30:58 +00001110
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001111 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001112 if (len == 0) {
1113 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001114 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001115 }
Tim Petersced69f82003-09-16 20:30:58 +00001116 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001117 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001118
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001119 return v;
1120
1121 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001122 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001123}
1124
1125PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001126 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001127 const char *encoding,
1128 const char *errors)
1129{
1130 PyObject *buffer = NULL, *unicode;
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001131 Py_buffer info;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001132 char lower[20]; /* Enough for any encoding name we recognize */
1133 char *l;
1134 const char *e;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001135
1136 if (encoding == NULL)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001137 encoding = PyUnicode_GetDefaultEncoding();
1138
1139 /* Convert encoding to lower case and replace '_' with '-' in order to
1140 catch e.g. UTF_8 */
1141 e = encoding;
1142 l = lower;
1143 while (*e && l < &lower[(sizeof lower) - 2]) {
1144 if (ISUPPER(*e)) {
1145 *l++ = TOLOWER(*e++);
1146 }
1147 else if (*e == '_') {
1148 *l++ = '-';
1149 e++;
1150 }
1151 else {
1152 *l++ = *e++;
1153 }
1154 }
1155 *l = '\0';
Fred Drakee4315f52000-05-09 19:53:39 +00001156
1157 /* Shortcuts for common default encodings */
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001158 if (strcmp(lower, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001159 return PyUnicode_DecodeUTF8(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001160 else if ((strcmp(lower, "latin-1") == 0) ||
1161 (strcmp(lower, "iso-8859-1") == 0))
Fred Drakee4315f52000-05-09 19:53:39 +00001162 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001163#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001164 else if (strcmp(lower, "mbcs") == 0)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001165 return PyUnicode_DecodeMBCS(s, size, errors);
1166#endif
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001167 else if (strcmp(lower, "ascii") == 0)
Fred Drakee4315f52000-05-09 19:53:39 +00001168 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001169 else if (strcmp(lower, "utf-16") == 0)
1170 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1171 else if (strcmp(lower, "utf-32") == 0)
1172 return PyUnicode_DecodeUTF32(s, size, errors, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001173
1174 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001175 buffer = NULL;
1176 if (PyBuffer_FillInfo(&info, (void *)s, size, 1, PyBUF_SIMPLE) < 0)
1177 goto onError;
1178 buffer = PyMemoryView_FromMemory(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001179 if (buffer == NULL)
1180 goto onError;
1181 unicode = PyCodec_Decode(buffer, encoding, errors);
1182 if (unicode == NULL)
1183 goto onError;
1184 if (!PyUnicode_Check(unicode)) {
1185 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001186 "decoder did not return an unicode object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001187 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001188 Py_DECREF(unicode);
1189 goto onError;
1190 }
1191 Py_DECREF(buffer);
1192 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001193
Guido van Rossumd57fd912000-03-10 22:53:23 +00001194 onError:
1195 Py_XDECREF(buffer);
1196 return NULL;
1197}
1198
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001199PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1200 const char *encoding,
1201 const char *errors)
1202{
1203 PyObject *v;
1204
1205 if (!PyUnicode_Check(unicode)) {
1206 PyErr_BadArgument();
1207 goto onError;
1208 }
1209
1210 if (encoding == NULL)
1211 encoding = PyUnicode_GetDefaultEncoding();
1212
1213 /* Decode via the codec registry */
1214 v = PyCodec_Decode(unicode, encoding, errors);
1215 if (v == NULL)
1216 goto onError;
1217 return v;
1218
1219 onError:
1220 return NULL;
1221}
1222
Guido van Rossumd57fd912000-03-10 22:53:23 +00001223PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001224 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001225 const char *encoding,
1226 const char *errors)
1227{
1228 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001229
Guido van Rossumd57fd912000-03-10 22:53:23 +00001230 unicode = PyUnicode_FromUnicode(s, size);
1231 if (unicode == NULL)
1232 return NULL;
1233 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1234 Py_DECREF(unicode);
1235 return v;
1236}
1237
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001238PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1239 const char *encoding,
1240 const char *errors)
1241{
1242 PyObject *v;
1243
1244 if (!PyUnicode_Check(unicode)) {
1245 PyErr_BadArgument();
1246 goto onError;
1247 }
1248
1249 if (encoding == NULL)
1250 encoding = PyUnicode_GetDefaultEncoding();
1251
1252 /* Encode via the codec registry */
1253 v = PyCodec_Encode(unicode, encoding, errors);
1254 if (v == NULL)
1255 goto onError;
1256 return v;
1257
1258 onError:
1259 return NULL;
1260}
1261
Guido van Rossumd57fd912000-03-10 22:53:23 +00001262PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1263 const char *encoding,
1264 const char *errors)
1265{
1266 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001267
Guido van Rossumd57fd912000-03-10 22:53:23 +00001268 if (!PyUnicode_Check(unicode)) {
1269 PyErr_BadArgument();
1270 goto onError;
1271 }
Fred Drakee4315f52000-05-09 19:53:39 +00001272
Tim Petersced69f82003-09-16 20:30:58 +00001273 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001274 encoding = PyUnicode_GetDefaultEncoding();
1275
1276 /* Shortcuts for common default encodings */
1277 if (errors == NULL) {
1278 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001279 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001280 else if (strcmp(encoding, "latin-1") == 0)
1281 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001282#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1283 else if (strcmp(encoding, "mbcs") == 0)
1284 return PyUnicode_AsMBCSString(unicode);
1285#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001286 else if (strcmp(encoding, "ascii") == 0)
1287 return PyUnicode_AsASCIIString(unicode);
1288 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001289
1290 /* Encode via the codec registry */
1291 v = PyCodec_Encode(unicode, encoding, errors);
1292 if (v == NULL)
1293 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00001294 assert(PyString_Check(v));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001295 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001296
Guido van Rossumd57fd912000-03-10 22:53:23 +00001297 onError:
1298 return NULL;
1299}
1300
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001301PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1302 const char *errors)
1303{
1304 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001305 if (v)
1306 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001307 if (errors != NULL)
1308 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001309 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001310 PyUnicode_GET_SIZE(unicode),
1311 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001312 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001313 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001314 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001315 return v;
1316}
1317
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001318PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001319PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001320 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001321 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1322}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001323
Christian Heimes5894ba72007-11-04 11:43:14 +00001324PyObject*
1325PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1326{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001327 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1328 can be undefined. If it is case, decode using UTF-8. The following assumes
1329 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1330 bootstrapping process where the codecs aren't ready yet.
1331 */
1332 if (Py_FileSystemDefaultEncoding) {
1333#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001334 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001335 return PyUnicode_DecodeMBCS(s, size, "replace");
1336 }
1337#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001338 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001339 return PyUnicode_DecodeUTF8(s, size, "replace");
1340 }
1341#endif
1342 return PyUnicode_Decode(s, size,
1343 Py_FileSystemDefaultEncoding,
1344 "replace");
1345 }
1346 else {
1347 return PyUnicode_DecodeUTF8(s, size, "replace");
1348 }
1349}
1350
Martin v. Löwis5b222132007-06-10 09:51:05 +00001351char*
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001352PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001353{
Christian Heimesf3863112007-11-22 07:46:41 +00001354 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001355 if (!PyUnicode_Check(unicode)) {
1356 PyErr_BadArgument();
1357 return NULL;
1358 }
Christian Heimesf3863112007-11-22 07:46:41 +00001359 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1360 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001361 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001362 if (psize != NULL)
Christian Heimesf3863112007-11-22 07:46:41 +00001363 *psize = PyString_GET_SIZE(bytes);
1364 return PyString_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001365}
1366
1367char*
1368PyUnicode_AsString(PyObject *unicode)
1369{
1370 return PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001371}
1372
Guido van Rossumd57fd912000-03-10 22:53:23 +00001373Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1374{
1375 if (!PyUnicode_Check(unicode)) {
1376 PyErr_BadArgument();
1377 goto onError;
1378 }
1379 return PyUnicode_AS_UNICODE(unicode);
1380
1381 onError:
1382 return NULL;
1383}
1384
Martin v. Löwis18e16552006-02-15 17:27:45 +00001385Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001386{
1387 if (!PyUnicode_Check(unicode)) {
1388 PyErr_BadArgument();
1389 goto onError;
1390 }
1391 return PyUnicode_GET_SIZE(unicode);
1392
1393 onError:
1394 return -1;
1395}
1396
Thomas Wouters78890102000-07-22 19:25:51 +00001397const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001398{
1399 return unicode_default_encoding;
1400}
1401
1402int PyUnicode_SetDefaultEncoding(const char *encoding)
1403{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001404 if (strcmp(encoding, unicode_default_encoding) != 0) {
1405 PyErr_Format(PyExc_ValueError,
1406 "Can only set default encoding to %s",
1407 unicode_default_encoding);
1408 return -1;
1409 }
Fred Drakee4315f52000-05-09 19:53:39 +00001410 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001411}
1412
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001413/* error handling callback helper:
1414 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001415 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001416 and adjust various state variables.
1417 return 0 on success, -1 on error
1418*/
1419
1420static
1421int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1422 const char *encoding, const char *reason,
Walter Dörwalda651d3d2007-08-30 15:29:21 +00001423 const char **input, const char **inend, Py_ssize_t *startinpos,
1424 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001425 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001426{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001427 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001428
1429 PyObject *restuple = NULL;
1430 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001431 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001432 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001433 Py_ssize_t requiredsize;
1434 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001435 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001436 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001437 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001438 int res = -1;
1439
1440 if (*errorHandler == NULL) {
1441 *errorHandler = PyCodec_LookupError(errors);
1442 if (*errorHandler == NULL)
1443 goto onError;
1444 }
1445
1446 if (*exceptionObject == NULL) {
1447 *exceptionObject = PyUnicodeDecodeError_Create(
Walter Dörwalde78178e2007-07-30 13:31:40 +00001448 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001449 if (*exceptionObject == NULL)
1450 goto onError;
1451 }
1452 else {
1453 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1454 goto onError;
1455 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1456 goto onError;
1457 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1458 goto onError;
1459 }
1460
1461 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1462 if (restuple == NULL)
1463 goto onError;
1464 if (!PyTuple_Check(restuple)) {
1465 PyErr_Format(PyExc_TypeError, &argparse[4]);
1466 goto onError;
1467 }
1468 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1469 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001470
1471 /* Copy back the bytes variables, which might have been modified by the
1472 callback */
1473 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1474 if (!inputobj)
1475 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00001476 if (!PyString_Check(inputobj)) {
Walter Dörwalde78178e2007-07-30 13:31:40 +00001477 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
1478 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001479 *input = PyString_AS_STRING(inputobj);
1480 insize = PyString_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001481 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001482 /* we can DECREF safely, as the exception has another reference,
1483 so the object won't go away. */
1484 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001485
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001486 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001487 newpos = insize+newpos;
1488 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001489 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001490 goto onError;
1491 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001492
1493 /* need more space? (at least enough for what we
1494 have+the replacement+the rest of the string (starting
1495 at the new input position), so we won't have to check space
1496 when there are no errors in the rest of the string) */
1497 repptr = PyUnicode_AS_UNICODE(repunicode);
1498 repsize = PyUnicode_GET_SIZE(repunicode);
1499 requiredsize = *outpos + repsize + insize-newpos;
1500 if (requiredsize > outsize) {
1501 if (requiredsize<2*outsize)
1502 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001503 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001504 goto onError;
1505 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1506 }
1507 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001508 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001509 Py_UNICODE_COPY(*outptr, repptr, repsize);
1510 *outptr += repsize;
1511 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001512
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001513 /* we made it! */
1514 res = 0;
1515
1516 onError:
1517 Py_XDECREF(restuple);
1518 return res;
1519}
1520
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001521/* --- UTF-7 Codec -------------------------------------------------------- */
1522
1523/* see RFC2152 for details */
1524
/* Per-character classification table for the UTF-7 codec, indexed by the
   ASCII code point (0..127).  It records whether a character is "special",
   i.e. cannot be directly encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
static char utf7_special[128] = {
    /* 0x00 - 0x0f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */ 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
1543
/* SPECIAL(c, encodeO, encodeWS): true when character c has to be treated as
   special by the UTF-7 codec.  Anything outside 1..127 is special; within
   that range utf7_special[] decides, with classes 2 (whitespace) and 3
   (Set O) counted only when the matching flag is true.

   Note: the comparison (c) <= 0 is a trick to work around gcc warnings
   about the comparison always being false; since utf7_special[0] is 1,
   making that one comparison true does not change the result. */
#define SPECIAL(c, encodeO, encodeWS)                           \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 ||         \
     ((encodeWS) && utf7_special[(c)] == 2) ||                  \
     ((encodeO) && utf7_special[(c)] == 3))

/* B64(n): base64 alphabet character for the low six bits of n. */
#define B64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true when c belongs to the base64 alphabet. */
#define B64CHAR(c) \
    (ISALNUM(c) || (c) == '+' || (c) == '/')

/* UB64(c): six-bit value of base64 character c (inverse of B64).  The
   result is only meaningful when B64CHAR(c) is true. */
#define UB64(c)                         \
    ((c) == '+' ? 62 :                  \
     (c) == '/' ? 63 :                  \
     (c) >= 'a' ? (c) - 71 :            \
     (c) >= 'A' ? (c) - 65 :            \
                  (c) + 4)

/* ENCODE(out, ch, bits): while at least six bits are pending in the bit
   accumulator ch, write the topmost six as one base64 character through
   out.  out and bits are updated in place. */
#define ENCODE(out, ch, bits)                           \
    do {                                                \
        while ((bits) >= 6) {                           \
            *(out)++ = B64((ch) >> ((bits) - 6));       \
            (bits) -= 6;                                \
        }                                               \
    } while (0)

/* DECODE(out, ch, bits, surrogate): while at least sixteen bits are pending
   in the bit accumulator ch, extract the topmost sixteen as one code unit.

   - If the surrogate flag is set, an error was already generated for the
     previous unit, so this unit is dropped without further checks and the
     flag is cleared.
   - A unit in 0xDC00..0xDFFF is treated as part of a surrogate pair, which
     cannot be represented in a 16-bit character: the flag is set, errmsg
     is assigned and control jumps to the utf7Error label.
   - Any other unit is written through out.

   The expansion relies on a variable `errmsg` and a label `utf7Error`
   existing in the function that uses it. */
#define DECODE(out, ch, bits, surrogate)                                \
    do {                                                                \
        while ((bits) >= 16) {                                          \
            Py_UNICODE utf7_unit =                                      \
                (Py_UNICODE) (((ch) >> ((bits) - 16)) & 0xffff);        \
            (bits) -= 16;                                               \
            if (surrogate) {                                            \
                (surrogate) = 0;                                        \
            }                                                           \
            else if (0xDC00 <= utf7_unit && utf7_unit <= 0xDFFF) {      \
                (surrogate) = 1;                                        \
                errmsg = "code pairs are not supported";                \
                goto utf7Error;                                         \
            }                                                           \
            else {                                                      \
                *(out)++ = utf7_unit;                                   \
            }                                                           \
        }                                                               \
    } while (0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001586
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001587PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001588 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001589 const char *errors)
1590{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001591 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1592}
1593
1594PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
1595 Py_ssize_t size,
1596 const char *errors,
1597 Py_ssize_t *consumed)
1598{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001599 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001600 Py_ssize_t startinpos;
1601 Py_ssize_t endinpos;
1602 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001603 const char *e;
1604 PyUnicodeObject *unicode;
1605 Py_UNICODE *p;
1606 const char *errmsg = "";
1607 int inShift = 0;
1608 unsigned int bitsleft = 0;
1609 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001610 int surrogate = 0;
1611 PyObject *errorHandler = NULL;
1612 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001613
1614 unicode = _PyUnicode_New(size);
1615 if (!unicode)
1616 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001617 if (size == 0) {
1618 if (consumed)
1619 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001620 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001621 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001622
1623 p = unicode->str;
1624 e = s + size;
1625
1626 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001627 Py_UNICODE ch;
1628 restart:
1629 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001630
1631 if (inShift) {
1632 if ((ch == '-') || !B64CHAR(ch)) {
1633 inShift = 0;
1634 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001635
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001636 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1637 if (bitsleft >= 6) {
1638 /* The shift sequence has a partial character in it. If
1639 bitsleft < 6 then we could just classify it as padding
1640 but that is not the case here */
1641
1642 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001643 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001644 }
1645 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001646 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001647 here so indicate the potential of a misencoded character. */
1648
1649 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1650 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1651 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001652 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001653 }
1654
1655 if (ch == '-') {
1656 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001657 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001658 inShift = 1;
1659 }
1660 } else if (SPECIAL(ch,0,0)) {
1661 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001662 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001663 } else {
1664 *p++ = ch;
1665 }
1666 } else {
1667 charsleft = (charsleft << 6) | UB64(ch);
1668 bitsleft += 6;
1669 s++;
1670 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1671 }
1672 }
1673 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001674 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001675 s++;
1676 if (s < e && *s == '-') {
1677 s++;
1678 *p++ = '+';
1679 } else
1680 {
1681 inShift = 1;
1682 bitsleft = 0;
1683 }
1684 }
1685 else if (SPECIAL(ch,0,0)) {
Walter Dörwald2b65c752007-08-30 15:35:26 +00001686 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001687 errmsg = "unexpected special character";
1688 s++;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001689 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001690 }
1691 else {
1692 *p++ = ch;
1693 s++;
1694 }
1695 continue;
1696 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001697 outpos = p-PyUnicode_AS_UNICODE(unicode);
1698 endinpos = s-starts;
1699 if (unicode_decode_call_errorhandler(
1700 errors, &errorHandler,
1701 "utf7", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001702 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001703 (PyObject **)&unicode, &outpos, &p))
1704 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001705 }
1706
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001707 if (inShift && !consumed) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001708 outpos = p-PyUnicode_AS_UNICODE(unicode);
1709 endinpos = size;
1710 if (unicode_decode_call_errorhandler(
1711 errors, &errorHandler,
1712 "utf7", "unterminated shift sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00001713 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001714 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001715 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001716 if (s < e)
1717 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001718 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001719 if (consumed) {
1720 if(inShift)
1721 *consumed = startinpos;
1722 else
1723 *consumed = s-starts;
1724 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001725
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001726 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001727 goto onError;
1728
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001729 Py_XDECREF(errorHandler);
1730 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001731 return (PyObject *)unicode;
1732
1733onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001734 Py_XDECREF(errorHandler);
1735 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001736 Py_DECREF(unicode);
1737 return NULL;
1738}
1739
1740
1741PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001742 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001743 int encodeSetO,
1744 int encodeWhiteSpace,
1745 const char *errors)
1746{
Guido van Rossum98297ee2007-11-06 21:34:58 +00001747 PyObject *v, *result;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001748 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001749 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001750 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001751 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001752 unsigned int bitsleft = 0;
1753 unsigned long charsleft = 0;
1754 char * out;
1755 char * start;
1756
1757 if (size == 0)
Christian Heimesf3863112007-11-22 07:46:41 +00001758 return PyString_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001759
Walter Dörwald51ab4142007-05-05 14:43:36 +00001760 v = PyBytes_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001761 if (v == NULL)
1762 return NULL;
1763
Walter Dörwald51ab4142007-05-05 14:43:36 +00001764 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001765 for (;i < size; ++i) {
1766 Py_UNICODE ch = s[i];
1767
1768 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001769 if (ch == '+') {
1770 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001771 *out++ = '-';
1772 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1773 charsleft = ch;
1774 bitsleft = 16;
1775 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001776 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001777 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001778 } else {
1779 *out++ = (char) ch;
1780 }
1781 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001782 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1783 *out++ = B64(charsleft << (6-bitsleft));
1784 charsleft = 0;
1785 bitsleft = 0;
1786 /* Characters not in the BASE64 set implicitly unshift the sequence
1787 so no '-' is required, except if the character is itself a '-' */
1788 if (B64CHAR(ch) || ch == '-') {
1789 *out++ = '-';
1790 }
1791 inShift = 0;
1792 *out++ = (char) ch;
1793 } else {
1794 bitsleft += 16;
1795 charsleft = (charsleft << 16) | ch;
1796 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1797
1798 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001799 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001800 or '-' then the shift sequence will be terminated implicitly and we
1801 don't have to insert a '-'. */
1802
1803 if (bitsleft == 0) {
1804 if (i + 1 < size) {
1805 Py_UNICODE ch2 = s[i+1];
1806
1807 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001808
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001809 } else if (B64CHAR(ch2) || ch2 == '-') {
1810 *out++ = '-';
1811 inShift = 0;
1812 } else {
1813 inShift = 0;
1814 }
1815
1816 }
1817 else {
1818 *out++ = '-';
1819 inShift = 0;
1820 }
1821 }
Tim Petersced69f82003-09-16 20:30:58 +00001822 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001823 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001824 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001825 if (bitsleft) {
1826 *out++= B64(charsleft << (6-bitsleft) );
1827 *out++ = '-';
1828 }
1829
Guido van Rossum98297ee2007-11-06 21:34:58 +00001830 result = PyString_FromStringAndSize(PyBytes_AS_STRING(v), out - start);
1831 Py_DECREF(v);
1832 return result;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001833}
1834
1835#undef SPECIAL
1836#undef B64
1837#undef B64CHAR
1838#undef UB64
1839#undef ENCODE
1840#undef DECODE
1841
Guido van Rossumd57fd912000-03-10 22:53:23 +00001842/* --- UTF-8 Codec -------------------------------------------------------- */
1843
/* Map UTF-8 encoded prefix byte to sequence length.  Zero means illegal
   prefix.  See RFC 2279 for details. */
static char utf8_code_length[256] = {
    /* 0x00 - 0x0f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x10 - 0x1f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x30 - 0x3f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x40 - 0x4f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 - 0x5f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x60 - 0x6f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 - 0x7f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0x8f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90 - 0x9f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xa0 - 0xaf */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xb0 - 0xbf */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xc0 - 0xcf */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xd0 - 0xdf */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xe0 - 0xef */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xf0 - 0xff */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1865
Guido van Rossumd57fd912000-03-10 22:53:23 +00001866PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001867 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001868 const char *errors)
1869{
Walter Dörwald69652032004-09-07 20:24:22 +00001870 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1871}
1872
1873PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001874 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001875 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001876 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001877{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001878 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001879 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001880 Py_ssize_t startinpos;
1881 Py_ssize_t endinpos;
1882 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001883 const char *e;
1884 PyUnicodeObject *unicode;
1885 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001886 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001887 PyObject *errorHandler = NULL;
1888 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001889
1890 /* Note: size will always be longer than the resulting Unicode
1891 character count */
1892 unicode = _PyUnicode_New(size);
1893 if (!unicode)
1894 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001895 if (size == 0) {
1896 if (consumed)
1897 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001898 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001899 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001900
1901 /* Unpack UTF-8 encoded data */
1902 p = unicode->str;
1903 e = s + size;
1904
1905 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001906 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001907
1908 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001909 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001910 s++;
1911 continue;
1912 }
1913
1914 n = utf8_code_length[ch];
1915
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001916 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001917 if (consumed)
1918 break;
1919 else {
1920 errmsg = "unexpected end of data";
1921 startinpos = s-starts;
1922 endinpos = size;
1923 goto utf8Error;
1924 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001925 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001926
1927 switch (n) {
1928
1929 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001930 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001931 startinpos = s-starts;
1932 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001933 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001934
1935 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001936 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001937 startinpos = s-starts;
1938 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001939 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001940
1941 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001942 if ((s[1] & 0xc0) != 0x80) {
1943 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001944 startinpos = s-starts;
1945 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001946 goto utf8Error;
1947 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001948 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001949 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001950 startinpos = s-starts;
1951 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001952 errmsg = "illegal encoding";
1953 goto utf8Error;
1954 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001955 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001956 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001957 break;
1958
1959 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001960 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001961 (s[2] & 0xc0) != 0x80) {
1962 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001963 startinpos = s-starts;
1964 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001965 goto utf8Error;
1966 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001967 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001968 if (ch < 0x0800) {
1969 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001970 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001971
1972 XXX For wide builds (UCS-4) we should probably try
1973 to recombine the surrogates into a single code
1974 unit.
1975 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001976 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001977 startinpos = s-starts;
1978 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001979 goto utf8Error;
1980 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001981 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001982 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001983 break;
1984
1985 case 4:
1986 if ((s[1] & 0xc0) != 0x80 ||
1987 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001988 (s[3] & 0xc0) != 0x80) {
1989 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001990 startinpos = s-starts;
1991 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001992 goto utf8Error;
1993 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001994 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1995 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1996 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001997 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001998 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001999 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002000 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002001 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002002 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002003 startinpos = s-starts;
2004 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002005 goto utf8Error;
2006 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00002007#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002008 *p++ = (Py_UNICODE)ch;
2009#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002010 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002011
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002012 /* translate from 10000..10FFFF to 0..FFFF */
2013 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002014
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002015 /* high surrogate = top 10 bits added to D800 */
2016 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002017
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002018 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002019 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002020#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002021 break;
2022
2023 default:
2024 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002025 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002026 startinpos = s-starts;
2027 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002028 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002029 }
2030 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002031 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002032
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002033 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002034 outpos = p-PyUnicode_AS_UNICODE(unicode);
2035 if (unicode_decode_call_errorhandler(
2036 errors, &errorHandler,
2037 "utf8", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002038 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002039 (PyObject **)&unicode, &outpos, &p))
2040 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002041 }
Walter Dörwald69652032004-09-07 20:24:22 +00002042 if (consumed)
2043 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002044
2045 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002046 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002047 goto onError;
2048
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002049 Py_XDECREF(errorHandler);
2050 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002051 return (PyObject *)unicode;
2052
2053onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002054 Py_XDECREF(errorHandler);
2055 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002056 Py_DECREF(unicode);
2057 return NULL;
2058}
2059
Tim Peters602f7402002-04-27 18:03:26 +00002060/* Allocation strategy: if the string is short, convert into a stack buffer
2061 and allocate exactly as much space needed at the end. Else allocate the
2062 maximum possible needed (4 result bytes per Unicode character), and return
2063 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002064*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002065PyObject *
2066PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002067 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00002068 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002069{
Tim Peters602f7402002-04-27 18:03:26 +00002070#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002071
Guido van Rossum98297ee2007-11-06 21:34:58 +00002072 Py_ssize_t i; /* index into s of next input byte */
2073 PyObject *result; /* result string object */
2074 char *p; /* next free byte in output buffer */
2075 Py_ssize_t nallocated; /* number of result bytes allocated */
2076 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002077 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002078
Tim Peters602f7402002-04-27 18:03:26 +00002079 assert(s != NULL);
2080 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002081
Tim Peters602f7402002-04-27 18:03:26 +00002082 if (size <= MAX_SHORT_UNICHARS) {
2083 /* Write into the stack buffer; nallocated can't overflow.
2084 * At the end, we'll allocate exactly as much heap space as it
2085 * turns out we need.
2086 */
2087 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002088 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002089 p = stackbuf;
2090 }
2091 else {
2092 /* Overallocate on the heap, and give the excess back at the end. */
2093 nallocated = size * 4;
2094 if (nallocated / 4 != size) /* overflow! */
2095 return PyErr_NoMemory();
Guido van Rossum98297ee2007-11-06 21:34:58 +00002096 result = PyString_FromStringAndSize(NULL, nallocated);
2097 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002098 return NULL;
Guido van Rossum98297ee2007-11-06 21:34:58 +00002099 p = PyString_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002100 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002101
Tim Peters602f7402002-04-27 18:03:26 +00002102 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002103 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002104
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002105 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002106 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002107 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002108
Guido van Rossumd57fd912000-03-10 22:53:23 +00002109 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002110 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002111 *p++ = (char)(0xc0 | (ch >> 6));
2112 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002113 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002114 else {
Tim Peters602f7402002-04-27 18:03:26 +00002115 /* Encode UCS2 Unicode ordinals */
2116 if (ch < 0x10000) {
2117 /* Special case: check for high surrogate */
2118 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2119 Py_UCS4 ch2 = s[i];
2120 /* Check for low surrogate and combine the two to
2121 form a UCS4 value */
2122 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002123 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002124 i++;
2125 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002126 }
Tim Peters602f7402002-04-27 18:03:26 +00002127 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002128 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002129 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002130 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2131 *p++ = (char)(0x80 | (ch & 0x3f));
2132 continue;
2133 }
2134encodeUCS4:
2135 /* Encode UCS4 Unicode ordinals */
2136 *p++ = (char)(0xf0 | (ch >> 18));
2137 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2138 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2139 *p++ = (char)(0x80 | (ch & 0x3f));
2140 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002141 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002142
Guido van Rossum98297ee2007-11-06 21:34:58 +00002143 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002144 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002145 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002146 assert(nneeded <= nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002147 result = PyString_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002148 }
2149 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002150 /* Cut back to size actually needed. */
Guido van Rossum98297ee2007-11-06 21:34:58 +00002151 nneeded = p - PyString_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002152 assert(nneeded <= nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002153 _PyString_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002154 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002155 return result;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002156
Tim Peters602f7402002-04-27 18:03:26 +00002157#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002158}
2159
Guido van Rossumd57fd912000-03-10 22:53:23 +00002160PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2161{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002162 if (!PyUnicode_Check(unicode)) {
2163 PyErr_BadArgument();
2164 return NULL;
2165 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002166 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2167 PyUnicode_GET_SIZE(unicode),
2168 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002169}
2170
Walter Dörwald41980ca2007-08-16 21:55:45 +00002171/* --- UTF-32 Codec ------------------------------------------------------- */
2172
2173PyObject *
2174PyUnicode_DecodeUTF32(const char *s,
2175 Py_ssize_t size,
2176 const char *errors,
2177 int *byteorder)
2178{
2179 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2180}
2181
2182PyObject *
2183PyUnicode_DecodeUTF32Stateful(const char *s,
2184 Py_ssize_t size,
2185 const char *errors,
2186 int *byteorder,
2187 Py_ssize_t *consumed)
2188{
2189 const char *starts = s;
2190 Py_ssize_t startinpos;
2191 Py_ssize_t endinpos;
2192 Py_ssize_t outpos;
2193 PyUnicodeObject *unicode;
2194 Py_UNICODE *p;
2195#ifndef Py_UNICODE_WIDE
2196 int i, pairs;
2197#else
2198 const int pairs = 0;
2199#endif
2200 const unsigned char *q, *e;
2201 int bo = 0; /* assume native ordering by default */
2202 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002203 /* Offsets from q for retrieving bytes in the right order. */
2204#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2205 int iorder[] = {0, 1, 2, 3};
2206#else
2207 int iorder[] = {3, 2, 1, 0};
2208#endif
2209 PyObject *errorHandler = NULL;
2210 PyObject *exc = NULL;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002211 /* On narrow builds we split characters outside the BMP into two
2212 codepoints => count how much extra space we need. */
2213#ifndef Py_UNICODE_WIDE
2214 for (i = pairs = 0; i < size/4; i++)
2215 if (((Py_UCS4 *)s)[i] >= 0x10000)
2216 pairs++;
2217#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00002218
2219 /* This might be one to much, because of a BOM */
2220 unicode = _PyUnicode_New((size+3)/4+pairs);
2221 if (!unicode)
2222 return NULL;
2223 if (size == 0)
2224 return (PyObject *)unicode;
2225
2226 /* Unpack UTF-32 encoded data */
2227 p = unicode->str;
2228 q = (unsigned char *)s;
2229 e = q + size;
2230
2231 if (byteorder)
2232 bo = *byteorder;
2233
2234 /* Check for BOM marks (U+FEFF) in the input and adjust current
2235 byte order setting accordingly. In native mode, the leading BOM
2236 mark is skipped, in all other modes, it is copied to the output
2237 stream as-is (giving a ZWNBSP character). */
2238 if (bo == 0) {
2239 if (size >= 4) {
2240 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2241 (q[iorder[1]] << 8) | q[iorder[0]];
2242#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2243 if (bom == 0x0000FEFF) {
2244 q += 4;
2245 bo = -1;
2246 }
2247 else if (bom == 0xFFFE0000) {
2248 q += 4;
2249 bo = 1;
2250 }
2251#else
2252 if (bom == 0x0000FEFF) {
2253 q += 4;
2254 bo = 1;
2255 }
2256 else if (bom == 0xFFFE0000) {
2257 q += 4;
2258 bo = -1;
2259 }
2260#endif
2261 }
2262 }
2263
2264 if (bo == -1) {
2265 /* force LE */
2266 iorder[0] = 0;
2267 iorder[1] = 1;
2268 iorder[2] = 2;
2269 iorder[3] = 3;
2270 }
2271 else if (bo == 1) {
2272 /* force BE */
2273 iorder[0] = 3;
2274 iorder[1] = 2;
2275 iorder[2] = 1;
2276 iorder[3] = 0;
2277 }
2278
2279 while (q < e) {
2280 Py_UCS4 ch;
2281 /* remaining bytes at the end? (size should be divisible by 4) */
2282 if (e-q<4) {
2283 if (consumed)
2284 break;
2285 errmsg = "truncated data";
2286 startinpos = ((const char *)q)-starts;
2287 endinpos = ((const char *)e)-starts;
2288 goto utf32Error;
2289 /* The remaining input chars are ignored if the callback
2290 chooses to skip the input */
2291 }
2292 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2293 (q[iorder[1]] << 8) | q[iorder[0]];
2294
2295 if (ch >= 0x110000)
2296 {
2297 errmsg = "codepoint not in range(0x110000)";
2298 startinpos = ((const char *)q)-starts;
2299 endinpos = startinpos+4;
2300 goto utf32Error;
2301 }
2302#ifndef Py_UNICODE_WIDE
2303 if (ch >= 0x10000)
2304 {
2305 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2306 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2307 }
2308 else
2309#endif
2310 *p++ = ch;
2311 q += 4;
2312 continue;
2313 utf32Error:
2314 outpos = p-PyUnicode_AS_UNICODE(unicode);
2315 if (unicode_decode_call_errorhandler(
2316 errors, &errorHandler,
2317 "utf32", errmsg,
2318 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2319 (PyObject **)&unicode, &outpos, &p))
2320 goto onError;
2321 }
2322
2323 if (byteorder)
2324 *byteorder = bo;
2325
2326 if (consumed)
2327 *consumed = (const char *)q-starts;
2328
2329 /* Adjust length */
2330 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2331 goto onError;
2332
2333 Py_XDECREF(errorHandler);
2334 Py_XDECREF(exc);
2335 return (PyObject *)unicode;
2336
2337onError:
2338 Py_DECREF(unicode);
2339 Py_XDECREF(errorHandler);
2340 Py_XDECREF(exc);
2341 return NULL;
2342}
2343
2344PyObject *
2345PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2346 Py_ssize_t size,
2347 const char *errors,
2348 int byteorder)
2349{
Guido van Rossum98297ee2007-11-06 21:34:58 +00002350 PyObject *v, *result;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002351 unsigned char *p;
2352#ifndef Py_UNICODE_WIDE
2353 int i, pairs;
2354#else
2355 const int pairs = 0;
2356#endif
2357 /* Offsets from p for storing byte pairs in the right order. */
2358#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2359 int iorder[] = {0, 1, 2, 3};
2360#else
2361 int iorder[] = {3, 2, 1, 0};
2362#endif
2363
2364#define STORECHAR(CH) \
2365 do { \
2366 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2367 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2368 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2369 p[iorder[0]] = (CH) & 0xff; \
2370 p += 4; \
2371 } while(0)
2372
2373 /* In narrow builds we can output surrogate pairs as one codepoint,
2374 so we need less space. */
2375#ifndef Py_UNICODE_WIDE
2376 for (i = pairs = 0; i < size-1; i++)
2377 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2378 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2379 pairs++;
2380#endif
2381 v = PyBytes_FromStringAndSize(NULL,
2382 4 * (size - pairs + (byteorder == 0)));
2383 if (v == NULL)
2384 return NULL;
2385
2386 p = (unsigned char *)PyBytes_AS_STRING(v);
2387 if (byteorder == 0)
2388 STORECHAR(0xFEFF);
2389 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002390 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002391
2392 if (byteorder == -1) {
2393 /* force LE */
2394 iorder[0] = 0;
2395 iorder[1] = 1;
2396 iorder[2] = 2;
2397 iorder[3] = 3;
2398 }
2399 else if (byteorder == 1) {
2400 /* force BE */
2401 iorder[0] = 3;
2402 iorder[1] = 2;
2403 iorder[2] = 1;
2404 iorder[3] = 0;
2405 }
2406
2407 while (size-- > 0) {
2408 Py_UCS4 ch = *s++;
2409#ifndef Py_UNICODE_WIDE
2410 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2411 Py_UCS4 ch2 = *s;
2412 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2413 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2414 s++;
2415 size--;
2416 }
2417 }
2418#endif
2419 STORECHAR(ch);
2420 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002421
2422 done:
Christian Heimes90aa7642007-12-19 02:45:37 +00002423 result = PyString_FromStringAndSize(PyBytes_AS_STRING(v), Py_SIZE(v));
Guido van Rossum98297ee2007-11-06 21:34:58 +00002424 Py_DECREF(v);
2425 return result;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002426#undef STORECHAR
2427}
2428
2429PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2430{
2431 if (!PyUnicode_Check(unicode)) {
2432 PyErr_BadArgument();
2433 return NULL;
2434 }
2435 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2436 PyUnicode_GET_SIZE(unicode),
2437 NULL,
2438 0);
2439}
2440
Guido van Rossumd57fd912000-03-10 22:53:23 +00002441/* --- UTF-16 Codec ------------------------------------------------------- */
2442
Tim Peters772747b2001-08-09 22:21:55 +00002443PyObject *
2444PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002445 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002446 const char *errors,
2447 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002448{
Walter Dörwald69652032004-09-07 20:24:22 +00002449 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2450}
2451
2452PyObject *
2453PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002454 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002455 const char *errors,
2456 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002457 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002458{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002459 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002460 Py_ssize_t startinpos;
2461 Py_ssize_t endinpos;
2462 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002463 PyUnicodeObject *unicode;
2464 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002465 const unsigned char *q, *e;
2466 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002467 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002468 /* Offsets from q for retrieving byte pairs in the right order. */
2469#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2470 int ihi = 1, ilo = 0;
2471#else
2472 int ihi = 0, ilo = 1;
2473#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002474 PyObject *errorHandler = NULL;
2475 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002476
2477 /* Note: size will always be longer than the resulting Unicode
2478 character count */
2479 unicode = _PyUnicode_New(size);
2480 if (!unicode)
2481 return NULL;
2482 if (size == 0)
2483 return (PyObject *)unicode;
2484
2485 /* Unpack UTF-16 encoded data */
2486 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002487 q = (unsigned char *)s;
2488 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002489
2490 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002491 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002492
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002493 /* Check for BOM marks (U+FEFF) in the input and adjust current
2494 byte order setting accordingly. In native mode, the leading BOM
2495 mark is skipped, in all other modes, it is copied to the output
2496 stream as-is (giving a ZWNBSP character). */
2497 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002498 if (size >= 2) {
2499 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002500#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002501 if (bom == 0xFEFF) {
2502 q += 2;
2503 bo = -1;
2504 }
2505 else if (bom == 0xFFFE) {
2506 q += 2;
2507 bo = 1;
2508 }
Tim Petersced69f82003-09-16 20:30:58 +00002509#else
Walter Dörwald69652032004-09-07 20:24:22 +00002510 if (bom == 0xFEFF) {
2511 q += 2;
2512 bo = 1;
2513 }
2514 else if (bom == 0xFFFE) {
2515 q += 2;
2516 bo = -1;
2517 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002518#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002519 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002520 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002521
Tim Peters772747b2001-08-09 22:21:55 +00002522 if (bo == -1) {
2523 /* force LE */
2524 ihi = 1;
2525 ilo = 0;
2526 }
2527 else if (bo == 1) {
2528 /* force BE */
2529 ihi = 0;
2530 ilo = 1;
2531 }
2532
2533 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002534 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002535 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002536 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002537 if (consumed)
2538 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002539 errmsg = "truncated data";
2540 startinpos = ((const char *)q)-starts;
2541 endinpos = ((const char *)e)-starts;
2542 goto utf16Error;
2543 /* The remaining input chars are ignored if the callback
2544 chooses to skip the input */
2545 }
2546 ch = (q[ihi] << 8) | q[ilo];
2547
Tim Peters772747b2001-08-09 22:21:55 +00002548 q += 2;
2549
Guido van Rossumd57fd912000-03-10 22:53:23 +00002550 if (ch < 0xD800 || ch > 0xDFFF) {
2551 *p++ = ch;
2552 continue;
2553 }
2554
2555 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002556 if (q >= e) {
2557 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002558 startinpos = (((const char *)q)-2)-starts;
2559 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002560 goto utf16Error;
2561 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002562 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002563 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2564 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002565 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002566#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002567 *p++ = ch;
2568 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002569#else
2570 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002571#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002572 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002573 }
2574 else {
2575 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002576 startinpos = (((const char *)q)-4)-starts;
2577 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002578 goto utf16Error;
2579 }
2580
Guido van Rossumd57fd912000-03-10 22:53:23 +00002581 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002582 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002583 startinpos = (((const char *)q)-2)-starts;
2584 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002585 /* Fall through to report the error */
2586
2587 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002588 outpos = p-PyUnicode_AS_UNICODE(unicode);
2589 if (unicode_decode_call_errorhandler(
2590 errors, &errorHandler,
2591 "utf16", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002592 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002593 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002594 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002595 }
2596
2597 if (byteorder)
2598 *byteorder = bo;
2599
Walter Dörwald69652032004-09-07 20:24:22 +00002600 if (consumed)
2601 *consumed = (const char *)q-starts;
2602
Guido van Rossumd57fd912000-03-10 22:53:23 +00002603 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002604 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002605 goto onError;
2606
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002607 Py_XDECREF(errorHandler);
2608 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002609 return (PyObject *)unicode;
2610
2611onError:
2612 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002613 Py_XDECREF(errorHandler);
2614 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002615 return NULL;
2616}
2617
Tim Peters772747b2001-08-09 22:21:55 +00002618PyObject *
2619PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002620 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002621 const char *errors,
2622 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002623{
Guido van Rossum98297ee2007-11-06 21:34:58 +00002624 PyObject *v, *result;
Tim Peters772747b2001-08-09 22:21:55 +00002625 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002626#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00002627 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002628#else
2629 const int pairs = 0;
2630#endif
Tim Peters772747b2001-08-09 22:21:55 +00002631 /* Offsets from p for storing byte pairs in the right order. */
2632#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2633 int ihi = 1, ilo = 0;
2634#else
2635 int ihi = 0, ilo = 1;
2636#endif
2637
2638#define STORECHAR(CH) \
2639 do { \
2640 p[ihi] = ((CH) >> 8) & 0xff; \
2641 p[ilo] = (CH) & 0xff; \
2642 p += 2; \
2643 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002644
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002645#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002646 for (i = pairs = 0; i < size; i++)
2647 if (s[i] >= 0x10000)
2648 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002649#endif
Walter Dörwald3cc34522007-05-04 10:48:27 +00002650 v = PyBytes_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002651 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002652 if (v == NULL)
2653 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002654
Walter Dörwald3cc34522007-05-04 10:48:27 +00002655 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002656 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002657 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002658 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002659 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00002660
2661 if (byteorder == -1) {
2662 /* force LE */
2663 ihi = 1;
2664 ilo = 0;
2665 }
2666 else if (byteorder == 1) {
2667 /* force BE */
2668 ihi = 0;
2669 ilo = 1;
2670 }
2671
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002672 while (size-- > 0) {
2673 Py_UNICODE ch = *s++;
2674 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002675#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002676 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002677 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2678 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002679 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002680#endif
Tim Peters772747b2001-08-09 22:21:55 +00002681 STORECHAR(ch);
2682 if (ch2)
2683 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002684 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002685
2686 done:
Christian Heimes90aa7642007-12-19 02:45:37 +00002687 result = PyString_FromStringAndSize(PyBytes_AS_STRING(v), Py_SIZE(v));
Guido van Rossum98297ee2007-11-06 21:34:58 +00002688 Py_DECREF(v);
2689 return result;
Tim Peters772747b2001-08-09 22:21:55 +00002690#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002691}
2692
2693PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2694{
2695 if (!PyUnicode_Check(unicode)) {
2696 PyErr_BadArgument();
2697 return NULL;
2698 }
2699 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2700 PyUnicode_GET_SIZE(unicode),
2701 NULL,
2702 0);
2703}
2704
2705/* --- Unicode Escape Codec ----------------------------------------------- */
2706
Fredrik Lundh06d12682001-01-24 07:59:11 +00002707static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002708
Guido van Rossumd57fd912000-03-10 22:53:23 +00002709PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002710 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002711 const char *errors)
2712{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002713 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002714 Py_ssize_t startinpos;
2715 Py_ssize_t endinpos;
2716 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002717 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002718 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002719 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002720 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002721 char* message;
2722 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002723 PyObject *errorHandler = NULL;
2724 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002725
Guido van Rossumd57fd912000-03-10 22:53:23 +00002726 /* Escaped strings will always be longer than the resulting
2727 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002728 length after conversion to the true value.
2729 (but if the error callback returns a long replacement string
2730 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002731 v = _PyUnicode_New(size);
2732 if (v == NULL)
2733 goto onError;
2734 if (size == 0)
2735 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002736
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002737 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002738 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002739
Guido van Rossumd57fd912000-03-10 22:53:23 +00002740 while (s < end) {
2741 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002742 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002743 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002744
2745 /* Non-escape characters are interpreted as Unicode ordinals */
2746 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002747 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002748 continue;
2749 }
2750
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002751 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002752 /* \ - Escapes */
2753 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002754 c = *s++;
2755 if (s > end)
2756 c = '\0'; /* Invalid after \ */
2757 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002758
2759 /* \x escapes */
2760 case '\n': break;
2761 case '\\': *p++ = '\\'; break;
2762 case '\'': *p++ = '\''; break;
2763 case '\"': *p++ = '\"'; break;
2764 case 'b': *p++ = '\b'; break;
2765 case 'f': *p++ = '\014'; break; /* FF */
2766 case 't': *p++ = '\t'; break;
2767 case 'n': *p++ = '\n'; break;
2768 case 'r': *p++ = '\r'; break;
2769 case 'v': *p++ = '\013'; break; /* VT */
2770 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2771
2772 /* \OOO (octal) escapes */
2773 case '0': case '1': case '2': case '3':
2774 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002775 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002776 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002777 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002778 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002779 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002780 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002781 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002782 break;
2783
Fredrik Lundhccc74732001-02-18 22:13:49 +00002784 /* hex escapes */
2785 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002786 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002787 digits = 2;
2788 message = "truncated \\xXX escape";
2789 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002790
Fredrik Lundhccc74732001-02-18 22:13:49 +00002791 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002792 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002793 digits = 4;
2794 message = "truncated \\uXXXX escape";
2795 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002796
Fredrik Lundhccc74732001-02-18 22:13:49 +00002797 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002798 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002799 digits = 8;
2800 message = "truncated \\UXXXXXXXX escape";
2801 hexescape:
2802 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002803 outpos = p-PyUnicode_AS_UNICODE(v);
2804 if (s+digits>end) {
2805 endinpos = size;
2806 if (unicode_decode_call_errorhandler(
2807 errors, &errorHandler,
2808 "unicodeescape", "end of string in escape sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002809 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002810 (PyObject **)&v, &outpos, &p))
2811 goto onError;
2812 goto nextByte;
2813 }
2814 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002815 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00002816 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002817 endinpos = (s+i+1)-starts;
2818 if (unicode_decode_call_errorhandler(
2819 errors, &errorHandler,
2820 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002821 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002822 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002823 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002824 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002825 }
2826 chr = (chr<<4) & ~0xF;
2827 if (c >= '0' && c <= '9')
2828 chr += c - '0';
2829 else if (c >= 'a' && c <= 'f')
2830 chr += 10 + c - 'a';
2831 else
2832 chr += 10 + c - 'A';
2833 }
2834 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002835 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002836 /* _decoding_error will have already written into the
2837 target buffer. */
2838 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002839 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002840 /* when we get here, chr is a 32-bit unicode character */
2841 if (chr <= 0xffff)
2842 /* UCS-2 character */
2843 *p++ = (Py_UNICODE) chr;
2844 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002845 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002846 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002847#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002848 *p++ = chr;
2849#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002850 chr -= 0x10000L;
2851 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002852 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002853#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002854 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002855 endinpos = s-starts;
2856 outpos = p-PyUnicode_AS_UNICODE(v);
2857 if (unicode_decode_call_errorhandler(
2858 errors, &errorHandler,
2859 "unicodeescape", "illegal Unicode character",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002860 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002861 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002862 goto onError;
2863 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002864 break;
2865
2866 /* \N{name} */
2867 case 'N':
2868 message = "malformed \\N character escape";
2869 if (ucnhash_CAPI == NULL) {
2870 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002871 PyObject *m, *api;
Christian Heimes072c0f12008-01-03 23:01:04 +00002872 m = PyImport_ImportModuleNoBlock("unicodedata");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002873 if (m == NULL)
2874 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002875 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002876 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002877 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002878 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002879 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002880 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002881 if (ucnhash_CAPI == NULL)
2882 goto ucnhashError;
2883 }
2884 if (*s == '{') {
2885 const char *start = s+1;
2886 /* look for the closing brace */
2887 while (*s != '}' && s < end)
2888 s++;
2889 if (s > start && s < end && *s == '}') {
2890 /* found a name. look it up in the unicode database */
2891 message = "unknown Unicode character name";
2892 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002893 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002894 goto store;
2895 }
2896 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002897 endinpos = s-starts;
2898 outpos = p-PyUnicode_AS_UNICODE(v);
2899 if (unicode_decode_call_errorhandler(
2900 errors, &errorHandler,
2901 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002902 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002903 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002904 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002905 break;
2906
2907 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002908 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002909 message = "\\ at end of string";
2910 s--;
2911 endinpos = s-starts;
2912 outpos = p-PyUnicode_AS_UNICODE(v);
2913 if (unicode_decode_call_errorhandler(
2914 errors, &errorHandler,
2915 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002916 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002917 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002918 goto onError;
2919 }
2920 else {
2921 *p++ = '\\';
2922 *p++ = (unsigned char)s[-1];
2923 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002924 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002925 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002926 nextByte:
2927 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002928 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002929 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002930 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002931 Py_XDECREF(errorHandler);
2932 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002933 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002934
Fredrik Lundhccc74732001-02-18 22:13:49 +00002935ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002936 PyErr_SetString(
2937 PyExc_UnicodeError,
2938 "\\N escapes not supported (can't load unicodedata module)"
2939 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002940 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002941 Py_XDECREF(errorHandler);
2942 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002943 return NULL;
2944
Fredrik Lundhccc74732001-02-18 22:13:49 +00002945onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002946 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002947 Py_XDECREF(errorHandler);
2948 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002949 return NULL;
2950}
2951
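/* Return a Unicode-Escape string version of the Unicode object.

   If quotes is true, the string is enclosed in u"" or u'' quotes as
   appropriate.

   NOTE(review): this comment seems to describe
   PyUnicode_EncodeUnicodeEscape() further below rather than findchar(),
   and that function takes no "quotes" argument, so the sentence about
   quoting looks stale -- confirm and update.

*/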
2958
Thomas Wouters477c8d52006-05-27 19:21:47 +00002959Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2960 Py_ssize_t size,
2961 Py_UNICODE ch)
2962{
2963 /* like wcschr, but doesn't stop at NULL characters */
2964
2965 while (size-- > 0) {
2966 if (*s == ch)
2967 return s;
2968 s++;
2969 }
2970
2971 return NULL;
2972}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002973
/* Lowercase hexadecimal digit characters, indexed by nibble value
   (0..15); used when writing hex escape sequences. */
static const char *hexdigits = "0123456789abcdef";
2975
2976PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2977 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002978{
Guido van Rossum98297ee2007-11-06 21:34:58 +00002979 PyObject *repr, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002980 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002981
Thomas Wouters89f507f2006-12-13 04:49:30 +00002982 /* XXX(nnorwitz): rather than over-allocating, it would be
2983 better to choose a different scheme. Perhaps scan the
2984 first N-chars of the string and allocate based on that size.
2985 */
2986 /* Initial allocation is based on the longest-possible unichr
2987 escape.
2988
2989 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2990 unichr, so in this case it's the longest unichr escape. In
2991 narrow (UTF-16) builds this is five chars per source unichr
2992 since there are two unichrs in the surrogate pair, so in narrow
2993 (UTF-16) builds it's not the longest unichr escape.
2994
2995 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2996 so in the narrow (UTF-16) build case it's the longest unichr
2997 escape.
2998 */
2999
Walter Dörwald79e913e2007-05-12 11:08:06 +00003000 repr = PyBytes_FromStringAndSize(NULL,
Thomas Wouters89f507f2006-12-13 04:49:30 +00003001#ifdef Py_UNICODE_WIDE
3002 + 10*size
3003#else
3004 + 6*size
3005#endif
3006 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003007 if (repr == NULL)
3008 return NULL;
3009
Walter Dörwald79e913e2007-05-12 11:08:06 +00003010 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003011
Guido van Rossumd57fd912000-03-10 22:53:23 +00003012 while (size-- > 0) {
3013 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003014
Walter Dörwald79e913e2007-05-12 11:08:06 +00003015 /* Escape backslashes */
3016 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003017 *p++ = '\\';
3018 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003019 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003020 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003021
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003022#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003023 /* Map 21-bit characters to '\U00xxxxxx' */
3024 else if (ch >= 0x10000) {
3025 *p++ = '\\';
3026 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003027 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3028 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3029 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3030 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3031 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3032 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3033 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3034 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003035 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003036 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003037#else
3038 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003039 else if (ch >= 0xD800 && ch < 0xDC00) {
3040 Py_UNICODE ch2;
3041 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003042
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003043 ch2 = *s++;
3044 size--;
3045 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3046 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3047 *p++ = '\\';
3048 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003049 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3050 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3051 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3052 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3053 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3054 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3055 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3056 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003057 continue;
3058 }
3059 /* Fall through: isolated surrogates are copied as-is */
3060 s--;
3061 size++;
3062 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003063#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003064
Guido van Rossumd57fd912000-03-10 22:53:23 +00003065 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003066 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003067 *p++ = '\\';
3068 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003069 *p++ = hexdigits[(ch >> 12) & 0x000F];
3070 *p++ = hexdigits[(ch >> 8) & 0x000F];
3071 *p++ = hexdigits[(ch >> 4) & 0x000F];
3072 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003073 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003074
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003075 /* Map special whitespace to '\t', \n', '\r' */
3076 else if (ch == '\t') {
3077 *p++ = '\\';
3078 *p++ = 't';
3079 }
3080 else if (ch == '\n') {
3081 *p++ = '\\';
3082 *p++ = 'n';
3083 }
3084 else if (ch == '\r') {
3085 *p++ = '\\';
3086 *p++ = 'r';
3087 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003088
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003089 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003090 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003091 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003092 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003093 *p++ = hexdigits[(ch >> 4) & 0x000F];
3094 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003095 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003096
Guido van Rossumd57fd912000-03-10 22:53:23 +00003097 /* Copy everything else as-is */
3098 else
3099 *p++ = (char) ch;
3100 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003101
Guido van Rossum98297ee2007-11-06 21:34:58 +00003102 result = PyString_FromStringAndSize(PyBytes_AS_STRING(repr),
3103 p - PyBytes_AS_STRING(repr));
3104 Py_DECREF(repr);
3105 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003106}
3107
Guido van Rossumd57fd912000-03-10 22:53:23 +00003108PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3109{
Walter Dörwald79e913e2007-05-12 11:08:06 +00003110 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003111 if (!PyUnicode_Check(unicode)) {
3112 PyErr_BadArgument();
3113 return NULL;
3114 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003115 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3116 PyUnicode_GET_SIZE(unicode));
3117
3118 if (!s)
3119 return NULL;
3120 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
3121 PyBytes_GET_SIZE(s));
3122 Py_DECREF(s);
3123 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003124}
3125
3126/* --- Raw Unicode Escape Codec ------------------------------------------- */
3127
3128PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003129 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003130 const char *errors)
3131{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003132 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003133 Py_ssize_t startinpos;
3134 Py_ssize_t endinpos;
3135 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003136 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003137 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003138 const char *end;
3139 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003140 PyObject *errorHandler = NULL;
3141 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003142
Guido van Rossumd57fd912000-03-10 22:53:23 +00003143 /* Escaped strings will always be longer than the resulting
3144 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003145 length after conversion to the true value. (But decoding error
3146 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003147 v = _PyUnicode_New(size);
3148 if (v == NULL)
3149 goto onError;
3150 if (size == 0)
3151 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003152 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003153 end = s + size;
3154 while (s < end) {
3155 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00003156 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003157 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003158 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003159
3160 /* Non-escape characters are interpreted as Unicode ordinals */
3161 if (*s != '\\') {
3162 *p++ = (unsigned char)*s++;
3163 continue;
3164 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003165 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003166
3167 /* \u-escapes are only interpreted iff the number of leading
3168 backslashes if odd */
3169 bs = s;
3170 for (;s < end;) {
3171 if (*s != '\\')
3172 break;
3173 *p++ = (unsigned char)*s++;
3174 }
3175 if (((s - bs) & 1) == 0 ||
3176 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003177 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003178 continue;
3179 }
3180 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003181 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003182 s++;
3183
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003184 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003185 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003186 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003187 c = (unsigned char)*s;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003188 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003189 endinpos = s-starts;
3190 if (unicode_decode_call_errorhandler(
3191 errors, &errorHandler,
3192 "rawunicodeescape", "truncated \\uXXXX",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003193 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003194 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003195 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003196 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003197 }
3198 x = (x<<4) & ~0xF;
3199 if (c >= '0' && c <= '9')
3200 x += c - '0';
3201 else if (c >= 'a' && c <= 'f')
3202 x += 10 + c - 'a';
3203 else
3204 x += 10 + c - 'A';
3205 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00003206 if (x <= 0xffff)
3207 /* UCS-2 character */
3208 *p++ = (Py_UNICODE) x;
3209 else if (x <= 0x10ffff) {
3210 /* UCS-4 character. Either store directly, or as
3211 surrogate pair. */
3212#ifdef Py_UNICODE_WIDE
Christian Heimescc47b052008-03-25 14:56:36 +00003213 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003214#else
3215 x -= 0x10000L;
3216 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3217 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3218#endif
3219 } else {
3220 endinpos = s-starts;
3221 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003222 if (unicode_decode_call_errorhandler(
3223 errors, &errorHandler,
3224 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003225 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003226 (PyObject **)&v, &outpos, &p))
3227 goto onError;
3228 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003229 nextByte:
3230 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003231 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003232 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003233 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003234 Py_XDECREF(errorHandler);
3235 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003236 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003237
Guido van Rossumd57fd912000-03-10 22:53:23 +00003238 onError:
3239 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003240 Py_XDECREF(errorHandler);
3241 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003242 return NULL;
3243}
3244
3245PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003246 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003247{
Guido van Rossum98297ee2007-11-06 21:34:58 +00003248 PyObject *repr, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003249 char *p;
3250 char *q;
3251
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003252#ifdef Py_UNICODE_WIDE
Walter Dörwald711005d2007-05-12 12:03:26 +00003253 repr = PyBytes_FromStringAndSize(NULL, 10 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003254#else
Walter Dörwald711005d2007-05-12 12:03:26 +00003255 repr = PyBytes_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003256#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003257 if (repr == NULL)
3258 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003259 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003260 goto done;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003261
Walter Dörwald711005d2007-05-12 12:03:26 +00003262 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003263 while (size-- > 0) {
3264 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003265#ifdef Py_UNICODE_WIDE
3266 /* Map 32-bit characters to '\Uxxxxxxxx' */
3267 if (ch >= 0x10000) {
3268 *p++ = '\\';
3269 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003270 *p++ = hexdigits[(ch >> 28) & 0xf];
3271 *p++ = hexdigits[(ch >> 24) & 0xf];
3272 *p++ = hexdigits[(ch >> 20) & 0xf];
3273 *p++ = hexdigits[(ch >> 16) & 0xf];
3274 *p++ = hexdigits[(ch >> 12) & 0xf];
3275 *p++ = hexdigits[(ch >> 8) & 0xf];
3276 *p++ = hexdigits[(ch >> 4) & 0xf];
3277 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003278 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003279 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00003280#else
3281 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3282 if (ch >= 0xD800 && ch < 0xDC00) {
3283 Py_UNICODE ch2;
3284 Py_UCS4 ucs;
3285
3286 ch2 = *s++;
3287 size--;
3288 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3289 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3290 *p++ = '\\';
3291 *p++ = 'U';
3292 *p++ = hexdigits[(ucs >> 28) & 0xf];
3293 *p++ = hexdigits[(ucs >> 24) & 0xf];
3294 *p++ = hexdigits[(ucs >> 20) & 0xf];
3295 *p++ = hexdigits[(ucs >> 16) & 0xf];
3296 *p++ = hexdigits[(ucs >> 12) & 0xf];
3297 *p++ = hexdigits[(ucs >> 8) & 0xf];
3298 *p++ = hexdigits[(ucs >> 4) & 0xf];
3299 *p++ = hexdigits[ucs & 0xf];
3300 continue;
3301 }
3302 /* Fall through: isolated surrogates are copied as-is */
3303 s--;
3304 size++;
3305 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003306#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003307 /* Map 16-bit characters to '\uxxxx' */
3308 if (ch >= 256) {
3309 *p++ = '\\';
3310 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003311 *p++ = hexdigits[(ch >> 12) & 0xf];
3312 *p++ = hexdigits[(ch >> 8) & 0xf];
3313 *p++ = hexdigits[(ch >> 4) & 0xf];
3314 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003315 }
3316 /* Copy everything else as-is */
3317 else
3318 *p++ = (char) ch;
3319 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003320 size = p - q;
3321
3322 done:
3323 result = PyString_FromStringAndSize(PyBytes_AS_STRING(repr), size);
3324 Py_DECREF(repr);
3325 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003326}
3327
3328PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3329{
Walter Dörwald711005d2007-05-12 12:03:26 +00003330 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003331 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003332 PyErr_BadArgument();
3333 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003334 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003335 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3336 PyUnicode_GET_SIZE(unicode));
3337
3338 if (!s)
3339 return NULL;
3340 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
3341 PyBytes_GET_SIZE(s));
3342 Py_DECREF(s);
3343 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003344}
3345
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003346/* --- Unicode Internal Codec ------------------------------------------- */
3347
3348PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003349 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003350 const char *errors)
3351{
3352 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003353 Py_ssize_t startinpos;
3354 Py_ssize_t endinpos;
3355 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003356 PyUnicodeObject *v;
3357 Py_UNICODE *p;
3358 const char *end;
3359 const char *reason;
3360 PyObject *errorHandler = NULL;
3361 PyObject *exc = NULL;
3362
Neal Norwitzd43069c2006-01-08 01:12:10 +00003363#ifdef Py_UNICODE_WIDE
3364 Py_UNICODE unimax = PyUnicode_GetMax();
3365#endif
3366
Thomas Wouters89f507f2006-12-13 04:49:30 +00003367 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003368 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3369 if (v == NULL)
3370 goto onError;
3371 if (PyUnicode_GetSize((PyObject *)v) == 0)
3372 return (PyObject *)v;
3373 p = PyUnicode_AS_UNICODE(v);
3374 end = s + size;
3375
3376 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00003377 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003378 /* We have to sanity check the raw data, otherwise doom looms for
3379 some malformed UCS-4 data. */
3380 if (
3381 #ifdef Py_UNICODE_WIDE
3382 *p > unimax || *p < 0 ||
3383 #endif
3384 end-s < Py_UNICODE_SIZE
3385 )
3386 {
3387 startinpos = s - starts;
3388 if (end-s < Py_UNICODE_SIZE) {
3389 endinpos = end-starts;
3390 reason = "truncated input";
3391 }
3392 else {
3393 endinpos = s - starts + Py_UNICODE_SIZE;
3394 reason = "illegal code point (> 0x10FFFF)";
3395 }
3396 outpos = p - PyUnicode_AS_UNICODE(v);
3397 if (unicode_decode_call_errorhandler(
3398 errors, &errorHandler,
3399 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003400 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003401 (PyObject **)&v, &outpos, &p)) {
3402 goto onError;
3403 }
3404 }
3405 else {
3406 p++;
3407 s += Py_UNICODE_SIZE;
3408 }
3409 }
3410
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003411 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003412 goto onError;
3413 Py_XDECREF(errorHandler);
3414 Py_XDECREF(exc);
3415 return (PyObject *)v;
3416
3417 onError:
3418 Py_XDECREF(v);
3419 Py_XDECREF(errorHandler);
3420 Py_XDECREF(exc);
3421 return NULL;
3422}
3423
Guido van Rossumd57fd912000-03-10 22:53:23 +00003424/* --- Latin-1 Codec ------------------------------------------------------ */
3425
3426PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003427 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003428 const char *errors)
3429{
3430 PyUnicodeObject *v;
3431 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003432
Guido van Rossumd57fd912000-03-10 22:53:23 +00003433 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003434 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003435 Py_UNICODE r = *(unsigned char*)s;
3436 return PyUnicode_FromUnicode(&r, 1);
3437 }
3438
Guido van Rossumd57fd912000-03-10 22:53:23 +00003439 v = _PyUnicode_New(size);
3440 if (v == NULL)
3441 goto onError;
3442 if (size == 0)
3443 return (PyObject *)v;
3444 p = PyUnicode_AS_UNICODE(v);
3445 while (size-- > 0)
3446 *p++ = (unsigned char)*s++;
3447 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003448
Guido van Rossumd57fd912000-03-10 22:53:23 +00003449 onError:
3450 Py_XDECREF(v);
3451 return NULL;
3452}
3453
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003454/* create or adjust a UnicodeEncodeError */
3455static void make_encode_exception(PyObject **exceptionObject,
3456 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003457 const Py_UNICODE *unicode, Py_ssize_t size,
3458 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003459 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003460{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003461 if (*exceptionObject == NULL) {
3462 *exceptionObject = PyUnicodeEncodeError_Create(
3463 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003464 }
3465 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003466 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3467 goto onError;
3468 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3469 goto onError;
3470 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3471 goto onError;
3472 return;
3473 onError:
3474 Py_DECREF(*exceptionObject);
3475 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003476 }
3477}
3478
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003479/* raises a UnicodeEncodeError */
3480static void raise_encode_exception(PyObject **exceptionObject,
3481 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003482 const Py_UNICODE *unicode, Py_ssize_t size,
3483 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003484 const char *reason)
3485{
3486 make_encode_exception(exceptionObject,
3487 encoding, unicode, size, startpos, endpos, reason);
3488 if (*exceptionObject != NULL)
3489 PyCodec_StrictErrors(*exceptionObject);
3490}
3491
3492/* error handling callback helper:
3493 build arguments, call the callback and check the arguments,
3494 put the result into newpos and return the replacement string, which
3495 has to be freed by the caller */
3496static PyObject *unicode_encode_call_errorhandler(const char *errors,
3497 PyObject **errorHandler,
3498 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003499 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3500 Py_ssize_t startpos, Py_ssize_t endpos,
3501 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003502{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003503 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003504
3505 PyObject *restuple;
3506 PyObject *resunicode;
3507
3508 if (*errorHandler == NULL) {
3509 *errorHandler = PyCodec_LookupError(errors);
3510 if (*errorHandler == NULL)
3511 return NULL;
3512 }
3513
3514 make_encode_exception(exceptionObject,
3515 encoding, unicode, size, startpos, endpos, reason);
3516 if (*exceptionObject == NULL)
3517 return NULL;
3518
3519 restuple = PyObject_CallFunctionObjArgs(
3520 *errorHandler, *exceptionObject, NULL);
3521 if (restuple == NULL)
3522 return NULL;
3523 if (!PyTuple_Check(restuple)) {
3524 PyErr_Format(PyExc_TypeError, &argparse[4]);
3525 Py_DECREF(restuple);
3526 return NULL;
3527 }
3528 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3529 &resunicode, newpos)) {
3530 Py_DECREF(restuple);
3531 return NULL;
3532 }
3533 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003534 *newpos = size+*newpos;
3535 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003536 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003537 Py_DECREF(restuple);
3538 return NULL;
3539 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003540 Py_INCREF(resunicode);
3541 Py_DECREF(restuple);
3542 return resunicode;
3543}
3544
3545static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003546 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003547 const char *errors,
3548 int limit)
3549{
3550 /* output object */
3551 PyObject *res;
3552 /* pointers to the beginning and end+1 of input */
3553 const Py_UNICODE *startp = p;
3554 const Py_UNICODE *endp = p + size;
3555 /* pointer to the beginning of the unencodable characters */
3556 /* const Py_UNICODE *badp = NULL; */
3557 /* pointer into the output */
3558 char *str;
3559 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003560 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003561 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3562 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003563 PyObject *errorHandler = NULL;
3564 PyObject *exc = NULL;
Guido van Rossum98297ee2007-11-06 21:34:58 +00003565 PyObject *result = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003566 /* the following variable is used for caching string comparisons
3567 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3568 int known_errorHandler = -1;
3569
3570 /* allocate enough for a simple encoding without
3571 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00003572 if (size == 0)
3573 return PyString_FromStringAndSize(NULL, 0);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003574 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003575 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003576 return NULL;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003577 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003578 ressize = size;
3579
3580 while (p<endp) {
3581 Py_UNICODE c = *p;
3582
3583 /* can we encode this? */
3584 if (c<limit) {
3585 /* no overflow check, because we know that the space is enough */
3586 *str++ = (char)c;
3587 ++p;
3588 }
3589 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003590 Py_ssize_t unicodepos = p-startp;
3591 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003592 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003593 Py_ssize_t repsize;
3594 Py_ssize_t newpos;
3595 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003596 Py_UNICODE *uni2;
3597 /* startpos for collecting unencodable chars */
3598 const Py_UNICODE *collstart = p;
3599 const Py_UNICODE *collend = p;
3600 /* find all unecodable characters */
3601 while ((collend < endp) && ((*collend)>=limit))
3602 ++collend;
3603 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3604 if (known_errorHandler==-1) {
3605 if ((errors==NULL) || (!strcmp(errors, "strict")))
3606 known_errorHandler = 1;
3607 else if (!strcmp(errors, "replace"))
3608 known_errorHandler = 2;
3609 else if (!strcmp(errors, "ignore"))
3610 known_errorHandler = 3;
3611 else if (!strcmp(errors, "xmlcharrefreplace"))
3612 known_errorHandler = 4;
3613 else
3614 known_errorHandler = 0;
3615 }
3616 switch (known_errorHandler) {
3617 case 1: /* strict */
3618 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3619 goto onError;
3620 case 2: /* replace */
3621 while (collstart++<collend)
3622 *str++ = '?'; /* fall through */
3623 case 3: /* ignore */
3624 p = collend;
3625 break;
3626 case 4: /* xmlcharrefreplace */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003627 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003628 /* determine replacement size (temporarily (mis)uses p) */
3629 for (p = collstart, repsize = 0; p < collend; ++p) {
3630 if (*p<10)
3631 repsize += 2+1+1;
3632 else if (*p<100)
3633 repsize += 2+2+1;
3634 else if (*p<1000)
3635 repsize += 2+3+1;
3636 else if (*p<10000)
3637 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003638#ifndef Py_UNICODE_WIDE
3639 else
3640 repsize += 2+5+1;
3641#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003642 else if (*p<100000)
3643 repsize += 2+5+1;
3644 else if (*p<1000000)
3645 repsize += 2+6+1;
3646 else
3647 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003648#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003649 }
3650 requiredsize = respos+repsize+(endp-collend);
3651 if (requiredsize > ressize) {
3652 if (requiredsize<2*ressize)
3653 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003654 if (PyBytes_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003655 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003656 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003657 ressize = requiredsize;
3658 }
3659 /* generate replacement (temporarily (mis)uses p) */
3660 for (p = collstart; p < collend; ++p) {
3661 str += sprintf(str, "&#%d;", (int)*p);
3662 }
3663 p = collend;
3664 break;
3665 default:
3666 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3667 encoding, reason, startp, size, &exc,
3668 collstart-startp, collend-startp, &newpos);
3669 if (repunicode == NULL)
3670 goto onError;
3671 /* need more space? (at least enough for what we
3672 have+the replacement+the rest of the string, so
3673 we won't have to check space for encodable characters) */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003674 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003675 repsize = PyUnicode_GET_SIZE(repunicode);
3676 requiredsize = respos+repsize+(endp-collend);
3677 if (requiredsize > ressize) {
3678 if (requiredsize<2*ressize)
3679 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003680 if (PyBytes_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003681 Py_DECREF(repunicode);
3682 goto onError;
3683 }
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003684 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003685 ressize = requiredsize;
3686 }
3687 /* check if there is anything unencodable in the replacement
3688 and copy it to the output */
3689 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3690 c = *uni2;
3691 if (c >= limit) {
3692 raise_encode_exception(&exc, encoding, startp, size,
3693 unicodepos, unicodepos+1, reason);
3694 Py_DECREF(repunicode);
3695 goto onError;
3696 }
3697 *str = (char)c;
3698 }
3699 p = startp + newpos;
3700 Py_DECREF(repunicode);
3701 }
3702 }
3703 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003704 result = PyString_FromStringAndSize(PyBytes_AS_STRING(res),
3705 str - PyBytes_AS_STRING(res));
3706 onError:
3707 Py_DECREF(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003708 Py_XDECREF(errorHandler);
3709 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003710 return result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003711}
3712
Guido van Rossumd57fd912000-03-10 22:53:23 +00003713PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003714 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003715 const char *errors)
3716{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003717 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003718}
3719
3720PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3721{
3722 if (!PyUnicode_Check(unicode)) {
3723 PyErr_BadArgument();
3724 return NULL;
3725 }
3726 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3727 PyUnicode_GET_SIZE(unicode),
3728 NULL);
3729}
3730
3731/* --- 7-bit ASCII Codec -------------------------------------------------- */
3732
Guido van Rossumd57fd912000-03-10 22:53:23 +00003733PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003734 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003735 const char *errors)
3736{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003737 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003738 PyUnicodeObject *v;
3739 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003740 Py_ssize_t startinpos;
3741 Py_ssize_t endinpos;
3742 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003743 const char *e;
3744 PyObject *errorHandler = NULL;
3745 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003746
Guido van Rossumd57fd912000-03-10 22:53:23 +00003747 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003748 if (size == 1 && *(unsigned char*)s < 128) {
3749 Py_UNICODE r = *(unsigned char*)s;
3750 return PyUnicode_FromUnicode(&r, 1);
3751 }
Tim Petersced69f82003-09-16 20:30:58 +00003752
Guido van Rossumd57fd912000-03-10 22:53:23 +00003753 v = _PyUnicode_New(size);
3754 if (v == NULL)
3755 goto onError;
3756 if (size == 0)
3757 return (PyObject *)v;
3758 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003759 e = s + size;
3760 while (s < e) {
3761 register unsigned char c = (unsigned char)*s;
3762 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003763 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003764 ++s;
3765 }
3766 else {
3767 startinpos = s-starts;
3768 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003769 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003770 if (unicode_decode_call_errorhandler(
3771 errors, &errorHandler,
3772 "ascii", "ordinal not in range(128)",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003773 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003774 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003775 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003776 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003777 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00003778 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003779 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003780 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003781 Py_XDECREF(errorHandler);
3782 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003783 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003784
Guido van Rossumd57fd912000-03-10 22:53:23 +00003785 onError:
3786 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003787 Py_XDECREF(errorHandler);
3788 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003789 return NULL;
3790}
3791
Guido van Rossumd57fd912000-03-10 22:53:23 +00003792PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003793 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003794 const char *errors)
3795{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003796 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003797}
3798
3799PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3800{
3801 if (!PyUnicode_Check(unicode)) {
3802 PyErr_BadArgument();
3803 return NULL;
3804 }
3805 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3806 PyUnicode_GET_SIZE(unicode),
3807 NULL);
3808}
3809
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003810#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003811
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003812/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003813
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003814#if SIZEOF_INT < SIZEOF_SSIZE_T
3815#define NEED_RETRY
3816#endif
3817
3818/* XXX This code is limited to "true" double-byte encodings, as
3819 a) it assumes an incomplete character consists of a single byte, and
3820 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3821 encodings, see IsDBCSLeadByteEx documentation. */
3822
/* Decide whether the byte at s[offset] should be treated as a lead byte.

   Returns 0 when IsDBCSLeadByte() rejects the byte.  Otherwise the
   position that CharPrev() reports as the previous character is examined
   and 1 is returned if any of the following holds (tested in this order):
     - CharPrev() returned the same position (nothing precedes it),
     - the byte at the previous position is not itself a lead byte,
     - the previous position is exactly two bytes back.
   In every other case the result is 0. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *pos = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*pos))
        return 0;

    before = CharPrev(s, pos);
    if (before == pos)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return pos - before == 2;
}
3833
3834/*
3835 * Decode MBCS string into unicode object. If 'final' is set, converts
3836 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3837 */
3838static int decode_mbcs(PyUnicodeObject **v,
3839 const char *s, /* MBCS string */
3840 int size, /* sizeof MBCS string */
3841 int final)
3842{
3843 Py_UNICODE *p;
3844 Py_ssize_t n = 0;
3845 int usize = 0;
3846
3847 assert(size >= 0);
3848
3849 /* Skip trailing lead-byte unless 'final' is set */
3850 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3851 --size;
3852
3853 /* First get the size of the result */
3854 if (size > 0) {
3855 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3856 if (usize == 0) {
3857 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3858 return -1;
3859 }
3860 }
3861
3862 if (*v == NULL) {
3863 /* Create unicode object */
3864 *v = _PyUnicode_New(usize);
3865 if (*v == NULL)
3866 return -1;
3867 }
3868 else {
3869 /* Extend unicode object */
3870 n = PyUnicode_GET_SIZE(*v);
3871 if (_PyUnicode_Resize(v, n + usize) < 0)
3872 return -1;
3873 }
3874
3875 /* Do the conversion */
3876 if (size > 0) {
3877 p = PyUnicode_AS_UNICODE(*v) + n;
3878 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3879 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3880 return -1;
3881 }
3882 }
3883
3884 return size;
3885}
3886
3887PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3888 Py_ssize_t size,
3889 const char *errors,
3890 Py_ssize_t *consumed)
3891{
3892 PyUnicodeObject *v = NULL;
3893 int done;
3894
3895 if (consumed)
3896 *consumed = 0;
3897
3898#ifdef NEED_RETRY
3899 retry:
3900 if (size > INT_MAX)
3901 done = decode_mbcs(&v, s, INT_MAX, 0);
3902 else
3903#endif
3904 done = decode_mbcs(&v, s, (int)size, !consumed);
3905
3906 if (done < 0) {
3907 Py_XDECREF(v);
3908 return NULL;
3909 }
3910
3911 if (consumed)
3912 *consumed += done;
3913
3914#ifdef NEED_RETRY
3915 if (size > INT_MAX) {
3916 s += done;
3917 size -= done;
3918 goto retry;
3919 }
3920#endif
3921
3922 return (PyObject *)v;
3923}
3924
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003925PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003926 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003927 const char *errors)
3928{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003929 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3930}
3931
3932/*
3933 * Convert unicode into string object (MBCS).
3934 * Returns 0 if succeed, -1 otherwise.
3935 */
3936static int encode_mbcs(PyObject **repr,
3937 const Py_UNICODE *p, /* unicode */
3938 int size) /* size of unicode */
3939{
3940 int mbcssize = 0;
3941 Py_ssize_t n = 0;
3942
3943 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003944
3945 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003946 if (size > 0) {
3947 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3948 if (mbcssize == 0) {
3949 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3950 return -1;
3951 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003952 }
3953
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003954 if (*repr == NULL) {
3955 /* Create string object */
Guido van Rossum98297ee2007-11-06 21:34:58 +00003956 *repr = PyString_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003957 if (*repr == NULL)
3958 return -1;
3959 }
3960 else {
3961 /* Extend string object */
Guido van Rossum98297ee2007-11-06 21:34:58 +00003962 n = PyString_Size(*repr);
3963 if (_PyString_Resize(repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003964 return -1;
3965 }
3966
3967 /* Do the conversion */
3968 if (size > 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00003969 char *s = PyString_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003970 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3971 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3972 return -1;
3973 }
3974 }
3975
3976 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003977}
3978
3979PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003980 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003981 const char *errors)
3982{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003983 PyObject *repr = NULL;
3984 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003985
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003986#ifdef NEED_RETRY
3987 retry:
3988 if (size > INT_MAX)
3989 ret = encode_mbcs(&repr, p, INT_MAX);
3990 else
3991#endif
3992 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003993
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003994 if (ret < 0) {
3995 Py_XDECREF(repr);
3996 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003997 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003998
3999#ifdef NEED_RETRY
4000 if (size > INT_MAX) {
4001 p += INT_MAX;
4002 size -= INT_MAX;
4003 goto retry;
4004 }
4005#endif
4006
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004007 return repr;
4008}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004009
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004010PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4011{
4012 if (!PyUnicode_Check(unicode)) {
4013 PyErr_BadArgument();
4014 return NULL;
4015 }
4016 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
4017 PyUnicode_GET_SIZE(unicode),
4018 NULL);
4019}
4020
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004021#undef NEED_RETRY
4022
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004023#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004024
Guido van Rossumd57fd912000-03-10 22:53:23 +00004025/* --- Character Mapping Codec -------------------------------------------- */
4026
Guido van Rossumd57fd912000-03-10 22:53:23 +00004027PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004028 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004029 PyObject *mapping,
4030 const char *errors)
4031{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004032 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004033 Py_ssize_t startinpos;
4034 Py_ssize_t endinpos;
4035 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004036 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004037 PyUnicodeObject *v;
4038 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004039 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004040 PyObject *errorHandler = NULL;
4041 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004042 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004043 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004044
Guido van Rossumd57fd912000-03-10 22:53:23 +00004045 /* Default to Latin-1 */
4046 if (mapping == NULL)
4047 return PyUnicode_DecodeLatin1(s, size, errors);
4048
4049 v = _PyUnicode_New(size);
4050 if (v == NULL)
4051 goto onError;
4052 if (size == 0)
4053 return (PyObject *)v;
4054 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004055 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004056 if (PyUnicode_CheckExact(mapping)) {
4057 mapstring = PyUnicode_AS_UNICODE(mapping);
4058 maplen = PyUnicode_GET_SIZE(mapping);
4059 while (s < e) {
4060 unsigned char ch = *s;
4061 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004062
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004063 if (ch < maplen)
4064 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004065
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004066 if (x == 0xfffe) {
4067 /* undefined mapping */
4068 outpos = p-PyUnicode_AS_UNICODE(v);
4069 startinpos = s-starts;
4070 endinpos = startinpos+1;
4071 if (unicode_decode_call_errorhandler(
4072 errors, &errorHandler,
4073 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00004074 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004075 (PyObject **)&v, &outpos, &p)) {
4076 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00004077 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004078 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00004079 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004080 *p++ = x;
4081 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004082 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004083 }
4084 else {
4085 while (s < e) {
4086 unsigned char ch = *s;
4087 PyObject *w, *x;
4088
4089 /* Get mapping (char ordinal -> integer, Unicode char or None) */
Christian Heimes217cfd12007-12-02 14:31:20 +00004090 w = PyLong_FromLong((long)ch);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004091 if (w == NULL)
4092 goto onError;
4093 x = PyObject_GetItem(mapping, w);
4094 Py_DECREF(w);
4095 if (x == NULL) {
4096 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4097 /* No mapping found means: mapping is undefined. */
4098 PyErr_Clear();
4099 x = Py_None;
4100 Py_INCREF(x);
4101 } else
4102 goto onError;
4103 }
4104
4105 /* Apply mapping */
Christian Heimes217cfd12007-12-02 14:31:20 +00004106 if (PyLong_Check(x)) {
4107 long value = PyLong_AS_LONG(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004108 if (value < 0 || value > 65535) {
4109 PyErr_SetString(PyExc_TypeError,
4110 "character mapping must be in range(65536)");
4111 Py_DECREF(x);
4112 goto onError;
4113 }
4114 *p++ = (Py_UNICODE)value;
4115 }
4116 else if (x == Py_None) {
4117 /* undefined mapping */
4118 outpos = p-PyUnicode_AS_UNICODE(v);
4119 startinpos = s-starts;
4120 endinpos = startinpos+1;
4121 if (unicode_decode_call_errorhandler(
4122 errors, &errorHandler,
4123 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00004124 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004125 (PyObject **)&v, &outpos, &p)) {
4126 Py_DECREF(x);
4127 goto onError;
4128 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00004129 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004130 continue;
4131 }
4132 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004133 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004134
4135 if (targetsize == 1)
4136 /* 1-1 mapping */
4137 *p++ = *PyUnicode_AS_UNICODE(x);
4138
4139 else if (targetsize > 1) {
4140 /* 1-n mapping */
4141 if (targetsize > extrachars) {
4142 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004143 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4144 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004145 (targetsize << 2);
4146 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00004147 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004148 if (_PyUnicode_Resize(&v,
4149 PyUnicode_GET_SIZE(v) + needed) < 0) {
4150 Py_DECREF(x);
4151 goto onError;
4152 }
4153 p = PyUnicode_AS_UNICODE(v) + oldpos;
4154 }
4155 Py_UNICODE_COPY(p,
4156 PyUnicode_AS_UNICODE(x),
4157 targetsize);
4158 p += targetsize;
4159 extrachars -= targetsize;
4160 }
4161 /* 1-0 mapping: skip the character */
4162 }
4163 else {
4164 /* wrong return value */
4165 PyErr_SetString(PyExc_TypeError,
4166 "character mapping must return integer, None or unicode");
4167 Py_DECREF(x);
4168 goto onError;
4169 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004170 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004171 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004172 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004173 }
4174 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004175 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004176 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004177 Py_XDECREF(errorHandler);
4178 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004179 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004180
Guido van Rossumd57fd912000-03-10 22:53:23 +00004181 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004182 Py_XDECREF(errorHandler);
4183 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004184 Py_XDECREF(v);
4185 return NULL;
4186}
4187
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004188/* Charmap encoding: the lookup table */
4189
4190struct encoding_map{
4191 PyObject_HEAD
4192 unsigned char level1[32];
4193 int count2, count3;
4194 unsigned char level23[1];
4195};
4196
4197static PyObject*
4198encoding_map_size(PyObject *obj, PyObject* args)
4199{
4200 struct encoding_map *map = (struct encoding_map*)obj;
Christian Heimes217cfd12007-12-02 14:31:20 +00004201 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004202 128*map->count3);
4203}
4204
4205static PyMethodDef encoding_map_methods[] = {
4206 {"size", encoding_map_size, METH_NOARGS,
4207 PyDoc_STR("Return the size (in bytes) of this object") },
4208 { 0 }
4209};
4210
4211static void
4212encoding_map_dealloc(PyObject* o)
4213{
4214 PyObject_FREE(o);
4215}
4216
4217static PyTypeObject EncodingMapType = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004218 PyVarObject_HEAD_INIT(NULL, 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004219 "EncodingMap", /*tp_name*/
4220 sizeof(struct encoding_map), /*tp_basicsize*/
4221 0, /*tp_itemsize*/
4222 /* methods */
4223 encoding_map_dealloc, /*tp_dealloc*/
4224 0, /*tp_print*/
4225 0, /*tp_getattr*/
4226 0, /*tp_setattr*/
4227 0, /*tp_compare*/
4228 0, /*tp_repr*/
4229 0, /*tp_as_number*/
4230 0, /*tp_as_sequence*/
4231 0, /*tp_as_mapping*/
4232 0, /*tp_hash*/
4233 0, /*tp_call*/
4234 0, /*tp_str*/
4235 0, /*tp_getattro*/
4236 0, /*tp_setattro*/
4237 0, /*tp_as_buffer*/
4238 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4239 0, /*tp_doc*/
4240 0, /*tp_traverse*/
4241 0, /*tp_clear*/
4242 0, /*tp_richcompare*/
4243 0, /*tp_weaklistoffset*/
4244 0, /*tp_iter*/
4245 0, /*tp_iternext*/
4246 encoding_map_methods, /*tp_methods*/
4247 0, /*tp_members*/
4248 0, /*tp_getset*/
4249 0, /*tp_base*/
4250 0, /*tp_dict*/
4251 0, /*tp_descr_get*/
4252 0, /*tp_descr_set*/
4253 0, /*tp_dictoffset*/
4254 0, /*tp_init*/
4255 0, /*tp_alloc*/
4256 0, /*tp_new*/
4257 0, /*tp_free*/
4258 0, /*tp_is_gc*/
4259};
4260
4261PyObject*
4262PyUnicode_BuildEncodingMap(PyObject* string)
4263{
4264 Py_UNICODE *decode;
4265 PyObject *result;
4266 struct encoding_map *mresult;
4267 int i;
4268 int need_dict = 0;
4269 unsigned char level1[32];
4270 unsigned char level2[512];
4271 unsigned char *mlevel1, *mlevel2, *mlevel3;
4272 int count2 = 0, count3 = 0;
4273
4274 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4275 PyErr_BadArgument();
4276 return NULL;
4277 }
4278 decode = PyUnicode_AS_UNICODE(string);
4279 memset(level1, 0xFF, sizeof level1);
4280 memset(level2, 0xFF, sizeof level2);
4281
4282 /* If there isn't a one-to-one mapping of NULL to \0,
4283 or if there are non-BMP characters, we need to use
4284 a mapping dictionary. */
4285 if (decode[0] != 0)
4286 need_dict = 1;
4287 for (i = 1; i < 256; i++) {
4288 int l1, l2;
4289 if (decode[i] == 0
4290 #ifdef Py_UNICODE_WIDE
4291 || decode[i] > 0xFFFF
4292 #endif
4293 ) {
4294 need_dict = 1;
4295 break;
4296 }
4297 if (decode[i] == 0xFFFE)
4298 /* unmapped character */
4299 continue;
4300 l1 = decode[i] >> 11;
4301 l2 = decode[i] >> 7;
4302 if (level1[l1] == 0xFF)
4303 level1[l1] = count2++;
4304 if (level2[l2] == 0xFF)
4305 level2[l2] = count3++;
4306 }
4307
4308 if (count2 >= 0xFF || count3 >= 0xFF)
4309 need_dict = 1;
4310
4311 if (need_dict) {
4312 PyObject *result = PyDict_New();
4313 PyObject *key, *value;
4314 if (!result)
4315 return NULL;
4316 for (i = 0; i < 256; i++) {
4317 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00004318 key = PyLong_FromLong(decode[i]);
4319 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004320 if (!key || !value)
4321 goto failed1;
4322 if (PyDict_SetItem(result, key, value) == -1)
4323 goto failed1;
4324 Py_DECREF(key);
4325 Py_DECREF(value);
4326 }
4327 return result;
4328 failed1:
4329 Py_XDECREF(key);
4330 Py_XDECREF(value);
4331 Py_DECREF(result);
4332 return NULL;
4333 }
4334
4335 /* Create a three-level trie */
4336 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4337 16*count2 + 128*count3 - 1);
4338 if (!result)
4339 return PyErr_NoMemory();
4340 PyObject_Init(result, &EncodingMapType);
4341 mresult = (struct encoding_map*)result;
4342 mresult->count2 = count2;
4343 mresult->count3 = count3;
4344 mlevel1 = mresult->level1;
4345 mlevel2 = mresult->level23;
4346 mlevel3 = mresult->level23 + 16*count2;
4347 memcpy(mlevel1, level1, 32);
4348 memset(mlevel2, 0xFF, 16*count2);
4349 memset(mlevel3, 0, 128*count3);
4350 count3 = 0;
4351 for (i = 1; i < 256; i++) {
4352 int o1, o2, o3, i2, i3;
4353 if (decode[i] == 0xFFFE)
4354 /* unmapped character */
4355 continue;
4356 o1 = decode[i]>>11;
4357 o2 = (decode[i]>>7) & 0xF;
4358 i2 = 16*mlevel1[o1] + o2;
4359 if (mlevel2[i2] == 0xFF)
4360 mlevel2[i2] = count3++;
4361 o3 = decode[i] & 0x7F;
4362 i3 = 128*mlevel2[i2] + o3;
4363 mlevel3[i3] = i;
4364 }
4365 return result;
4366}
4367
4368static int
4369encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4370{
4371 struct encoding_map *map = (struct encoding_map*)mapping;
4372 int l1 = c>>11;
4373 int l2 = (c>>7) & 0xF;
4374 int l3 = c & 0x7F;
4375 int i;
4376
4377#ifdef Py_UNICODE_WIDE
4378 if (c > 0xFFFF) {
4379 return -1;
4380 }
4381#endif
4382 if (c == 0)
4383 return 0;
4384 /* level 1*/
4385 i = map->level1[l1];
4386 if (i == 0xFF) {
4387 return -1;
4388 }
4389 /* level 2*/
4390 i = map->level23[16*i+l2];
4391 if (i == 0xFF) {
4392 return -1;
4393 }
4394 /* level 3 */
4395 i = map->level23[16*map->count2 + 128*i + l3];
4396 if (i == 0) {
4397 return -1;
4398 }
4399 return i;
4400}
4401
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004402/* Lookup the character ch in the mapping. If the character
4403 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004404 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004405static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004406{
Christian Heimes217cfd12007-12-02 14:31:20 +00004407 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004408 PyObject *x;
4409
4410 if (w == NULL)
4411 return NULL;
4412 x = PyObject_GetItem(mapping, w);
4413 Py_DECREF(w);
4414 if (x == NULL) {
4415 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4416 /* No mapping found means: mapping is undefined. */
4417 PyErr_Clear();
4418 x = Py_None;
4419 Py_INCREF(x);
4420 return x;
4421 } else
4422 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004423 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004424 else if (x == Py_None)
4425 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00004426 else if (PyLong_Check(x)) {
4427 long value = PyLong_AS_LONG(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004428 if (value < 0 || value > 255) {
4429 PyErr_SetString(PyExc_TypeError,
4430 "character mapping must be in range(256)");
4431 Py_DECREF(x);
4432 return NULL;
4433 }
4434 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004435 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004436 else if (PyString_Check(x))
4437 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004438 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004439 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00004440 PyErr_Format(PyExc_TypeError,
Christian Heimesf3863112007-11-22 07:46:41 +00004441 "character mapping must return integer, bytes or None, not %.400s",
Walter Dörwald580ceed2007-05-09 10:39:19 +00004442 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004443 Py_DECREF(x);
4444 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004445 }
4446}
4447
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004448static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00004449charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004450{
Guido van Rossum98297ee2007-11-06 21:34:58 +00004451 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004452 /* exponentially overallocate to minimize reallocations */
4453 if (requiredsize < 2*outsize)
4454 requiredsize = 2*outsize;
Guido van Rossum98297ee2007-11-06 21:34:58 +00004455 if (_PyString_Resize(outobj, requiredsize))
Walter Dörwald827b0552007-05-12 13:23:53 +00004456 return -1;
Walter Dörwald827b0552007-05-12 13:23:53 +00004457 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004458}
4459
/* Result codes returned by charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,     /* the character was written to the output */
    enc_FAILED,      /* the character has no mapping; nothing was written */
    enc_EXCEPTION    /* an error occurred while looking up or resizing */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004463/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00004464 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004465 space is available. Return a new reference to the object that
4466 was put in the output buffer, or Py_None, if the mapping was undefined
4467 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004468 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004469static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004470charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004471 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004472{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004473 PyObject *rep;
4474 char *outstart;
Guido van Rossum98297ee2007-11-06 21:34:58 +00004475 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004476
Christian Heimes90aa7642007-12-19 02:45:37 +00004477 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004478 int res = encoding_map_lookup(c, mapping);
4479 Py_ssize_t requiredsize = *outpos+1;
4480 if (res == -1)
4481 return enc_FAILED;
Guido van Rossum98297ee2007-11-06 21:34:58 +00004482 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004483 if (charmapencode_resize(outobj, outpos, requiredsize))
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004484 return enc_EXCEPTION;
Guido van Rossum98297ee2007-11-06 21:34:58 +00004485 outstart = PyString_AS_STRING(*outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004486 outstart[(*outpos)++] = (char)res;
4487 return enc_SUCCESS;
4488 }
4489
4490 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004491 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004492 return enc_EXCEPTION;
4493 else if (rep==Py_None) {
4494 Py_DECREF(rep);
4495 return enc_FAILED;
4496 } else {
Christian Heimes217cfd12007-12-02 14:31:20 +00004497 if (PyLong_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004498 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004499 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004500 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004501 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004502 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004503 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004504 outstart = PyString_AS_STRING(*outobj);
Christian Heimes217cfd12007-12-02 14:31:20 +00004505 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004506 }
4507 else {
4508 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004509 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4510 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004511 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004512 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004513 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004514 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004515 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004516 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004517 memcpy(outstart + *outpos, repchars, repsize);
4518 *outpos += repsize;
4519 }
4520 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004521 Py_DECREF(rep);
4522 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004523}
4524
4525/* handle an error in PyUnicode_EncodeCharmap
4526 Return 0 on success, -1 on error */
4527static
4528int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004529 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004530 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004531 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004532 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004533{
4534 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004535 Py_ssize_t repsize;
4536 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004537 Py_UNICODE *uni2;
4538 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004539 Py_ssize_t collstartpos = *inpos;
4540 Py_ssize_t collendpos = *inpos+1;
4541 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004542 char *encoding = "charmap";
4543 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004544 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004545
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004546 /* find all unencodable characters */
4547 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004548 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00004549 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004550 int res = encoding_map_lookup(p[collendpos], mapping);
4551 if (res != -1)
4552 break;
4553 ++collendpos;
4554 continue;
4555 }
4556
4557 rep = charmapencode_lookup(p[collendpos], mapping);
4558 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004559 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004560 else if (rep!=Py_None) {
4561 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004562 break;
4563 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004564 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004565 ++collendpos;
4566 }
4567 /* cache callback name lookup
4568 * (if not done yet, i.e. it's the first error) */
4569 if (*known_errorHandler==-1) {
4570 if ((errors==NULL) || (!strcmp(errors, "strict")))
4571 *known_errorHandler = 1;
4572 else if (!strcmp(errors, "replace"))
4573 *known_errorHandler = 2;
4574 else if (!strcmp(errors, "ignore"))
4575 *known_errorHandler = 3;
4576 else if (!strcmp(errors, "xmlcharrefreplace"))
4577 *known_errorHandler = 4;
4578 else
4579 *known_errorHandler = 0;
4580 }
4581 switch (*known_errorHandler) {
4582 case 1: /* strict */
4583 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4584 return -1;
4585 case 2: /* replace */
4586 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4587 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004588 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004589 return -1;
4590 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004591 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004592 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4593 return -1;
4594 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004595 }
4596 /* fall through */
4597 case 3: /* ignore */
4598 *inpos = collendpos;
4599 break;
4600 case 4: /* xmlcharrefreplace */
4601 /* generate replacement (temporarily (mis)uses p) */
4602 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4603 char buffer[2+29+1+1];
4604 char *cp;
4605 sprintf(buffer, "&#%d;", (int)p[collpos]);
4606 for (cp = buffer; *cp; ++cp) {
4607 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004608 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004609 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004610 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004611 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4612 return -1;
4613 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004614 }
4615 }
4616 *inpos = collendpos;
4617 break;
4618 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004619 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004620 encoding, reason, p, size, exceptionObject,
4621 collstartpos, collendpos, &newpos);
4622 if (repunicode == NULL)
4623 return -1;
4624 /* generate replacement */
4625 repsize = PyUnicode_GET_SIZE(repunicode);
4626 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4627 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004628 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004629 return -1;
4630 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004631 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004632 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004633 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4634 return -1;
4635 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004636 }
4637 *inpos = newpos;
4638 Py_DECREF(repunicode);
4639 }
4640 return 0;
4641}
4642
Guido van Rossumd57fd912000-03-10 22:53:23 +00004643PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004644 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004645 PyObject *mapping,
4646 const char *errors)
4647{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004648 /* output object */
4649 PyObject *res = NULL;
4650 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004651 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004652 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004653 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004654 PyObject *errorHandler = NULL;
4655 PyObject *exc = NULL;
4656 /* the following variable is used for caching string comparisons
4657 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4658 * 3=ignore, 4=xmlcharrefreplace */
4659 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004660
4661 /* Default to Latin-1 */
4662 if (mapping == NULL)
4663 return PyUnicode_EncodeLatin1(p, size, errors);
4664
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004665 /* allocate enough for a simple encoding without
4666 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004667 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004668 if (res == NULL)
4669 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004670 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004671 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004672
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004673 while (inpos<size) {
4674 /* try to encode it */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004675 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004676 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004677 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004678 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004679 if (charmap_encoding_error(p, size, &inpos, mapping,
4680 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004681 &known_errorHandler, &errorHandler, errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004682 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004683 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004684 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004685 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004686 else
4687 /* done with this character => adjust input position */
4688 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004689 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004690
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004691 /* Resize if we allocated to much */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004692 if (respos<PyString_GET_SIZE(res))
4693 _PyString_Resize(&res, respos);
4694
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004695 Py_XDECREF(exc);
4696 Py_XDECREF(errorHandler);
4697 return res;
4698
4699 onError:
4700 Py_XDECREF(res);
4701 Py_XDECREF(exc);
4702 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004703 return NULL;
4704}
4705
4706PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4707 PyObject *mapping)
4708{
4709 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4710 PyErr_BadArgument();
4711 return NULL;
4712 }
4713 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4714 PyUnicode_GET_SIZE(unicode),
4715 mapping,
4716 NULL);
4717}
4718
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004719/* create or adjust a UnicodeTranslateError */
4720static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004721 const Py_UNICODE *unicode, Py_ssize_t size,
4722 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004723 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004724{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004725 if (*exceptionObject == NULL) {
4726 *exceptionObject = PyUnicodeTranslateError_Create(
4727 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004728 }
4729 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004730 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4731 goto onError;
4732 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4733 goto onError;
4734 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4735 goto onError;
4736 return;
4737 onError:
4738 Py_DECREF(*exceptionObject);
4739 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004740 }
4741}
4742
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004743/* raises a UnicodeTranslateError */
4744static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004745 const Py_UNICODE *unicode, Py_ssize_t size,
4746 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004747 const char *reason)
4748{
4749 make_translate_exception(exceptionObject,
4750 unicode, size, startpos, endpos, reason);
4751 if (*exceptionObject != NULL)
4752 PyCodec_StrictErrors(*exceptionObject);
4753}
4754
4755/* error handling callback helper:
4756 build arguments, call the callback and check the arguments,
4757 put the result into newpos and return the replacement string, which
4758 has to be freed by the caller */
4759static PyObject *unicode_translate_call_errorhandler(const char *errors,
4760 PyObject **errorHandler,
4761 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004762 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4763 Py_ssize_t startpos, Py_ssize_t endpos,
4764 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004765{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004766 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004767
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004768 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004769 PyObject *restuple;
4770 PyObject *resunicode;
4771
4772 if (*errorHandler == NULL) {
4773 *errorHandler = PyCodec_LookupError(errors);
4774 if (*errorHandler == NULL)
4775 return NULL;
4776 }
4777
4778 make_translate_exception(exceptionObject,
4779 unicode, size, startpos, endpos, reason);
4780 if (*exceptionObject == NULL)
4781 return NULL;
4782
4783 restuple = PyObject_CallFunctionObjArgs(
4784 *errorHandler, *exceptionObject, NULL);
4785 if (restuple == NULL)
4786 return NULL;
4787 if (!PyTuple_Check(restuple)) {
4788 PyErr_Format(PyExc_TypeError, &argparse[4]);
4789 Py_DECREF(restuple);
4790 return NULL;
4791 }
4792 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004793 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004794 Py_DECREF(restuple);
4795 return NULL;
4796 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004797 if (i_newpos<0)
4798 *newpos = size+i_newpos;
4799 else
4800 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004801 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004802 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004803 Py_DECREF(restuple);
4804 return NULL;
4805 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004806 Py_INCREF(resunicode);
4807 Py_DECREF(restuple);
4808 return resunicode;
4809}
4810
4811/* Lookup the character ch in the mapping and put the result in result,
4812 which must be decrefed by the caller.
4813 Return 0 on success, -1 on error */
4814static
4815int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4816{
Christian Heimes217cfd12007-12-02 14:31:20 +00004817 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004818 PyObject *x;
4819
4820 if (w == NULL)
4821 return -1;
4822 x = PyObject_GetItem(mapping, w);
4823 Py_DECREF(w);
4824 if (x == NULL) {
4825 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4826 /* No mapping found means: use 1:1 mapping. */
4827 PyErr_Clear();
4828 *result = NULL;
4829 return 0;
4830 } else
4831 return -1;
4832 }
4833 else if (x == Py_None) {
4834 *result = x;
4835 return 0;
4836 }
Christian Heimes217cfd12007-12-02 14:31:20 +00004837 else if (PyLong_Check(x)) {
4838 long value = PyLong_AS_LONG(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004839 long max = PyUnicode_GetMax();
4840 if (value < 0 || value > max) {
4841 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00004842 "character mapping must be in range(0x%x)", max+1);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004843 Py_DECREF(x);
4844 return -1;
4845 }
4846 *result = x;
4847 return 0;
4848 }
4849 else if (PyUnicode_Check(x)) {
4850 *result = x;
4851 return 0;
4852 }
4853 else {
4854 /* wrong return value */
4855 PyErr_SetString(PyExc_TypeError,
4856 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004857 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004858 return -1;
4859 }
4860}
4861/* ensure that *outobj is at least requiredsize characters long,
4862if not reallocate and adjust various state variables.
4863Return 0 on success, -1 on error */
4864static
Walter Dörwald4894c302003-10-24 14:25:28 +00004865int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004866 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004867{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004868 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004869 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004870 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004871 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004872 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004873 if (requiredsize < 2 * oldsize)
4874 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004875 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004876 return -1;
4877 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004878 }
4879 return 0;
4880}
4881/* lookup the character, put the result in the output string and adjust
4882 various state variables. Return a new reference to the object that
4883 was put in the output buffer in *result, or Py_None, if the mapping was
4884 undefined (in which case no character was written).
4885 The called must decref result.
4886 Return 0 on success, -1 on error. */
4887static
Walter Dörwald4894c302003-10-24 14:25:28 +00004888int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004889 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004890 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004891{
Walter Dörwald4894c302003-10-24 14:25:28 +00004892 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004893 return -1;
4894 if (*res==NULL) {
4895 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004896 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004897 }
4898 else if (*res==Py_None)
4899 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00004900 else if (PyLong_Check(*res)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004901 /* no overflow check, because we know that the space is enough */
Christian Heimes217cfd12007-12-02 14:31:20 +00004902 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004903 }
4904 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004905 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004906 if (repsize==1) {
4907 /* no overflow check, because we know that the space is enough */
4908 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4909 }
4910 else if (repsize!=0) {
4911 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004912 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004913 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004914 repsize - 1;
4915 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004916 return -1;
4917 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4918 *outp += repsize;
4919 }
4920 }
4921 else
4922 return -1;
4923 return 0;
4924}
4925
4926PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004927 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004928 PyObject *mapping,
4929 const char *errors)
4930{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004931 /* output object */
4932 PyObject *res = NULL;
4933 /* pointers to the beginning and end+1 of input */
4934 const Py_UNICODE *startp = p;
4935 const Py_UNICODE *endp = p + size;
4936 /* pointer into the output */
4937 Py_UNICODE *str;
4938 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004939 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004940 char *reason = "character maps to <undefined>";
4941 PyObject *errorHandler = NULL;
4942 PyObject *exc = NULL;
4943 /* the following variable is used for caching string comparisons
4944 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4945 * 3=ignore, 4=xmlcharrefreplace */
4946 int known_errorHandler = -1;
4947
Guido van Rossumd57fd912000-03-10 22:53:23 +00004948 if (mapping == NULL) {
4949 PyErr_BadArgument();
4950 return NULL;
4951 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004952
4953 /* allocate enough for a simple 1:1 translation without
4954 replacements, if we need more, we'll resize */
4955 res = PyUnicode_FromUnicode(NULL, size);
4956 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004957 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004958 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004959 return res;
4960 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004961
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004962 while (p<endp) {
4963 /* try to encode it */
4964 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004965 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004966 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004967 goto onError;
4968 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004969 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004970 if (x!=Py_None) /* it worked => adjust input pointer */
4971 ++p;
4972 else { /* untranslatable character */
4973 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004974 Py_ssize_t repsize;
4975 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004976 Py_UNICODE *uni2;
4977 /* startpos for collecting untranslatable chars */
4978 const Py_UNICODE *collstart = p;
4979 const Py_UNICODE *collend = p+1;
4980 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004981
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004982 /* find all untranslatable characters */
4983 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004984 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004985 goto onError;
4986 Py_XDECREF(x);
4987 if (x!=Py_None)
4988 break;
4989 ++collend;
4990 }
4991 /* cache callback name lookup
4992 * (if not done yet, i.e. it's the first error) */
4993 if (known_errorHandler==-1) {
4994 if ((errors==NULL) || (!strcmp(errors, "strict")))
4995 known_errorHandler = 1;
4996 else if (!strcmp(errors, "replace"))
4997 known_errorHandler = 2;
4998 else if (!strcmp(errors, "ignore"))
4999 known_errorHandler = 3;
5000 else if (!strcmp(errors, "xmlcharrefreplace"))
5001 known_errorHandler = 4;
5002 else
5003 known_errorHandler = 0;
5004 }
5005 switch (known_errorHandler) {
5006 case 1: /* strict */
5007 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
5008 goto onError;
5009 case 2: /* replace */
5010 /* No need to check for space, this is a 1:1 replacement */
5011 for (coll = collstart; coll<collend; ++coll)
5012 *str++ = '?';
5013 /* fall through */
5014 case 3: /* ignore */
5015 p = collend;
5016 break;
5017 case 4: /* xmlcharrefreplace */
5018 /* generate replacement (temporarily (mis)uses p) */
5019 for (p = collstart; p < collend; ++p) {
5020 char buffer[2+29+1+1];
5021 char *cp;
5022 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00005023 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005024 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5025 goto onError;
5026 for (cp = buffer; *cp; ++cp)
5027 *str++ = *cp;
5028 }
5029 p = collend;
5030 break;
5031 default:
5032 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5033 reason, startp, size, &exc,
5034 collstart-startp, collend-startp, &newpos);
5035 if (repunicode == NULL)
5036 goto onError;
5037 /* generate replacement */
5038 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00005039 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005040 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5041 Py_DECREF(repunicode);
5042 goto onError;
5043 }
5044 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5045 *str++ = *uni2;
5046 p = startp + newpos;
5047 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005048 }
5049 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005050 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005051 /* Resize if we allocated to much */
5052 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005053 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00005054 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005055 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005056 }
5057 Py_XDECREF(exc);
5058 Py_XDECREF(errorHandler);
5059 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005060
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005061 onError:
5062 Py_XDECREF(res);
5063 Py_XDECREF(exc);
5064 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005065 return NULL;
5066}
5067
5068PyObject *PyUnicode_Translate(PyObject *str,
5069 PyObject *mapping,
5070 const char *errors)
5071{
5072 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005073
Guido van Rossumd57fd912000-03-10 22:53:23 +00005074 str = PyUnicode_FromObject(str);
5075 if (str == NULL)
5076 goto onError;
5077 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
5078 PyUnicode_GET_SIZE(str),
5079 mapping,
5080 errors);
5081 Py_DECREF(str);
5082 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005083
Guido van Rossumd57fd912000-03-10 22:53:23 +00005084 onError:
5085 Py_XDECREF(str);
5086 return NULL;
5087}
Tim Petersced69f82003-09-16 20:30:58 +00005088
Guido van Rossum9e896b32000-04-05 20:11:21 +00005089/* --- Decimal Encoder ---------------------------------------------------- */
5090
5091int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005092 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00005093 char *output,
5094 const char *errors)
5095{
5096 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005097 PyObject *errorHandler = NULL;
5098 PyObject *exc = NULL;
5099 const char *encoding = "decimal";
5100 const char *reason = "invalid decimal Unicode string";
5101 /* the following variable is used for caching string comparisons
5102 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5103 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005104
5105 if (output == NULL) {
5106 PyErr_BadArgument();
5107 return -1;
5108 }
5109
5110 p = s;
5111 end = s + length;
5112 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005113 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005114 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005115 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005116 Py_ssize_t repsize;
5117 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005118 Py_UNICODE *uni2;
5119 Py_UNICODE *collstart;
5120 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005121
Guido van Rossum9e896b32000-04-05 20:11:21 +00005122 if (Py_UNICODE_ISSPACE(ch)) {
5123 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005124 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005125 continue;
5126 }
5127 decimal = Py_UNICODE_TODECIMAL(ch);
5128 if (decimal >= 0) {
5129 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005130 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005131 continue;
5132 }
Guido van Rossumba477042000-04-06 18:18:10 +00005133 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00005134 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005135 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005136 continue;
5137 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005138 /* All other characters are considered unencodable */
5139 collstart = p;
5140 collend = p+1;
5141 while (collend < end) {
5142 if ((0 < *collend && *collend < 256) ||
5143 !Py_UNICODE_ISSPACE(*collend) ||
5144 Py_UNICODE_TODECIMAL(*collend))
5145 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005146 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005147 /* cache callback name lookup
5148 * (if not done yet, i.e. it's the first error) */
5149 if (known_errorHandler==-1) {
5150 if ((errors==NULL) || (!strcmp(errors, "strict")))
5151 known_errorHandler = 1;
5152 else if (!strcmp(errors, "replace"))
5153 known_errorHandler = 2;
5154 else if (!strcmp(errors, "ignore"))
5155 known_errorHandler = 3;
5156 else if (!strcmp(errors, "xmlcharrefreplace"))
5157 known_errorHandler = 4;
5158 else
5159 known_errorHandler = 0;
5160 }
5161 switch (known_errorHandler) {
5162 case 1: /* strict */
5163 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5164 goto onError;
5165 case 2: /* replace */
5166 for (p = collstart; p < collend; ++p)
5167 *output++ = '?';
5168 /* fall through */
5169 case 3: /* ignore */
5170 p = collend;
5171 break;
5172 case 4: /* xmlcharrefreplace */
5173 /* generate replacement (temporarily (mis)uses p) */
5174 for (p = collstart; p < collend; ++p)
5175 output += sprintf(output, "&#%d;", (int)*p);
5176 p = collend;
5177 break;
5178 default:
5179 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5180 encoding, reason, s, length, &exc,
5181 collstart-s, collend-s, &newpos);
5182 if (repunicode == NULL)
5183 goto onError;
5184 /* generate replacement */
5185 repsize = PyUnicode_GET_SIZE(repunicode);
5186 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5187 Py_UNICODE ch = *uni2;
5188 if (Py_UNICODE_ISSPACE(ch))
5189 *output++ = ' ';
5190 else {
5191 decimal = Py_UNICODE_TODECIMAL(ch);
5192 if (decimal >= 0)
5193 *output++ = '0' + decimal;
5194 else if (0 < ch && ch < 256)
5195 *output++ = (char)ch;
5196 else {
5197 Py_DECREF(repunicode);
5198 raise_encode_exception(&exc, encoding,
5199 s, length, collstart-s, collend-s, reason);
5200 goto onError;
5201 }
5202 }
5203 }
5204 p = s + newpos;
5205 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005206 }
5207 }
5208 /* 0-terminate the output string */
5209 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005210 Py_XDECREF(exc);
5211 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005212 return 0;
5213
5214 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005215 Py_XDECREF(exc);
5216 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005217 return -1;
5218}
5219
Guido van Rossumd57fd912000-03-10 22:53:23 +00005220/* --- Helpers ------------------------------------------------------------ */
5221
Eric Smith8c663262007-08-25 02:26:07 +00005222#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005223#include "stringlib/fastsearch.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005224#include "stringlib/count.h"
Christian Heimes9cd17752007-11-18 19:35:23 +00005225/* Include _ParseTupleFinds from find.h */
5226#define FROM_UNICODE
Thomas Wouters477c8d52006-05-27 19:21:47 +00005227#include "stringlib/find.h"
5228#include "stringlib/partition.h"
5229
/* helper macro to fixup start/end slice values

   Operates on the variables `start` and `end` that must be in scope where
   the macro is expanded; `obj` must point to something with a `length`
   member and is evaluated several times.  A negative start is taken
   relative to the length and then clamped to 0; end is clamped to the
   length, and a negative end is taken relative to the length and then
   clamped to 0.

   The body is wrapped in do { } while (0) so that the macro behaves as a
   single statement (e.g. under an unbraced if/else); terminate each use
   with a semicolon. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
5242
Martin v. Löwis18e16552006-02-15 17:27:45 +00005243Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005244 PyObject *substr,
5245 Py_ssize_t start,
5246 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005247{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005248 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005249 PyUnicodeObject* str_obj;
5250 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005251
Thomas Wouters477c8d52006-05-27 19:21:47 +00005252 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5253 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005254 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005255 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5256 if (!sub_obj) {
5257 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005258 return -1;
5259 }
Tim Petersced69f82003-09-16 20:30:58 +00005260
Thomas Wouters477c8d52006-05-27 19:21:47 +00005261 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005262
Thomas Wouters477c8d52006-05-27 19:21:47 +00005263 result = stringlib_count(
5264 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5265 );
5266
5267 Py_DECREF(sub_obj);
5268 Py_DECREF(str_obj);
5269
Guido van Rossumd57fd912000-03-10 22:53:23 +00005270 return result;
5271}
5272
Martin v. Löwis18e16552006-02-15 17:27:45 +00005273Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005274 PyObject *sub,
5275 Py_ssize_t start,
5276 Py_ssize_t end,
5277 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005278{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005279 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005280
Guido van Rossumd57fd912000-03-10 22:53:23 +00005281 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005282 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005283 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005284 sub = PyUnicode_FromObject(sub);
5285 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00005286 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005287 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005288 }
Tim Petersced69f82003-09-16 20:30:58 +00005289
Thomas Wouters477c8d52006-05-27 19:21:47 +00005290 if (direction > 0)
5291 result = stringlib_find_slice(
5292 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5293 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5294 start, end
5295 );
5296 else
5297 result = stringlib_rfind_slice(
5298 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5299 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5300 start, end
5301 );
5302
Guido van Rossumd57fd912000-03-10 22:53:23 +00005303 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005304 Py_DECREF(sub);
5305
Guido van Rossumd57fd912000-03-10 22:53:23 +00005306 return result;
5307}
5308
Tim Petersced69f82003-09-16 20:30:58 +00005309static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005310int tailmatch(PyUnicodeObject *self,
5311 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005312 Py_ssize_t start,
5313 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005314 int direction)
5315{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005316 if (substring->length == 0)
5317 return 1;
5318
Thomas Wouters477c8d52006-05-27 19:21:47 +00005319 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005320
5321 end -= substring->length;
5322 if (end < start)
5323 return 0;
5324
5325 if (direction > 0) {
5326 if (Py_UNICODE_MATCH(self, end, substring))
5327 return 1;
5328 } else {
5329 if (Py_UNICODE_MATCH(self, start, substring))
5330 return 1;
5331 }
5332
5333 return 0;
5334}
5335
Martin v. Löwis18e16552006-02-15 17:27:45 +00005336Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005337 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005338 Py_ssize_t start,
5339 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005340 int direction)
5341{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005342 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005343
Guido van Rossumd57fd912000-03-10 22:53:23 +00005344 str = PyUnicode_FromObject(str);
5345 if (str == NULL)
5346 return -1;
5347 substr = PyUnicode_FromObject(substr);
5348 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005349 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005350 return -1;
5351 }
Tim Petersced69f82003-09-16 20:30:58 +00005352
Guido van Rossumd57fd912000-03-10 22:53:23 +00005353 result = tailmatch((PyUnicodeObject *)str,
5354 (PyUnicodeObject *)substr,
5355 start, end, direction);
5356 Py_DECREF(str);
5357 Py_DECREF(substr);
5358 return result;
5359}
5360
Guido van Rossumd57fd912000-03-10 22:53:23 +00005361/* Apply fixfct filter to the Unicode object self and return a
5362 reference to the modified object */
5363
Tim Petersced69f82003-09-16 20:30:58 +00005364static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005365PyObject *fixup(PyUnicodeObject *self,
5366 int (*fixfct)(PyUnicodeObject *s))
5367{
5368
5369 PyUnicodeObject *u;
5370
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005371 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005372 if (u == NULL)
5373 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005374
5375 Py_UNICODE_COPY(u->str, self->str, self->length);
5376
Tim Peters7a29bd52001-09-12 03:03:31 +00005377 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005378 /* fixfct should return TRUE if it modified the buffer. If
5379 FALSE, return a reference to the original buffer instead
5380 (to save space, not time) */
5381 Py_INCREF(self);
5382 Py_DECREF(u);
5383 return (PyObject*) self;
5384 }
5385 return (PyObject*) u;
5386}
5387
Tim Petersced69f82003-09-16 20:30:58 +00005388static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005389int fixupper(PyUnicodeObject *self)
5390{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005391 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005392 Py_UNICODE *s = self->str;
5393 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005394
Guido van Rossumd57fd912000-03-10 22:53:23 +00005395 while (len-- > 0) {
5396 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005397
Guido van Rossumd57fd912000-03-10 22:53:23 +00005398 ch = Py_UNICODE_TOUPPER(*s);
5399 if (ch != *s) {
5400 status = 1;
5401 *s = ch;
5402 }
5403 s++;
5404 }
5405
5406 return status;
5407}
5408
Tim Petersced69f82003-09-16 20:30:58 +00005409static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005410int fixlower(PyUnicodeObject *self)
5411{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005412 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005413 Py_UNICODE *s = self->str;
5414 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005415
Guido van Rossumd57fd912000-03-10 22:53:23 +00005416 while (len-- > 0) {
5417 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005418
Guido van Rossumd57fd912000-03-10 22:53:23 +00005419 ch = Py_UNICODE_TOLOWER(*s);
5420 if (ch != *s) {
5421 status = 1;
5422 *s = ch;
5423 }
5424 s++;
5425 }
5426
5427 return status;
5428}
5429
Tim Petersced69f82003-09-16 20:30:58 +00005430static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005431int fixswapcase(PyUnicodeObject *self)
5432{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005433 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005434 Py_UNICODE *s = self->str;
5435 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005436
Guido van Rossumd57fd912000-03-10 22:53:23 +00005437 while (len-- > 0) {
5438 if (Py_UNICODE_ISUPPER(*s)) {
5439 *s = Py_UNICODE_TOLOWER(*s);
5440 status = 1;
5441 } else if (Py_UNICODE_ISLOWER(*s)) {
5442 *s = Py_UNICODE_TOUPPER(*s);
5443 status = 1;
5444 }
5445 s++;
5446 }
5447
5448 return status;
5449}
5450
Tim Petersced69f82003-09-16 20:30:58 +00005451static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005452int fixcapitalize(PyUnicodeObject *self)
5453{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005454 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005455 Py_UNICODE *s = self->str;
5456 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005457
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005458 if (len == 0)
5459 return 0;
5460 if (Py_UNICODE_ISLOWER(*s)) {
5461 *s = Py_UNICODE_TOUPPER(*s);
5462 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005463 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005464 s++;
5465 while (--len > 0) {
5466 if (Py_UNICODE_ISUPPER(*s)) {
5467 *s = Py_UNICODE_TOLOWER(*s);
5468 status = 1;
5469 }
5470 s++;
5471 }
5472 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005473}
5474
5475static
5476int fixtitle(PyUnicodeObject *self)
5477{
5478 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5479 register Py_UNICODE *e;
5480 int previous_is_cased;
5481
5482 /* Shortcut for single character strings */
5483 if (PyUnicode_GET_SIZE(self) == 1) {
5484 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5485 if (*p != ch) {
5486 *p = ch;
5487 return 1;
5488 }
5489 else
5490 return 0;
5491 }
Tim Petersced69f82003-09-16 20:30:58 +00005492
Guido van Rossumd57fd912000-03-10 22:53:23 +00005493 e = p + PyUnicode_GET_SIZE(self);
5494 previous_is_cased = 0;
5495 for (; p < e; p++) {
5496 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005497
Guido van Rossumd57fd912000-03-10 22:53:23 +00005498 if (previous_is_cased)
5499 *p = Py_UNICODE_TOLOWER(ch);
5500 else
5501 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005502
5503 if (Py_UNICODE_ISLOWER(ch) ||
5504 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005505 Py_UNICODE_ISTITLE(ch))
5506 previous_is_cased = 1;
5507 else
5508 previous_is_cased = 0;
5509 }
5510 return 1;
5511}
5512
Tim Peters8ce9f162004-08-27 01:49:32 +00005513PyObject *
5514PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005515{
Tim Peters8ce9f162004-08-27 01:49:32 +00005516 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005517 const Py_UNICODE blank = ' ';
5518 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005519 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005520 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005521 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5522 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005523 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5524 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005525 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005526 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005527 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005528
Tim Peters05eba1f2004-08-27 21:32:02 +00005529 fseq = PySequence_Fast(seq, "");
5530 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005531 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005532 }
5533
Tim Peters91879ab2004-08-27 22:35:44 +00005534 /* Grrrr. A codec may be invoked to convert str objects to
5535 * Unicode, and so it's possible to call back into Python code
5536 * during PyUnicode_FromObject(), and so it's possible for a sick
5537 * codec to change the size of fseq (if seq is a list). Therefore
5538 * we have to keep refetching the size -- can't assume seqlen
5539 * is invariant.
5540 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005541 seqlen = PySequence_Fast_GET_SIZE(fseq);
5542 /* If empty sequence, return u"". */
5543 if (seqlen == 0) {
5544 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5545 goto Done;
5546 }
5547 /* If singleton sequence with an exact Unicode, return that. */
5548 if (seqlen == 1) {
5549 item = PySequence_Fast_GET_ITEM(fseq, 0);
5550 if (PyUnicode_CheckExact(item)) {
5551 Py_INCREF(item);
5552 res = (PyUnicodeObject *)item;
5553 goto Done;
5554 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005555 }
5556
Tim Peters05eba1f2004-08-27 21:32:02 +00005557 /* At least two items to join, or one that isn't exact Unicode. */
5558 if (seqlen > 1) {
5559 /* Set up sep and seplen -- they're needed. */
5560 if (separator == NULL) {
5561 sep = &blank;
5562 seplen = 1;
5563 }
5564 else {
5565 internal_separator = PyUnicode_FromObject(separator);
5566 if (internal_separator == NULL)
5567 goto onError;
5568 sep = PyUnicode_AS_UNICODE(internal_separator);
5569 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00005570 /* In case PyUnicode_FromObject() mutated seq. */
5571 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005572 }
5573 }
5574
5575 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005576 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005577 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005578 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005579 res_p = PyUnicode_AS_UNICODE(res);
5580 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005581
Tim Peters05eba1f2004-08-27 21:32:02 +00005582 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005583 Py_ssize_t itemlen;
5584 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005585
5586 item = PySequence_Fast_GET_ITEM(fseq, i);
5587 /* Convert item to Unicode. */
Guido van Rossum98297ee2007-11-06 21:34:58 +00005588 if (!PyUnicode_Check(item)) {
5589 PyErr_Format(PyExc_TypeError,
5590 "sequence item %zd: expected str instance,"
5591 " %.80s found",
Christian Heimes90aa7642007-12-19 02:45:37 +00005592 i, Py_TYPE(item)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005593 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005594 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005595 item = PyUnicode_FromObject(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005596 if (item == NULL)
5597 goto onError;
5598 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005599
Tim Peters91879ab2004-08-27 22:35:44 +00005600 /* In case PyUnicode_FromObject() mutated seq. */
5601 seqlen = PySequence_Fast_GET_SIZE(fseq);
5602
Tim Peters8ce9f162004-08-27 01:49:32 +00005603 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005604 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005605 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005606 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005607 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005608 if (i < seqlen - 1) {
5609 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005610 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00005611 goto Overflow;
5612 }
5613 if (new_res_used > res_alloc) {
5614 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00005615 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00005616 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005617 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005618 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005619 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005620 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005621 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005622 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005623 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005624 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005625 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005626
5627 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005628 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005629 res_p += itemlen;
5630 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005631 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005632 res_p += seplen;
5633 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005634 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005635 res_used = new_res_used;
5636 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005637
Tim Peters05eba1f2004-08-27 21:32:02 +00005638 /* Shrink res to match the used area; this probably can't fail,
5639 * but it's cheap to check.
5640 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005641 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005642 goto onError;
5643
5644 Done:
5645 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005646 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005647 return (PyObject *)res;
5648
Tim Peters8ce9f162004-08-27 01:49:32 +00005649 Overflow:
5650 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005651 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005652 Py_DECREF(item);
5653 /* fall through */
5654
Guido van Rossumd57fd912000-03-10 22:53:23 +00005655 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005656 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005657 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005658 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005659 return NULL;
5660}
5661
Tim Petersced69f82003-09-16 20:30:58 +00005662static
5663PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005664 Py_ssize_t left,
5665 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005666 Py_UNICODE fill)
5667{
5668 PyUnicodeObject *u;
5669
5670 if (left < 0)
5671 left = 0;
5672 if (right < 0)
5673 right = 0;
5674
Tim Peters7a29bd52001-09-12 03:03:31 +00005675 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005676 Py_INCREF(self);
5677 return self;
5678 }
5679
5680 u = _PyUnicode_New(left + self->length + right);
5681 if (u) {
5682 if (left)
5683 Py_UNICODE_FILL(u->str, fill, left);
5684 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5685 if (right)
5686 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5687 }
5688
5689 return u;
5690}
5691
/* Append the slice data[left:right] to `list` as a new Unicode object.

   Expects the enclosing function to provide a `PyObject *str` scratch
   variable, a `PyObject *list`, and an `onError` label that is jumped to
   if creating or appending the slice fails.  Wrapped in do/while(0) so
   the macro behaves as a single statement (safe in unbraced if/else);
   invoke it with a trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
5702
5703static
5704PyObject *split_whitespace(PyUnicodeObject *self,
5705 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005706 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005707{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005708 register Py_ssize_t i;
5709 register Py_ssize_t j;
5710 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005711 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005712 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005713
5714 for (i = j = 0; i < len; ) {
5715 /* find a token */
Christian Heimes190d79e2008-01-30 11:58:22 +00005716 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005717 i++;
5718 j = i;
Christian Heimes190d79e2008-01-30 11:58:22 +00005719 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005720 i++;
5721 if (j < i) {
5722 if (maxcount-- <= 0)
5723 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005724 SPLIT_APPEND(buf, j, i);
5725 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005726 i++;
5727 j = i;
5728 }
5729 }
5730 if (j < len) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005731 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005732 }
5733 return list;
5734
5735 onError:
5736 Py_DECREF(list);
5737 return NULL;
5738}
5739
5740PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005741 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005742{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005743 register Py_ssize_t i;
5744 register Py_ssize_t j;
5745 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005746 PyObject *list;
5747 PyObject *str;
5748 Py_UNICODE *data;
5749
5750 string = PyUnicode_FromObject(string);
5751 if (string == NULL)
5752 return NULL;
5753 data = PyUnicode_AS_UNICODE(string);
5754 len = PyUnicode_GET_SIZE(string);
5755
Guido van Rossumd57fd912000-03-10 22:53:23 +00005756 list = PyList_New(0);
5757 if (!list)
5758 goto onError;
5759
5760 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005761 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005762
Guido van Rossumd57fd912000-03-10 22:53:23 +00005763 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005764 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005765 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005766
5767 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005768 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005769 if (i < len) {
5770 if (data[i] == '\r' && i + 1 < len &&
5771 data[i+1] == '\n')
5772 i += 2;
5773 else
5774 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005775 if (keepends)
5776 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005777 }
Guido van Rossum86662912000-04-11 15:38:46 +00005778 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005779 j = i;
5780 }
5781 if (j < len) {
5782 SPLIT_APPEND(data, j, len);
5783 }
5784
5785 Py_DECREF(string);
5786 return list;
5787
5788 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005789 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005790 Py_DECREF(string);
5791 return NULL;
5792}
5793
Tim Petersced69f82003-09-16 20:30:58 +00005794static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005795PyObject *split_char(PyUnicodeObject *self,
5796 PyObject *list,
5797 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005798 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005799{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005800 register Py_ssize_t i;
5801 register Py_ssize_t j;
5802 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005803 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005804 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005805
5806 for (i = j = 0; i < len; ) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005807 if (buf[i] == ch) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005808 if (maxcount-- <= 0)
5809 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005810 SPLIT_APPEND(buf, j, i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005811 i = j = i + 1;
5812 } else
5813 i++;
5814 }
5815 if (j <= len) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005816 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005817 }
5818 return list;
5819
5820 onError:
5821 Py_DECREF(list);
5822 return NULL;
5823}
5824
Tim Petersced69f82003-09-16 20:30:58 +00005825static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005826PyObject *split_substring(PyUnicodeObject *self,
5827 PyObject *list,
5828 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005829 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005830{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005831 register Py_ssize_t i;
5832 register Py_ssize_t j;
5833 Py_ssize_t len = self->length;
5834 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005835 PyObject *str;
5836
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005837 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005838 if (Py_UNICODE_MATCH(self, i, substring)) {
5839 if (maxcount-- <= 0)
5840 break;
5841 SPLIT_APPEND(self->str, j, i);
5842 i = j = i + sublen;
5843 } else
5844 i++;
5845 }
5846 if (j <= len) {
5847 SPLIT_APPEND(self->str, j, len);
5848 }
5849 return list;
5850
5851 onError:
5852 Py_DECREF(list);
5853 return NULL;
5854}
5855
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005856static
5857PyObject *rsplit_whitespace(PyUnicodeObject *self,
5858 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005859 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005860{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005861 register Py_ssize_t i;
5862 register Py_ssize_t j;
5863 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005864 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005865 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005866
5867 for (i = j = len - 1; i >= 0; ) {
5868 /* find a token */
Christian Heimes190d79e2008-01-30 11:58:22 +00005869 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005870 i--;
5871 j = i;
Christian Heimes190d79e2008-01-30 11:58:22 +00005872 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005873 i--;
5874 if (j > i) {
5875 if (maxcount-- <= 0)
5876 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005877 SPLIT_APPEND(buf, i + 1, j + 1);
5878 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005879 i--;
5880 j = i;
5881 }
5882 }
5883 if (j >= 0) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005884 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005885 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005886 if (PyList_Reverse(list) < 0)
5887 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005888 return list;
5889
5890 onError:
5891 Py_DECREF(list);
5892 return NULL;
5893}
5894
5895static
5896PyObject *rsplit_char(PyUnicodeObject *self,
5897 PyObject *list,
5898 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005899 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005900{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005901 register Py_ssize_t i;
5902 register Py_ssize_t j;
5903 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005904 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005905 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005906
5907 for (i = j = len - 1; i >= 0; ) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005908 if (buf[i] == ch) {
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005909 if (maxcount-- <= 0)
5910 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005911 SPLIT_APPEND(buf, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005912 j = i = i - 1;
5913 } else
5914 i--;
5915 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005916 if (j >= -1) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005917 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005918 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005919 if (PyList_Reverse(list) < 0)
5920 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005921 return list;
5922
5923 onError:
5924 Py_DECREF(list);
5925 return NULL;
5926}
5927
5928static
5929PyObject *rsplit_substring(PyUnicodeObject *self,
5930 PyObject *list,
5931 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005932 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005933{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005934 register Py_ssize_t i;
5935 register Py_ssize_t j;
5936 Py_ssize_t len = self->length;
5937 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005938 PyObject *str;
5939
5940 for (i = len - sublen, j = len; i >= 0; ) {
5941 if (Py_UNICODE_MATCH(self, i, substring)) {
5942 if (maxcount-- <= 0)
5943 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005944 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005945 j = i;
5946 i -= sublen;
5947 } else
5948 i--;
5949 }
5950 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005951 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005952 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005953 if (PyList_Reverse(list) < 0)
5954 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005955 return list;
5956
5957 onError:
5958 Py_DECREF(list);
5959 return NULL;
5960}
5961
Guido van Rossumd57fd912000-03-10 22:53:23 +00005962#undef SPLIT_APPEND
5963
5964static
5965PyObject *split(PyUnicodeObject *self,
5966 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005967 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005968{
5969 PyObject *list;
5970
5971 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005972 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005973
5974 list = PyList_New(0);
5975 if (!list)
5976 return NULL;
5977
5978 if (substring == NULL)
5979 return split_whitespace(self,list,maxcount);
5980
5981 else if (substring->length == 1)
5982 return split_char(self,list,substring->str[0],maxcount);
5983
5984 else if (substring->length == 0) {
5985 Py_DECREF(list);
5986 PyErr_SetString(PyExc_ValueError, "empty separator");
5987 return NULL;
5988 }
5989 else
5990 return split_substring(self,list,substring,maxcount);
5991}
5992
Tim Petersced69f82003-09-16 20:30:58 +00005993static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005994PyObject *rsplit(PyUnicodeObject *self,
5995 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005996 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005997{
5998 PyObject *list;
5999
6000 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006001 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006002
6003 list = PyList_New(0);
6004 if (!list)
6005 return NULL;
6006
6007 if (substring == NULL)
6008 return rsplit_whitespace(self,list,maxcount);
6009
6010 else if (substring->length == 1)
6011 return rsplit_char(self,list,substring->str[0],maxcount);
6012
6013 else if (substring->length == 0) {
6014 Py_DECREF(list);
6015 PyErr_SetString(PyExc_ValueError, "empty separator");
6016 return NULL;
6017 }
6018 else
6019 return rsplit_substring(self,list,substring,maxcount);
6020}
6021
6022static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023PyObject *replace(PyUnicodeObject *self,
6024 PyUnicodeObject *str1,
6025 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006026 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006027{
6028 PyUnicodeObject *u;
6029
6030 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006031 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006032
Thomas Wouters477c8d52006-05-27 19:21:47 +00006033 if (str1->length == str2->length) {
6034 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006035 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006036 if (str1->length == 1) {
6037 /* replace characters */
6038 Py_UNICODE u1, u2;
6039 if (!findchar(self->str, self->length, str1->str[0]))
6040 goto nothing;
6041 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6042 if (!u)
6043 return NULL;
6044 Py_UNICODE_COPY(u->str, self->str, self->length);
6045 u1 = str1->str[0];
6046 u2 = str2->str[0];
6047 for (i = 0; i < u->length; i++)
6048 if (u->str[i] == u1) {
6049 if (--maxcount < 0)
6050 break;
6051 u->str[i] = u2;
6052 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006053 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006054 i = fastsearch(
6055 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00006056 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006057 if (i < 0)
6058 goto nothing;
6059 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6060 if (!u)
6061 return NULL;
6062 Py_UNICODE_COPY(u->str, self->str, self->length);
6063 while (i <= self->length - str1->length)
6064 if (Py_UNICODE_MATCH(self, i, str1)) {
6065 if (--maxcount < 0)
6066 break;
6067 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6068 i += str1->length;
6069 } else
6070 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006071 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006072 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006073
6074 Py_ssize_t n, i, j, e;
6075 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006076 Py_UNICODE *p;
6077
6078 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006079 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006080 if (n > maxcount)
6081 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006082 if (n == 0)
6083 goto nothing;
6084 /* new_size = self->length + n * (str2->length - str1->length)); */
6085 delta = (str2->length - str1->length);
6086 if (delta == 0) {
6087 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006088 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006089 product = n * (str2->length - str1->length);
6090 if ((product / (str2->length - str1->length)) != n) {
6091 PyErr_SetString(PyExc_OverflowError,
6092 "replace string is too long");
6093 return NULL;
6094 }
6095 new_size = self->length + product;
6096 if (new_size < 0) {
6097 PyErr_SetString(PyExc_OverflowError,
6098 "replace string is too long");
6099 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006100 }
6101 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006102 u = _PyUnicode_New(new_size);
6103 if (!u)
6104 return NULL;
6105 i = 0;
6106 p = u->str;
6107 e = self->length - str1->length;
6108 if (str1->length > 0) {
6109 while (n-- > 0) {
6110 /* look for next match */
6111 j = i;
6112 while (j <= e) {
6113 if (Py_UNICODE_MATCH(self, j, str1))
6114 break;
6115 j++;
6116 }
6117 if (j > i) {
6118 if (j > e)
6119 break;
6120 /* copy unchanged part [i:j] */
6121 Py_UNICODE_COPY(p, self->str+i, j-i);
6122 p += j - i;
6123 }
6124 /* copy substitution string */
6125 if (str2->length > 0) {
6126 Py_UNICODE_COPY(p, str2->str, str2->length);
6127 p += str2->length;
6128 }
6129 i = j + str1->length;
6130 }
6131 if (i < self->length)
6132 /* copy tail [i:] */
6133 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6134 } else {
6135 /* interleave */
6136 while (n > 0) {
6137 Py_UNICODE_COPY(p, str2->str, str2->length);
6138 p += str2->length;
6139 if (--n <= 0)
6140 break;
6141 *p++ = self->str[i++];
6142 }
6143 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6144 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006145 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006146 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006147
6148nothing:
6149 /* nothing to replace; return original string (when possible) */
6150 if (PyUnicode_CheckExact(self)) {
6151 Py_INCREF(self);
6152 return (PyObject *) self;
6153 }
6154 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006155}
6156
6157/* --- Unicode Object Methods --------------------------------------------- */
6158
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006159PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006160"S.title() -> unicode\n\
6161\n\
6162Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006163characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006164
6165static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006166unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006167{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006168 return fixup(self, fixtitle);
6169}
6170
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006171PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006172"S.capitalize() -> unicode\n\
6173\n\
6174Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006175have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006176
6177static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006178unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006179{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006180 return fixup(self, fixcapitalize);
6181}
6182
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split self into a list of
   words, replace every list slot with its capitalized form, then join
   the list back into one string.  Returns NULL if any step fails. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined = NULL;
    Py_ssize_t pos;

    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, pos),
                            fixcapitalize);
        if (capitalized == NULL)
            goto done;          /* joined is still NULL here */
        /* PyList_SET_ITEM does not release the old entry; do it by hand. */
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, capitalized);
    }

    joined = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return joined;
}
#endif
6220
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006221/* Argument converter. Coerces to a single unicode character */
6222
6223static int
6224convert_uc(PyObject *obj, void *addr)
6225{
6226 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6227 PyObject *uniobj;
6228 Py_UNICODE *unistr;
6229
6230 uniobj = PyUnicode_FromObject(obj);
6231 if (uniobj == NULL) {
6232 PyErr_SetString(PyExc_TypeError,
6233 "The fill character cannot be converted to Unicode");
6234 return 0;
6235 }
6236 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6237 PyErr_SetString(PyExc_TypeError,
6238 "The fill character must be exactly one character long");
6239 Py_DECREF(uniobj);
6240 return 0;
6241 }
6242 unistr = PyUnicode_AS_UNICODE(uniobj);
6243 *fillcharloc = unistr[0];
6244 Py_DECREF(uniobj);
6245 return 1;
6246}
6247
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006248PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006249"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006250\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006251Return S centered in a Unicode string of length width. Padding is\n\
6252done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006253
6254static PyObject *
6255unicode_center(PyUnicodeObject *self, PyObject *args)
6256{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006257 Py_ssize_t marg, left;
6258 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006259 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006260
Thomas Woutersde017742006-02-16 19:34:37 +00006261 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006262 return NULL;
6263
Tim Peters7a29bd52001-09-12 03:03:31 +00006264 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006265 Py_INCREF(self);
6266 return (PyObject*) self;
6267 }
6268
6269 marg = width - self->length;
6270 left = marg / 2 + (marg & width & 1);
6271
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006272 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006273}
6274
Marc-André Lemburge5034372000-08-08 08:04:29 +00006275#if 0
6276
6277/* This code should go into some future Unicode collation support
6278 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006279 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006280
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006281/* speedy UTF-16 code point order comparison */
6282/* gleaned from: */
6283/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6284
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006285static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006286{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006287 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006288 0, 0, 0, 0, 0, 0, 0, 0,
6289 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006290 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006291};
6292
Guido van Rossumd57fd912000-03-10 22:53:23 +00006293static int
6294unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6295{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006296 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006297
Guido van Rossumd57fd912000-03-10 22:53:23 +00006298 Py_UNICODE *s1 = str1->str;
6299 Py_UNICODE *s2 = str2->str;
6300
6301 len1 = str1->length;
6302 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006303
Guido van Rossumd57fd912000-03-10 22:53:23 +00006304 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006305 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006306
6307 c1 = *s1++;
6308 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006309
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006310 if (c1 > (1<<11) * 26)
6311 c1 += utf16Fixup[c1>>11];
6312 if (c2 > (1<<11) * 26)
6313 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006314 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006315
6316 if (c1 != c2)
6317 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006318
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006319 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006320 }
6321
6322 return (len1 < len2) ? -1 : (len1 != len2);
6323}
6324
Marc-André Lemburge5034372000-08-08 08:04:29 +00006325#else
6326
6327static int
6328unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6329{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006330 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006331
6332 Py_UNICODE *s1 = str1->str;
6333 Py_UNICODE *s2 = str2->str;
6334
6335 len1 = str1->length;
6336 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006337
Marc-André Lemburge5034372000-08-08 08:04:29 +00006338 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006339 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006340
Fredrik Lundh45714e92001-06-26 16:39:36 +00006341 c1 = *s1++;
6342 c2 = *s2++;
6343
6344 if (c1 != c2)
6345 return (c1 < c2) ? -1 : 1;
6346
Marc-André Lemburge5034372000-08-08 08:04:29 +00006347 len1--; len2--;
6348 }
6349
6350 return (len1 < len2) ? -1 : (len1 != len2);
6351}
6352
6353#endif
6354
Guido van Rossumd57fd912000-03-10 22:53:23 +00006355int PyUnicode_Compare(PyObject *left,
6356 PyObject *right)
6357{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006358 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6359 return unicode_compare((PyUnicodeObject *)left,
6360 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006361 PyErr_Format(PyExc_TypeError,
6362 "Can't compare %.100s and %.100s",
6363 left->ob_type->tp_name,
6364 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006365 return -1;
6366}
6367
Martin v. Löwis5b222132007-06-10 09:51:05 +00006368int
6369PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6370{
6371 int i;
6372 Py_UNICODE *id;
6373 assert(PyUnicode_Check(uni));
6374 id = PyUnicode_AS_UNICODE(uni);
6375 /* Compare Unicode string and source character set string */
6376 for (i = 0; id[i] && str[i]; i++)
6377 if (id[i] != str[i])
6378 return ((int)id[i] < (int)str[i]) ? -1 : 1;
6379 if (id[i])
6380 return 1; /* uni is longer */
6381 if (str[i])
6382 return -1; /* str is longer */
6383 return 0;
6384}
6385
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006386PyObject *PyUnicode_RichCompare(PyObject *left,
6387 PyObject *right,
6388 int op)
6389{
6390 int result;
6391
6392 result = PyUnicode_Compare(left, right);
6393 if (result == -1 && PyErr_Occurred())
6394 goto onError;
6395
6396 /* Convert the return value to a Boolean */
6397 switch (op) {
6398 case Py_EQ:
6399 result = (result == 0);
6400 break;
6401 case Py_NE:
6402 result = (result != 0);
6403 break;
6404 case Py_LE:
6405 result = (result <= 0);
6406 break;
6407 case Py_GE:
6408 result = (result >= 0);
6409 break;
6410 case Py_LT:
6411 result = (result == -1);
6412 break;
6413 case Py_GT:
6414 result = (result == 1);
6415 break;
6416 }
6417 return PyBool_FromLong(result);
6418
6419 onError:
6420
6421 /* Standard case
6422
6423 Type errors mean that PyUnicode_FromObject() could not convert
6424 one of the arguments (usually the right hand side) to Unicode,
6425 ie. we can't handle the comparison request. However, it is
6426 possible that the other object knows a comparison method, which
6427 is why we return Py_NotImplemented to give the other object a
6428 chance.
6429
6430 */
6431 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6432 PyErr_Clear();
6433 Py_INCREF(Py_NotImplemented);
6434 return Py_NotImplemented;
6435 }
6436 if (op != Py_EQ && op != Py_NE)
6437 return NULL;
6438
6439 /* Equality comparison.
6440
6441 This is a special case: we silence any PyExc_UnicodeDecodeError
6442 and instead turn it into a PyErr_UnicodeWarning.
6443
6444 */
6445 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6446 return NULL;
6447 PyErr_Clear();
Skip Montanaro46fc3372007-08-12 11:44:53 +00006448 if (PyErr_WarnEx(PyExc_UnicodeWarning,
6449 (op == Py_EQ) ?
6450 "Unicode equal comparison "
6451 "failed to convert both arguments to Unicode - "
6452 "interpreting them as being unequal"
6453 :
6454 "Unicode unequal comparison "
6455 "failed to convert both arguments to Unicode - "
6456 "interpreting them as being unequal",
6457 1) < 0)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006458 return NULL;
6459 result = (op == Py_NE);
6460 return PyBool_FromLong(result);
6461}
6462
Guido van Rossum403d68b2000-03-13 15:55:09 +00006463int PyUnicode_Contains(PyObject *container,
6464 PyObject *element)
6465{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006466 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006467 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006468
6469 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006470 sub = PyUnicode_FromObject(element);
6471 if (!sub) {
Walter Dörwald26e0f512007-06-12 16:51:31 +00006472 PyErr_Format(PyExc_TypeError,
6473 "'in <string>' requires string as left operand, not %s",
6474 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006475 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006476 }
6477
Thomas Wouters477c8d52006-05-27 19:21:47 +00006478 str = PyUnicode_FromObject(container);
6479 if (!str) {
6480 Py_DECREF(sub);
6481 return -1;
6482 }
6483
6484 result = stringlib_contains_obj(str, sub);
6485
6486 Py_DECREF(str);
6487 Py_DECREF(sub);
6488
Guido van Rossum403d68b2000-03-13 15:55:09 +00006489 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006490}
6491
Guido van Rossumd57fd912000-03-10 22:53:23 +00006492/* Concat to string or Unicode object giving a new Unicode object. */
6493
6494PyObject *PyUnicode_Concat(PyObject *left,
6495 PyObject *right)
6496{
6497 PyUnicodeObject *u = NULL, *v = NULL, *w;
6498
6499 /* Coerce the two arguments */
6500 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6501 if (u == NULL)
6502 goto onError;
6503 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6504 if (v == NULL)
6505 goto onError;
6506
6507 /* Shortcuts */
6508 if (v == unicode_empty) {
6509 Py_DECREF(v);
6510 return (PyObject *)u;
6511 }
6512 if (u == unicode_empty) {
6513 Py_DECREF(u);
6514 return (PyObject *)v;
6515 }
6516
6517 /* Concat the two Unicode strings */
6518 w = _PyUnicode_New(u->length + v->length);
6519 if (w == NULL)
6520 goto onError;
6521 Py_UNICODE_COPY(w->str, u->str, u->length);
6522 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6523
6524 Py_DECREF(u);
6525 Py_DECREF(v);
6526 return (PyObject *)w;
6527
6528onError:
6529 Py_XDECREF(u);
6530 Py_XDECREF(v);
6531 return NULL;
6532}
6533
Walter Dörwald1ab83302007-05-18 17:15:44 +00006534void
6535PyUnicode_Append(PyObject **pleft, PyObject *right)
6536{
6537 PyObject *new;
6538 if (*pleft == NULL)
6539 return;
6540 if (right == NULL || !PyUnicode_Check(*pleft)) {
6541 Py_DECREF(*pleft);
6542 *pleft = NULL;
6543 return;
6544 }
6545 new = PyUnicode_Concat(*pleft, right);
6546 Py_DECREF(*pleft);
6547 *pleft = new;
6548}
6549
6550void
6551PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6552{
6553 PyUnicode_Append(pleft, right);
6554 Py_XDECREF(right);
6555}
6556
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006557PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006558"S.count(sub[, start[, end]]) -> int\n\
6559\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006560Return the number of non-overlapping occurrences of substring sub in\n\
6561Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006562interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006563
6564static PyObject *
6565unicode_count(PyUnicodeObject *self, PyObject *args)
6566{
6567 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006568 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006569 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006570 PyObject *result;
6571
Guido van Rossumb8872e62000-05-09 14:14:27 +00006572 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6573 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006574 return NULL;
6575
6576 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006577 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006578 if (substring == NULL)
6579 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006580
Thomas Wouters477c8d52006-05-27 19:21:47 +00006581 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006582
Christian Heimes217cfd12007-12-02 14:31:20 +00006583 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006584 stringlib_count(self->str + start, end - start,
6585 substring->str, substring->length)
6586 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006587
6588 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006589
Guido van Rossumd57fd912000-03-10 22:53:23 +00006590 return result;
6591}
6592
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006593PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006594"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006595\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006596Encodes S using the codec registered for encoding. encoding defaults\n\
6597to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006598handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006599a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6600'xmlcharrefreplace' as well as any other name registered with\n\
6601codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006602
6603static PyObject *
6604unicode_encode(PyUnicodeObject *self, PyObject *args)
6605{
6606 char *encoding = NULL;
6607 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006608 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00006609
Guido van Rossumd57fd912000-03-10 22:53:23 +00006610 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6611 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006612 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006613 if (v == NULL)
6614 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00006615 if (!PyString_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006616 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006617 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006618 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00006619 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006620 Py_DECREF(v);
6621 return NULL;
6622 }
6623 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006624
6625 onError:
6626 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006627}
6628
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006629PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006630"S.expandtabs([tabsize]) -> unicode\n\
6631\n\
6632Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006633If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006634
6635static PyObject*
6636unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6637{
6638 Py_UNICODE *e;
6639 Py_UNICODE *p;
6640 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006641 Py_UNICODE *qe;
6642 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006643 PyUnicodeObject *u;
6644 int tabsize = 8;
6645
6646 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6647 return NULL;
6648
Thomas Wouters7e474022000-07-16 12:04:32 +00006649 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006650 i = 0; /* chars up to and including most recent \n or \r */
6651 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6652 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006653 for (p = self->str; p < e; p++)
6654 if (*p == '\t') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006655 if (tabsize > 0) {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006656 incr = tabsize - (j % tabsize); /* cannot overflow */
6657 if (j > PY_SSIZE_T_MAX - incr)
6658 goto overflow1;
6659 j += incr;
6660 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006661 }
6662 else {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006663 if (j > PY_SSIZE_T_MAX - 1)
6664 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006665 j++;
6666 if (*p == '\n' || *p == '\r') {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006667 if (i > PY_SSIZE_T_MAX - j)
6668 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006669 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006670 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006671 }
6672 }
6673
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006674 if (i > PY_SSIZE_T_MAX - j)
6675 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006676
Guido van Rossumd57fd912000-03-10 22:53:23 +00006677 /* Second pass: create output string and fill it */
6678 u = _PyUnicode_New(i + j);
6679 if (!u)
6680 return NULL;
6681
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006682 j = 0; /* same as in first pass */
6683 q = u->str; /* next output char */
6684 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006685
6686 for (p = self->str; p < e; p++)
6687 if (*p == '\t') {
6688 if (tabsize > 0) {
6689 i = tabsize - (j % tabsize);
6690 j += i;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006691 while (i--) {
6692 if (q >= qe)
6693 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006694 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006695 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006696 }
6697 }
6698 else {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006699 if (q >= qe)
6700 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006701 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006702 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006703 if (*p == '\n' || *p == '\r')
6704 j = 0;
6705 }
6706
6707 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006708
6709 overflow2:
6710 Py_DECREF(u);
6711 overflow1:
6712 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6713 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006714}
6715
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006716PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006717"S.find(sub [,start [,end]]) -> int\n\
6718\n\
6719Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00006720such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006721arguments start and end are interpreted as in slice notation.\n\
6722\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006723Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006724
6725static PyObject *
6726unicode_find(PyUnicodeObject *self, PyObject *args)
6727{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006728 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00006729 Py_ssize_t start;
6730 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006731 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006732
Christian Heimes9cd17752007-11-18 19:35:23 +00006733 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006734 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006735
Thomas Wouters477c8d52006-05-27 19:21:47 +00006736 result = stringlib_find_slice(
6737 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6738 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6739 start, end
6740 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006741
6742 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006743
Christian Heimes217cfd12007-12-02 14:31:20 +00006744 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006745}
6746
6747static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006748unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006749{
6750 if (index < 0 || index >= self->length) {
6751 PyErr_SetString(PyExc_IndexError, "string index out of range");
6752 return NULL;
6753 }
6754
6755 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6756}
6757
Guido van Rossumc2504932007-09-18 19:42:40 +00006758/* Believe it or not, this produces the same value for ASCII strings
6759 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006760static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00006761unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006762{
Guido van Rossumc2504932007-09-18 19:42:40 +00006763 Py_ssize_t len;
6764 Py_UNICODE *p;
6765 long x;
6766
6767 if (self->hash != -1)
6768 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00006769 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00006770 p = self->str;
6771 x = *p << 7;
6772 while (--len >= 0)
6773 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00006774 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00006775 if (x == -1)
6776 x = -2;
6777 self->hash = x;
6778 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006779}
6780
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006781PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006782"S.index(sub [,start [,end]]) -> int\n\
6783\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006784Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006785
6786static PyObject *
6787unicode_index(PyUnicodeObject *self, PyObject *args)
6788{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006789 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006790 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00006791 Py_ssize_t start;
6792 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006793
Christian Heimes9cd17752007-11-18 19:35:23 +00006794 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006795 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006796
Thomas Wouters477c8d52006-05-27 19:21:47 +00006797 result = stringlib_find_slice(
6798 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6799 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6800 start, end
6801 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006802
6803 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006804
Guido van Rossumd57fd912000-03-10 22:53:23 +00006805 if (result < 0) {
6806 PyErr_SetString(PyExc_ValueError, "substring not found");
6807 return NULL;
6808 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006809
Christian Heimes217cfd12007-12-02 14:31:20 +00006810 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006811}
6812
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006813PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006814"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006815\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006816Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006817at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006818
6819static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006820unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006821{
6822 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6823 register const Py_UNICODE *e;
6824 int cased;
6825
Guido van Rossumd57fd912000-03-10 22:53:23 +00006826 /* Shortcut for single character strings */
6827 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006828 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006829
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006830 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006831 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006832 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006833
Guido van Rossumd57fd912000-03-10 22:53:23 +00006834 e = p + PyUnicode_GET_SIZE(self);
6835 cased = 0;
6836 for (; p < e; p++) {
6837 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006838
Guido van Rossumd57fd912000-03-10 22:53:23 +00006839 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006840 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006841 else if (!cased && Py_UNICODE_ISLOWER(ch))
6842 cased = 1;
6843 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006844 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006845}
6846
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006847PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006848"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006849\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006850Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006851at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006852
6853static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006854unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006855{
6856 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6857 register const Py_UNICODE *e;
6858 int cased;
6859
Guido van Rossumd57fd912000-03-10 22:53:23 +00006860 /* Shortcut for single character strings */
6861 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006862 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006863
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006864 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006865 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006866 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006867
Guido van Rossumd57fd912000-03-10 22:53:23 +00006868 e = p + PyUnicode_GET_SIZE(self);
6869 cased = 0;
6870 for (; p < e; p++) {
6871 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006872
Guido van Rossumd57fd912000-03-10 22:53:23 +00006873 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006874 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006875 else if (!cased && Py_UNICODE_ISUPPER(ch))
6876 cased = 1;
6877 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006878 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006879}
6880
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006881PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006882"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006883\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006884Return True if S is a titlecased string and there is at least one\n\
6885character in S, i.e. upper- and titlecase characters may only\n\
6886follow uncased characters and lowercase characters only cased ones.\n\
6887Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006888
6889static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006890unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006891{
6892 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6893 register const Py_UNICODE *e;
6894 int cased, previous_is_cased;
6895
Guido van Rossumd57fd912000-03-10 22:53:23 +00006896 /* Shortcut for single character strings */
6897 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006898 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6899 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006900
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006901 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006902 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006903 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006904
Guido van Rossumd57fd912000-03-10 22:53:23 +00006905 e = p + PyUnicode_GET_SIZE(self);
6906 cased = 0;
6907 previous_is_cased = 0;
6908 for (; p < e; p++) {
6909 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006910
Guido van Rossumd57fd912000-03-10 22:53:23 +00006911 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6912 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006913 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006914 previous_is_cased = 1;
6915 cased = 1;
6916 }
6917 else if (Py_UNICODE_ISLOWER(ch)) {
6918 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006919 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006920 previous_is_cased = 1;
6921 cased = 1;
6922 }
6923 else
6924 previous_is_cased = 0;
6925 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006926 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006927}
6928
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006929PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006930"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006931\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006932Return True if all characters in S are whitespace\n\
6933and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006934
6935static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006936unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006937{
6938 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6939 register const Py_UNICODE *e;
6940
Guido van Rossumd57fd912000-03-10 22:53:23 +00006941 /* Shortcut for single character strings */
6942 if (PyUnicode_GET_SIZE(self) == 1 &&
6943 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006944 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006945
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006946 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006947 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006948 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006949
Guido van Rossumd57fd912000-03-10 22:53:23 +00006950 e = p + PyUnicode_GET_SIZE(self);
6951 for (; p < e; p++) {
6952 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006953 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006954 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006955 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006956}
6957
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006958PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006959"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006960\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006961Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006962and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006963
6964static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006965unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006966{
6967 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6968 register const Py_UNICODE *e;
6969
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006970 /* Shortcut for single character strings */
6971 if (PyUnicode_GET_SIZE(self) == 1 &&
6972 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006973 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006974
6975 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006976 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006977 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006978
6979 e = p + PyUnicode_GET_SIZE(self);
6980 for (; p < e; p++) {
6981 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006982 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006983 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006984 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006985}
6986
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006987PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006988"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006989\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006990Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006991and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006992
6993static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006994unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006995{
6996 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6997 register const Py_UNICODE *e;
6998
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006999 /* Shortcut for single character strings */
7000 if (PyUnicode_GET_SIZE(self) == 1 &&
7001 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007002 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007003
7004 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007005 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007006 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007007
7008 e = p + PyUnicode_GET_SIZE(self);
7009 for (; p < e; p++) {
7010 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007011 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007012 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007013 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007014}
7015
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007016PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007017"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007018\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007019Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007020False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007021
7022static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007023unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007024{
7025 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7026 register const Py_UNICODE *e;
7027
Guido van Rossumd57fd912000-03-10 22:53:23 +00007028 /* Shortcut for single character strings */
7029 if (PyUnicode_GET_SIZE(self) == 1 &&
7030 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007031 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007032
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007033 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007034 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007035 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007036
Guido van Rossumd57fd912000-03-10 22:53:23 +00007037 e = p + PyUnicode_GET_SIZE(self);
7038 for (; p < e; p++) {
7039 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007040 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007041 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007042 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007043}
7044
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007045PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007046"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007047\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007048Return True if all characters in S are digits\n\
7049and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007050
7051static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007052unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007053{
7054 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7055 register const Py_UNICODE *e;
7056
Guido van Rossumd57fd912000-03-10 22:53:23 +00007057 /* Shortcut for single character strings */
7058 if (PyUnicode_GET_SIZE(self) == 1 &&
7059 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007060 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007061
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007062 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007063 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007064 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007065
Guido van Rossumd57fd912000-03-10 22:53:23 +00007066 e = p + PyUnicode_GET_SIZE(self);
7067 for (; p < e; p++) {
7068 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007069 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007070 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007071 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007072}
7073
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007074PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007075"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007076\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007077Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007078False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007079
7080static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007081unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007082{
7083 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7084 register const Py_UNICODE *e;
7085
Guido van Rossumd57fd912000-03-10 22:53:23 +00007086 /* Shortcut for single character strings */
7087 if (PyUnicode_GET_SIZE(self) == 1 &&
7088 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007089 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007090
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007091 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007092 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007093 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007094
Guido van Rossumd57fd912000-03-10 22:53:23 +00007095 e = p + PyUnicode_GET_SIZE(self);
7096 for (; p < e; p++) {
7097 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007098 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007099 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007100 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007101}
7102
Martin v. Löwis47383402007-08-15 07:32:56 +00007103int
7104PyUnicode_IsIdentifier(PyObject *self)
7105{
7106 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7107 register const Py_UNICODE *e;
7108
7109 /* Special case for empty strings */
7110 if (PyUnicode_GET_SIZE(self) == 0)
7111 return 0;
7112
7113 /* PEP 3131 says that the first character must be in
7114 XID_Start and subsequent characters in XID_Continue,
7115 and for the ASCII range, the 2.x rules apply (i.e
7116 start with letters and underscore, continue with
7117 letters, digits, underscore). However, given the current
7118 definition of XID_Start and XID_Continue, it is sufficient
7119 to check just for these, except that _ must be allowed
7120 as starting an identifier. */
7121 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7122 return 0;
7123
7124 e = p + PyUnicode_GET_SIZE(self);
7125 for (p++; p < e; p++) {
7126 if (!_PyUnicode_IsXidContinue(*p))
7127 return 0;
7128 }
7129 return 1;
7130}
7131
7132PyDoc_STRVAR(isidentifier__doc__,
7133"S.isidentifier() -> bool\n\
7134\n\
7135Return True if S is a valid identifier according\n\
7136to the language definition.");
7137
7138static PyObject*
7139unicode_isidentifier(PyObject *self)
7140{
7141 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7142}
7143
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007144PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007145"S.join(sequence) -> unicode\n\
7146\n\
7147Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007148sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007149
7150static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007151unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007152{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007153 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007154}
7155
Martin v. Löwis18e16552006-02-15 17:27:45 +00007156static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007157unicode_length(PyUnicodeObject *self)
7158{
7159 return self->length;
7160}
7161
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007162PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00007163"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007164\n\
7165Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007166done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007167
7168static PyObject *
7169unicode_ljust(PyUnicodeObject *self, PyObject *args)
7170{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007171 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007172 Py_UNICODE fillchar = ' ';
7173
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007174 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007175 return NULL;
7176
Tim Peters7a29bd52001-09-12 03:03:31 +00007177 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007178 Py_INCREF(self);
7179 return (PyObject*) self;
7180 }
7181
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007182 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007183}
7184
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007185PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007186"S.lower() -> unicode\n\
7187\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007188Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007189
7190static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007191unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007192{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007193 return fixup(self, fixlower);
7194}
7195
/* Selectors for which end(s) of the string get stripped */
#define LEFTSTRIP  0
#define RIGHTSTRIP 1
#define BOTHSTRIP  2

/* PyArg_ParseTuple format strings, indexed by the selectors above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for selector i: the format string past its "|O:" prefix */
#define STRIPNAME(i) (stripformat[i]+3)
7204
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007205/* externally visible for str.strip(unicode) */
7206PyObject *
7207_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7208{
7209 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007210 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007211 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007212 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7213 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007214
Thomas Wouters477c8d52006-05-27 19:21:47 +00007215 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7216
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007217 i = 0;
7218 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007219 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7220 i++;
7221 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007222 }
7223
7224 j = len;
7225 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007226 do {
7227 j--;
7228 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7229 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007230 }
7231
7232 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007233 Py_INCREF(self);
7234 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007235 }
7236 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00007237 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007238}
7239
Guido van Rossumd57fd912000-03-10 22:53:23 +00007240
7241static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007242do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007243{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007244 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007245 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007246
7247 i = 0;
7248 if (striptype != RIGHTSTRIP) {
7249 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7250 i++;
7251 }
7252 }
7253
7254 j = len;
7255 if (striptype != LEFTSTRIP) {
7256 do {
7257 j--;
7258 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7259 j++;
7260 }
7261
7262 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7263 Py_INCREF(self);
7264 return (PyObject*)self;
7265 }
7266 else
7267 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007268}
7269
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007270
7271static PyObject *
7272do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7273{
7274 PyObject *sep = NULL;
7275
7276 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7277 return NULL;
7278
7279 if (sep != NULL && sep != Py_None) {
7280 if (PyUnicode_Check(sep))
7281 return _PyUnicode_XStrip(self, striptype, sep);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007282 else {
7283 PyErr_Format(PyExc_TypeError,
7284 "%s arg must be None, unicode or str",
7285 STRIPNAME(striptype));
7286 return NULL;
7287 }
7288 }
7289
7290 return do_strip(self, striptype);
7291}
7292
7293
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007294PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007295"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007296\n\
7297Return a copy of the string S with leading and trailing\n\
7298whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007299If chars is given and not None, remove characters in chars instead.\n\
7300If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007301
7302static PyObject *
7303unicode_strip(PyUnicodeObject *self, PyObject *args)
7304{
7305 if (PyTuple_GET_SIZE(args) == 0)
7306 return do_strip(self, BOTHSTRIP); /* Common case */
7307 else
7308 return do_argstrip(self, BOTHSTRIP, args);
7309}
7310
7311
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007312PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007313"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007314\n\
7315Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007316If chars is given and not None, remove characters in chars instead.\n\
7317If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007318
7319static PyObject *
7320unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7321{
7322 if (PyTuple_GET_SIZE(args) == 0)
7323 return do_strip(self, LEFTSTRIP); /* Common case */
7324 else
7325 return do_argstrip(self, LEFTSTRIP, args);
7326}
7327
7328
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007329PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007330"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007331\n\
7332Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007333If chars is given and not None, remove characters in chars instead.\n\
7334If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007335
7336static PyObject *
7337unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7338{
7339 if (PyTuple_GET_SIZE(args) == 0)
7340 return do_strip(self, RIGHTSTRIP); /* Common case */
7341 else
7342 return do_argstrip(self, RIGHTSTRIP, args);
7343}
7344
7345
Guido van Rossumd57fd912000-03-10 22:53:23 +00007346static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007347unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007348{
7349 PyUnicodeObject *u;
7350 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007351 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007352 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007353
7354 if (len < 0)
7355 len = 0;
7356
Tim Peters7a29bd52001-09-12 03:03:31 +00007357 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007358 /* no repeat, return original string */
7359 Py_INCREF(str);
7360 return (PyObject*) str;
7361 }
Tim Peters8f422462000-09-09 06:13:41 +00007362
7363 /* ensure # of chars needed doesn't overflow int and # of bytes
7364 * needed doesn't overflow size_t
7365 */
7366 nchars = len * str->length;
7367 if (len && nchars / len != str->length) {
7368 PyErr_SetString(PyExc_OverflowError,
7369 "repeated string is too long");
7370 return NULL;
7371 }
7372 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7373 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7374 PyErr_SetString(PyExc_OverflowError,
7375 "repeated string is too long");
7376 return NULL;
7377 }
7378 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007379 if (!u)
7380 return NULL;
7381
7382 p = u->str;
7383
Thomas Wouters477c8d52006-05-27 19:21:47 +00007384 if (str->length == 1 && len > 0) {
7385 Py_UNICODE_FILL(p, str->str[0], len);
7386 } else {
7387 Py_ssize_t done = 0; /* number of characters copied this far */
7388 if (done < nchars) {
7389 Py_UNICODE_COPY(p, str->str, str->length);
7390 done = str->length;
7391 }
7392 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00007393 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007394 Py_UNICODE_COPY(p+done, p, n);
7395 done += n;
7396 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007397 }
7398
7399 return (PyObject*) u;
7400}
7401
7402PyObject *PyUnicode_Replace(PyObject *obj,
7403 PyObject *subobj,
7404 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007405 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007406{
7407 PyObject *self;
7408 PyObject *str1;
7409 PyObject *str2;
7410 PyObject *result;
7411
7412 self = PyUnicode_FromObject(obj);
7413 if (self == NULL)
7414 return NULL;
7415 str1 = PyUnicode_FromObject(subobj);
7416 if (str1 == NULL) {
7417 Py_DECREF(self);
7418 return NULL;
7419 }
7420 str2 = PyUnicode_FromObject(replobj);
7421 if (str2 == NULL) {
7422 Py_DECREF(self);
7423 Py_DECREF(str1);
7424 return NULL;
7425 }
Tim Petersced69f82003-09-16 20:30:58 +00007426 result = replace((PyUnicodeObject *)self,
7427 (PyUnicodeObject *)str1,
7428 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007429 maxcount);
7430 Py_DECREF(self);
7431 Py_DECREF(str1);
7432 Py_DECREF(str2);
7433 return result;
7434}
7435
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007436PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007437"S.replace (old, new[, maxsplit]) -> unicode\n\
7438\n\
7439Return a copy of S with all occurrences of substring\n\
7440old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007441given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007442
7443static PyObject*
7444unicode_replace(PyUnicodeObject *self, PyObject *args)
7445{
7446 PyUnicodeObject *str1;
7447 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007448 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007449 PyObject *result;
7450
Martin v. Löwis18e16552006-02-15 17:27:45 +00007451 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007452 return NULL;
7453 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7454 if (str1 == NULL)
7455 return NULL;
7456 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007457 if (str2 == NULL) {
7458 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007459 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007460 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007461
7462 result = replace(self, str1, str2, maxcount);
7463
7464 Py_DECREF(str1);
7465 Py_DECREF(str2);
7466 return result;
7467}
7468
7469static
7470PyObject *unicode_repr(PyObject *unicode)
7471{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007472 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007473 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007474 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7475 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7476
7477 /* XXX(nnorwitz): rather than over-allocating, it would be
7478 better to choose a different scheme. Perhaps scan the
7479 first N-chars of the string and allocate based on that size.
7480 */
7481 /* Initial allocation is based on the longest-possible unichr
7482 escape.
7483
7484 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7485 unichr, so in this case it's the longest unichr escape. In
7486 narrow (UTF-16) builds this is five chars per source unichr
7487 since there are two unichrs in the surrogate pair, so in narrow
7488 (UTF-16) builds it's not the longest unichr escape.
7489
7490 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7491 so in the narrow (UTF-16) build case it's the longest unichr
7492 escape.
7493 */
7494
Walter Dörwald1ab83302007-05-18 17:15:44 +00007495 repr = PyUnicode_FromUnicode(NULL,
Walter Dörwald79e913e2007-05-12 11:08:06 +00007496 2 /* quotes */
7497#ifdef Py_UNICODE_WIDE
7498 + 10*size
7499#else
7500 + 6*size
7501#endif
7502 + 1);
7503 if (repr == NULL)
7504 return NULL;
7505
Walter Dörwald1ab83302007-05-18 17:15:44 +00007506 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007507
7508 /* Add quote */
7509 *p++ = (findchar(s, size, '\'') &&
7510 !findchar(s, size, '"')) ? '"' : '\'';
7511 while (size-- > 0) {
7512 Py_UNICODE ch = *s++;
7513
7514 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007515 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007516 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007517 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007518 continue;
7519 }
7520
7521#ifdef Py_UNICODE_WIDE
7522 /* Map 21-bit characters to '\U00xxxxxx' */
7523 else if (ch >= 0x10000) {
7524 *p++ = '\\';
7525 *p++ = 'U';
7526 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
7527 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
7528 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
7529 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
7530 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
7531 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
7532 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
7533 *p++ = hexdigits[ch & 0x0000000F];
7534 continue;
7535 }
7536#else
7537 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
7538 else if (ch >= 0xD800 && ch < 0xDC00) {
7539 Py_UNICODE ch2;
7540 Py_UCS4 ucs;
7541
7542 ch2 = *s++;
7543 size--;
7544 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
7545 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
7546 *p++ = '\\';
7547 *p++ = 'U';
7548 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7549 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7550 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7551 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7552 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7553 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7554 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7555 *p++ = hexdigits[ucs & 0x0000000F];
7556 continue;
7557 }
7558 /* Fall through: isolated surrogates are copied as-is */
7559 s--;
7560 size++;
7561 }
7562#endif
7563
7564 /* Map 16-bit characters to '\uxxxx' */
7565 if (ch >= 256) {
7566 *p++ = '\\';
7567 *p++ = 'u';
7568 *p++ = hexdigits[(ch >> 12) & 0x000F];
7569 *p++ = hexdigits[(ch >> 8) & 0x000F];
7570 *p++ = hexdigits[(ch >> 4) & 0x000F];
7571 *p++ = hexdigits[ch & 0x000F];
7572 }
7573
7574 /* Map special whitespace to '\t', \n', '\r' */
7575 else if (ch == '\t') {
7576 *p++ = '\\';
7577 *p++ = 't';
7578 }
7579 else if (ch == '\n') {
7580 *p++ = '\\';
7581 *p++ = 'n';
7582 }
7583 else if (ch == '\r') {
7584 *p++ = '\\';
7585 *p++ = 'r';
7586 }
7587
7588 /* Map non-printable US ASCII to '\xhh' */
7589 else if (ch < ' ' || ch >= 0x7F) {
7590 *p++ = '\\';
7591 *p++ = 'x';
7592 *p++ = hexdigits[(ch >> 4) & 0x000F];
7593 *p++ = hexdigits[ch & 0x000F];
7594 }
7595
7596 /* Copy everything else as-is */
7597 else
7598 *p++ = (char) ch;
7599 }
7600 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007601 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00007602
7603 *p = '\0';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007604 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00007605 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007606}
7607
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007608PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007609"S.rfind(sub [,start [,end]]) -> int\n\
7610\n\
7611Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007612such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007613arguments start and end are interpreted as in slice notation.\n\
7614\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007615Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007616
7617static PyObject *
7618unicode_rfind(PyUnicodeObject *self, PyObject *args)
7619{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007620 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007621 Py_ssize_t start;
7622 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007623 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007624
Christian Heimes9cd17752007-11-18 19:35:23 +00007625 if (!_ParseTupleFinds(args, &substring, &start, &end))
7626 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007627
Thomas Wouters477c8d52006-05-27 19:21:47 +00007628 result = stringlib_rfind_slice(
7629 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7630 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7631 start, end
7632 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007633
7634 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007635
Christian Heimes217cfd12007-12-02 14:31:20 +00007636 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007637}
7638
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007639PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007640"S.rindex(sub [,start [,end]]) -> int\n\
7641\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007642Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007643
7644static PyObject *
7645unicode_rindex(PyUnicodeObject *self, PyObject *args)
7646{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007647 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007648 Py_ssize_t start;
7649 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007650 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007651
Christian Heimes9cd17752007-11-18 19:35:23 +00007652 if (!_ParseTupleFinds(args, &substring, &start, &end))
7653 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007654
Thomas Wouters477c8d52006-05-27 19:21:47 +00007655 result = stringlib_rfind_slice(
7656 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7657 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7658 start, end
7659 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007660
7661 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007662
Guido van Rossumd57fd912000-03-10 22:53:23 +00007663 if (result < 0) {
7664 PyErr_SetString(PyExc_ValueError, "substring not found");
7665 return NULL;
7666 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007667 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007668}
7669
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007670PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007671"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007672\n\
7673Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007674done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007675
7676static PyObject *
7677unicode_rjust(PyUnicodeObject *self, PyObject *args)
7678{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007679 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007680 Py_UNICODE fillchar = ' ';
7681
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007682 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007683 return NULL;
7684
Tim Peters7a29bd52001-09-12 03:03:31 +00007685 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007686 Py_INCREF(self);
7687 return (PyObject*) self;
7688 }
7689
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007690 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007691}
7692
Guido van Rossumd57fd912000-03-10 22:53:23 +00007693PyObject *PyUnicode_Split(PyObject *s,
7694 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007695 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007696{
7697 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007698
Guido van Rossumd57fd912000-03-10 22:53:23 +00007699 s = PyUnicode_FromObject(s);
7700 if (s == NULL)
7701 return NULL;
7702 if (sep != NULL) {
7703 sep = PyUnicode_FromObject(sep);
7704 if (sep == NULL) {
7705 Py_DECREF(s);
7706 return NULL;
7707 }
7708 }
7709
7710 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7711
7712 Py_DECREF(s);
7713 Py_XDECREF(sep);
7714 return result;
7715}
7716
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007717PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007718"S.split([sep [,maxsplit]]) -> list of strings\n\
7719\n\
7720Return a list of the words in S, using sep as the\n\
7721delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00007722splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00007723any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007724
7725static PyObject*
7726unicode_split(PyUnicodeObject *self, PyObject *args)
7727{
7728 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007729 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007730
Martin v. Löwis18e16552006-02-15 17:27:45 +00007731 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007732 return NULL;
7733
7734 if (substring == Py_None)
7735 return split(self, NULL, maxcount);
7736 else if (PyUnicode_Check(substring))
7737 return split(self, (PyUnicodeObject *)substring, maxcount);
7738 else
7739 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7740}
7741
Thomas Wouters477c8d52006-05-27 19:21:47 +00007742PyObject *
7743PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7744{
7745 PyObject* str_obj;
7746 PyObject* sep_obj;
7747 PyObject* out;
7748
7749 str_obj = PyUnicode_FromObject(str_in);
7750 if (!str_obj)
7751 return NULL;
7752 sep_obj = PyUnicode_FromObject(sep_in);
7753 if (!sep_obj) {
7754 Py_DECREF(str_obj);
7755 return NULL;
7756 }
7757
7758 out = stringlib_partition(
7759 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7760 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7761 );
7762
7763 Py_DECREF(sep_obj);
7764 Py_DECREF(str_obj);
7765
7766 return out;
7767}
7768
7769
7770PyObject *
7771PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7772{
7773 PyObject* str_obj;
7774 PyObject* sep_obj;
7775 PyObject* out;
7776
7777 str_obj = PyUnicode_FromObject(str_in);
7778 if (!str_obj)
7779 return NULL;
7780 sep_obj = PyUnicode_FromObject(sep_in);
7781 if (!sep_obj) {
7782 Py_DECREF(str_obj);
7783 return NULL;
7784 }
7785
7786 out = stringlib_rpartition(
7787 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7788 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7789 );
7790
7791 Py_DECREF(sep_obj);
7792 Py_DECREF(str_obj);
7793
7794 return out;
7795}
7796
7797PyDoc_STRVAR(partition__doc__,
7798"S.partition(sep) -> (head, sep, tail)\n\
7799\n\
7800Searches for the separator sep in S, and returns the part before it,\n\
7801the separator itself, and the part after it. If the separator is not\n\
7802found, returns S and two empty strings.");
7803
7804static PyObject*
7805unicode_partition(PyUnicodeObject *self, PyObject *separator)
7806{
7807 return PyUnicode_Partition((PyObject *)self, separator);
7808}
7809
7810PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007811"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007812\n\
7813Searches for the separator sep in S, starting at the end of S, and returns\n\
7814the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00007815separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007816
7817static PyObject*
7818unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7819{
7820 return PyUnicode_RPartition((PyObject *)self, separator);
7821}
7822
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007823PyObject *PyUnicode_RSplit(PyObject *s,
7824 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007825 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007826{
7827 PyObject *result;
7828
7829 s = PyUnicode_FromObject(s);
7830 if (s == NULL)
7831 return NULL;
7832 if (sep != NULL) {
7833 sep = PyUnicode_FromObject(sep);
7834 if (sep == NULL) {
7835 Py_DECREF(s);
7836 return NULL;
7837 }
7838 }
7839
7840 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7841
7842 Py_DECREF(s);
7843 Py_XDECREF(sep);
7844 return result;
7845}
7846
7847PyDoc_STRVAR(rsplit__doc__,
7848"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7849\n\
7850Return a list of the words in S, using sep as the\n\
7851delimiter string, starting at the end of the string and\n\
7852working to the front. If maxsplit is given, at most maxsplit\n\
7853splits are done. If sep is not specified, any whitespace string\n\
7854is a separator.");
7855
7856static PyObject*
7857unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7858{
7859 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007860 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007861
Martin v. Löwis18e16552006-02-15 17:27:45 +00007862 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007863 return NULL;
7864
7865 if (substring == Py_None)
7866 return rsplit(self, NULL, maxcount);
7867 else if (PyUnicode_Check(substring))
7868 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7869 else
7870 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7871}
7872
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007873PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007874"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007875\n\
7876Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007877Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007878is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007879
7880static PyObject*
7881unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7882{
Guido van Rossum86662912000-04-11 15:38:46 +00007883 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007884
Guido van Rossum86662912000-04-11 15:38:46 +00007885 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007886 return NULL;
7887
Guido van Rossum86662912000-04-11 15:38:46 +00007888 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007889}
7890
7891static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007892PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007893{
Walter Dörwald346737f2007-05-31 10:44:43 +00007894 if (PyUnicode_CheckExact(self)) {
7895 Py_INCREF(self);
7896 return self;
7897 } else
7898 /* Subtype -- return genuine unicode string with the same value. */
7899 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
7900 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007901}
7902
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007903PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007904"S.swapcase() -> unicode\n\
7905\n\
7906Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007907and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007908
7909static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007910unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007911{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007912 return fixup(self, fixswapcase);
7913}
7914
Georg Brandlceee0772007-11-27 23:48:05 +00007915PyDoc_STRVAR(maketrans__doc__,
7916"str.maketrans(x[, y[, z]]) -> dict (static method)\n\
7917\n\
7918Return a translation table usable for str.translate().\n\
7919If there is only one argument, it must be a dictionary mapping Unicode\n\
7920ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
7921Character keys will then be converted to ordinals.\n\
7922If there are two arguments, they must be strings of equal length, and\n\
7923in the resulting dictionary, each character in x will be mapped to the\n\
7924character at the same position in y. If there is a third argument, it\n\
7925must be a string, whose characters will be mapped to None in the result.");
7926
7927static PyObject*
7928unicode_maketrans(PyUnicodeObject *null, PyObject *args)
7929{
7930 PyObject *x, *y = NULL, *z = NULL;
7931 PyObject *new = NULL, *key, *value;
7932 Py_ssize_t i = 0;
7933 int res;
7934
7935 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
7936 return NULL;
7937 new = PyDict_New();
7938 if (!new)
7939 return NULL;
7940 if (y != NULL) {
7941 /* x must be a string too, of equal length */
7942 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
7943 if (!PyUnicode_Check(x)) {
7944 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
7945 "be a string if there is a second argument");
7946 goto err;
7947 }
7948 if (PyUnicode_GET_SIZE(x) != ylen) {
7949 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
7950 "arguments must have equal length");
7951 goto err;
7952 }
7953 /* create entries for translating chars in x to those in y */
7954 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00007955 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
7956 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00007957 if (!key || !value)
7958 goto err;
7959 res = PyDict_SetItem(new, key, value);
7960 Py_DECREF(key);
7961 Py_DECREF(value);
7962 if (res < 0)
7963 goto err;
7964 }
7965 /* create entries for deleting chars in z */
7966 if (z != NULL) {
7967 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00007968 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00007969 if (!key)
7970 goto err;
7971 res = PyDict_SetItem(new, key, Py_None);
7972 Py_DECREF(key);
7973 if (res < 0)
7974 goto err;
7975 }
7976 }
7977 } else {
7978 /* x must be a dict */
7979 if (!PyDict_Check(x)) {
7980 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
7981 "to maketrans it must be a dict");
7982 goto err;
7983 }
7984 /* copy entries into the new dict, converting string keys to int keys */
7985 while (PyDict_Next(x, &i, &key, &value)) {
7986 if (PyUnicode_Check(key)) {
7987 /* convert string keys to integer keys */
7988 PyObject *newkey;
7989 if (PyUnicode_GET_SIZE(key) != 1) {
7990 PyErr_SetString(PyExc_ValueError, "string keys in translate "
7991 "table must be of length 1");
7992 goto err;
7993 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007994 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00007995 if (!newkey)
7996 goto err;
7997 res = PyDict_SetItem(new, newkey, value);
7998 Py_DECREF(newkey);
7999 if (res < 0)
8000 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008001 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008002 /* just keep integer keys */
8003 if (PyDict_SetItem(new, key, value) < 0)
8004 goto err;
8005 } else {
8006 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8007 "be strings or integers");
8008 goto err;
8009 }
8010 }
8011 }
8012 return new;
8013 err:
8014 Py_DECREF(new);
8015 return NULL;
8016}
8017
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008018PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008019"S.translate(table) -> unicode\n\
8020\n\
8021Return a copy of the string S, where all characters have been mapped\n\
8022through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008023Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
8024Unmapped characters are left untouched. Characters mapped to None\n\
8025are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008026
8027static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008028unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008029{
Georg Brandlceee0772007-11-27 23:48:05 +00008030 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008031}
8032
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008033PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008034"S.upper() -> unicode\n\
8035\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008036Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008037
8038static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008039unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008040{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008041 return fixup(self, fixupper);
8042}
8043
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008044PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008045"S.zfill(width) -> unicode\n\
8046\n\
8047Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008048of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008049
8050static PyObject *
8051unicode_zfill(PyUnicodeObject *self, PyObject *args)
8052{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008053 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008054 PyUnicodeObject *u;
8055
Martin v. Löwis18e16552006-02-15 17:27:45 +00008056 Py_ssize_t width;
8057 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008058 return NULL;
8059
8060 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008061 if (PyUnicode_CheckExact(self)) {
8062 Py_INCREF(self);
8063 return (PyObject*) self;
8064 }
8065 else
8066 return PyUnicode_FromUnicode(
8067 PyUnicode_AS_UNICODE(self),
8068 PyUnicode_GET_SIZE(self)
8069 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008070 }
8071
8072 fill = width - self->length;
8073
8074 u = pad(self, fill, 0, '0');
8075
Walter Dörwald068325e2002-04-15 13:36:47 +00008076 if (u == NULL)
8077 return NULL;
8078
Guido van Rossumd57fd912000-03-10 22:53:23 +00008079 if (u->str[fill] == '+' || u->str[fill] == '-') {
8080 /* move sign to beginning of string */
8081 u->str[0] = u->str[fill];
8082 u->str[fill] = '0';
8083 }
8084
8085 return (PyObject*) u;
8086}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008087
#if 0
/* Compiled out.  Returns the current value of numfree as a Python int. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyLong_FromLong(count);
}
#endif
8095
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008096PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00008097"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008098\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008099Return True if S starts with the specified prefix, False otherwise.\n\
8100With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008101With optional end, stop comparing S at that position.\n\
8102prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008103
8104static PyObject *
8105unicode_startswith(PyUnicodeObject *self,
8106 PyObject *args)
8107{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008108 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008109 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008110 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008111 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008112 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008113
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008114 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00008115 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008116 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008117 if (PyTuple_Check(subobj)) {
8118 Py_ssize_t i;
8119 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8120 substring = (PyUnicodeObject *)PyUnicode_FromObject(
8121 PyTuple_GET_ITEM(subobj, i));
8122 if (substring == NULL)
8123 return NULL;
8124 result = tailmatch(self, substring, start, end, -1);
8125 Py_DECREF(substring);
8126 if (result) {
8127 Py_RETURN_TRUE;
8128 }
8129 }
8130 /* nothing matched */
8131 Py_RETURN_FALSE;
8132 }
8133 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008134 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008135 return NULL;
8136 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008137 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008138 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008139}
8140
8141
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008142PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00008143"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008144\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008145Return True if S ends with the specified suffix, False otherwise.\n\
8146With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008147With optional end, stop comparing S at that position.\n\
8148suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008149
8150static PyObject *
8151unicode_endswith(PyUnicodeObject *self,
8152 PyObject *args)
8153{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008154 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008155 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008156 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008157 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008158 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008159
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008160 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
8161 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008162 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008163 if (PyTuple_Check(subobj)) {
8164 Py_ssize_t i;
8165 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8166 substring = (PyUnicodeObject *)PyUnicode_FromObject(
8167 PyTuple_GET_ITEM(subobj, i));
8168 if (substring == NULL)
8169 return NULL;
8170 result = tailmatch(self, substring, start, end, +1);
8171 Py_DECREF(substring);
8172 if (result) {
8173 Py_RETURN_TRUE;
8174 }
8175 }
8176 Py_RETURN_FALSE;
8177 }
8178 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008179 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008180 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008181
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008182 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008183 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008184 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008185}
8186
Eric Smith8c663262007-08-25 02:26:07 +00008187#include "stringlib/string_format.h"
8188
8189PyDoc_STRVAR(format__doc__,
8190"S.format(*args, **kwargs) -> unicode\n\
8191\n\
8192");
8193
Eric Smith8c663262007-08-25 02:26:07 +00008194PyDoc_STRVAR(p_format__doc__,
8195"S.__format__(format_spec) -> unicode\n\
8196\n\
8197");
8198
8199static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008200unicode_getnewargs(PyUnicodeObject *v)
8201{
8202 return Py_BuildValue("(u#)", v->str, v->length);
8203}
8204
8205
Guido van Rossumd57fd912000-03-10 22:53:23 +00008206static PyMethodDef unicode_methods[] = {
8207
8208 /* Order is according to common usage: often used methods should
8209 appear first, since lookup is done sequentially. */
8210
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008211 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
8212 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8213 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008214 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008215 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8216 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8217 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8218 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8219 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8220 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8221 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008222 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008223 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8224 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8225 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008226 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008227 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8228 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8229 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008230 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008231 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008232 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008233 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008234 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8235 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8236 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8237 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8238 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8239 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8240 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8241 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8242 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8243 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8244 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8245 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8246 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8247 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008248 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008249 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008250 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
8251 {"__format__", (PyCFunction) unicode_unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008252 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8253 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Georg Brandlceee0772007-11-27 23:48:05 +00008254 {"maketrans", (PyCFunction) unicode_maketrans,
8255 METH_VARARGS | METH_STATIC, maketrans__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008256#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008257 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008258#endif
8259
8260#if 0
8261 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008262 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008263#endif
8264
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008265 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008266 {NULL, NULL}
8267};
8268
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008269static PyObject *
8270unicode_mod(PyObject *v, PyObject *w)
8271{
8272 if (!PyUnicode_Check(v)) {
8273 Py_INCREF(Py_NotImplemented);
8274 return Py_NotImplemented;
8275 }
8276 return PyUnicode_Format(v, w);
8277}
8278
8279static PyNumberMethods unicode_as_number = {
8280 0, /*nb_add*/
8281 0, /*nb_subtract*/
8282 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008283 unicode_mod, /*nb_remainder*/
8284};
8285
Guido van Rossumd57fd912000-03-10 22:53:23 +00008286static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008287 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008288 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008289 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8290 (ssizeargfunc) unicode_getitem, /* sq_item */
Thomas Woutersd2cf20e2007-08-30 22:57:53 +00008291 0, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008292 0, /* sq_ass_item */
8293 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008294 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008295};
8296
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008297static PyObject*
8298unicode_subscript(PyUnicodeObject* self, PyObject* item)
8299{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008300 if (PyIndex_Check(item)) {
8301 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008302 if (i == -1 && PyErr_Occurred())
8303 return NULL;
8304 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008305 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008306 return unicode_getitem(self, i);
8307 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008308 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008309 Py_UNICODE* source_buf;
8310 Py_UNICODE* result_buf;
8311 PyObject* result;
8312
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008313 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008314 &start, &stop, &step, &slicelength) < 0) {
8315 return NULL;
8316 }
8317
8318 if (slicelength <= 0) {
8319 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008320 } else if (start == 0 && step == 1 && slicelength == self->length &&
8321 PyUnicode_CheckExact(self)) {
8322 Py_INCREF(self);
8323 return (PyObject *)self;
8324 } else if (step == 1) {
8325 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008326 } else {
8327 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00008328 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8329 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008330
8331 if (result_buf == NULL)
8332 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008333
8334 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8335 result_buf[i] = source_buf[cur];
8336 }
Tim Petersced69f82003-09-16 20:30:58 +00008337
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008338 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00008339 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008340 return result;
8341 }
8342 } else {
8343 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8344 return NULL;
8345 }
8346}
8347
8348static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008349 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008350 (binaryfunc)unicode_subscript, /* mp_subscript */
8351 (objobjargproc)0, /* mp_ass_subscript */
8352};
8353
Guido van Rossumd57fd912000-03-10 22:53:23 +00008354
/* Helpers for PyUnicode_Format() */
8356
8357static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008358getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008359{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008360 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008361 if (argidx < arglen) {
8362 (*p_argidx)++;
8363 if (arglen < 0)
8364 return args;
8365 else
8366 return PyTuple_GetItem(args, argidx);
8367 }
8368 PyErr_SetString(PyExc_TypeError,
8369 "not enough arguments for format string");
8370 return NULL;
8371}
8372
Martin v. Löwis18e16552006-02-15 17:27:45 +00008373static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008374strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008375{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008376 register Py_ssize_t i;
8377 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008378 for (i = len - 1; i >= 0; i--)
8379 buffer[i] = (Py_UNICODE) charbuffer[i];
8380
Guido van Rossumd57fd912000-03-10 22:53:23 +00008381 return len;
8382}
8383
Neal Norwitzfc76d632006-01-10 06:03:13 +00008384static int
8385doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8386{
Tim Peters15231542006-02-16 01:08:01 +00008387 Py_ssize_t result;
8388
Neal Norwitzfc76d632006-01-10 06:03:13 +00008389 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008390 result = strtounicode(buffer, (char *)buffer);
8391 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008392}
8393
#if 0
/* Currently compiled out.  Same scheme as doubletounicode(), for a C
   long: format with PyOS_snprintf() into the start of buffer, then
   widen the bytes in place.  Returns the number of characters. */
static int
longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
{
    char *bytes = (char *)buffer;
    Py_ssize_t nchars;

    PyOS_snprintf(bytes, len, format, x);
    nchars = strtounicode(buffer, bytes);
    return Py_SAFE_DOWNCAST(nchars, Py_ssize_t, int);
}
#endif
Neal Norwitzfc76d632006-01-10 06:03:13 +00008405
/* XXX To save some code duplication, formatfloat/long/int could have been
   shared with stringobject.c, converting from 8-bit to Unicode after the
   formatting is done. */
8409
Guido van Rossumd57fd912000-03-10 22:53:23 +00008410static int
8411formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008412 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008413 int flags,
8414 int prec,
8415 int type,
8416 PyObject *v)
8417{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008418 /* fmt = '%#.' + `prec` + `type`
8419 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00008420 char fmt[20];
8421 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008422
Guido van Rossumd57fd912000-03-10 22:53:23 +00008423 x = PyFloat_AsDouble(v);
8424 if (x == -1.0 && PyErr_Occurred())
8425 return -1;
8426 if (prec < 0)
8427 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008428 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8429 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008430 /* Worst case length calc to ensure no buffer overrun:
8431
8432 'g' formats:
8433 fmt = %#.<prec>g
8434 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8435 for any double rep.)
8436 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8437
8438 'f' formats:
8439 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8440 len = 1 + 50 + 1 + prec = 52 + prec
8441
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008442 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008443 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008444
8445 */
Guido van Rossumb5a755e2007-07-18 18:15:48 +00008446 if (((type == 'g' || type == 'G') &&
8447 buflen <= (size_t)10 + (size_t)prec) ||
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008448 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008449 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008450 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008451 return -1;
8452 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008453 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8454 (flags&F_ALT) ? "#" : "",
8455 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008456 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008457}
8458
Tim Peters38fd5b62000-09-21 05:43:11 +00008459static PyObject*
8460formatlong(PyObject *val, int flags, int prec, int type)
8461{
8462 char *buf;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008463 int len;
Tim Peters38fd5b62000-09-21 05:43:11 +00008464 PyObject *str; /* temporary string object. */
Walter Dörwald63a28be2007-06-20 15:11:12 +00008465 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008466
8467 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8468 if (!str)
8469 return NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008470 result = PyUnicode_FromStringAndSize(buf, len);
Tim Peters38fd5b62000-09-21 05:43:11 +00008471 Py_DECREF(str);
Walter Dörwald63a28be2007-06-20 15:11:12 +00008472 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008473}
8474
#if 0
/* Currently compiled out.  Format the integer object v (which must fit
   a C long) into buf using a printf-style conversion built from flags,
   prec and type.  Returns the number of characters written or -1 with
   an exception set. */
static int
formatint(Py_UNICODE *buf,
          size_t buflen,
          int flags,
          int prec,
          int type,
          PyObject *v)
{
    /* fmt = '%#.' + `prec` + 'l' + `type`
     * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
     *                     + 1 + 1
     *                   = 24
     */
    char fmt[64];       /* plenty big enough! */
    const char *sign = "";
    int radix_conv;
    long x;

    x = PyLong_AsLong(v);
    if (x == -1 && PyErr_Occurred())
        return -1;
    if (x < 0 && type == 'u')
        type = 'd';

    /* For %x, %X and %o the minus sign is emitted by hand and the
       magnitude is formatted separately. */
    radix_conv = (type == 'x' || type == 'X' || type == 'o');
    if (x < 0 && radix_conv)
        sign = "-";
    if (prec < 0)
        prec = 1;

    /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
     * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
     */
    if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
        PyErr_SetString(PyExc_OverflowError,
                        "formatted integer is too long (precision too large?)");
        return -1;
    }

    if ((flags & F_ALT) && radix_conv) {
        /* When converting under %#o, %#x or %#X, there are a number
         * of issues that cause pain:
         * - for %#o, we want a different base marker than C
         * - when 0 is being converted, the C standard leaves off
         *   the '0x' or '0X', which is inconsistent with other
         *   %#x/%#X conversions and inconsistent with Python's
         *   hex() function
         * - there are platforms that violate the standard and
         *   convert 0 with the '0x' or '0X'
         *   (Metrowerks, Compaq Tru64)
         * - there are platforms that give '0x' when converting
         *   under %#X, but convert 0 in accordance with the
         *   standard (OS/2 EMX)
         *
         * We can achieve the desired consistency by inserting our
         * own '0x' or '0X' prefix, and substituting %x/%X in place
         * of %#x/%#X.
         *
         * Note that this is the same approach as used in
         * formatint() in stringobject.c
         */
        PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
                      sign, type, prec, type);
    }
    else {
        PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
                      sign, (flags&F_ALT) ? "#" : "",
                      prec, type);
    }

    return longtounicode(buf, buflen, fmt, sign[0] ? -x : x);
}
#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008552
8553static int
8554formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008555 size_t buflen,
8556 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008557{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008558 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008559 if (PyUnicode_Check(v)) {
8560 if (PyUnicode_GET_SIZE(v) != 1)
8561 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008562 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008563 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008564 else {
8565 /* Integer input truncated to a character */
8566 long x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008567 x = PyLong_AsLong(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008568 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008569 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008570#ifdef Py_UNICODE_WIDE
8571 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008572 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008573 "%c arg not in range(0x110000) "
8574 "(wide Python build)");
8575 return -1;
8576 }
8577#else
8578 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008579 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008580 "%c arg not in range(0x10000) "
8581 "(narrow Python build)");
8582 return -1;
8583 }
8584#endif
8585 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008586 }
8587 buf[1] = '\0';
8588 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008589
8590 onError:
8591 PyErr_SetString(PyExc_TypeError,
8592 "%c requires int or char");
8593 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008594}
8595
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so that the macro behaves as a
   single operand in any expression (e.g. "sizeof FORMATBUFLEN").
*/
#define FORMATBUFLEN ((size_t)120)
8605
Guido van Rossumd57fd912000-03-10 22:53:23 +00008606PyObject *PyUnicode_Format(PyObject *format,
8607 PyObject *args)
8608{
8609 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008610 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008611 int args_owned = 0;
8612 PyUnicodeObject *result = NULL;
8613 PyObject *dict = NULL;
8614 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008615
Guido van Rossumd57fd912000-03-10 22:53:23 +00008616 if (format == NULL || args == NULL) {
8617 PyErr_BadInternalCall();
8618 return NULL;
8619 }
8620 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008621 if (uformat == NULL)
8622 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008623 fmt = PyUnicode_AS_UNICODE(uformat);
8624 fmtcnt = PyUnicode_GET_SIZE(uformat);
8625
8626 reslen = rescnt = fmtcnt + 100;
8627 result = _PyUnicode_New(reslen);
8628 if (result == NULL)
8629 goto onError;
8630 res = PyUnicode_AS_UNICODE(result);
8631
8632 if (PyTuple_Check(args)) {
8633 arglen = PyTuple_Size(args);
8634 argidx = 0;
8635 }
8636 else {
8637 arglen = -1;
8638 argidx = -2;
8639 }
Christian Heimes90aa7642007-12-19 02:45:37 +00008640 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00008641 !PyUnicode_Check(args))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008642 dict = args;
8643
8644 while (--fmtcnt >= 0) {
8645 if (*fmt != '%') {
8646 if (--rescnt < 0) {
8647 rescnt = fmtcnt + 100;
8648 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008649 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008650 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008651 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8652 --rescnt;
8653 }
8654 *res++ = *fmt++;
8655 }
8656 else {
8657 /* Got a format specifier */
8658 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008659 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008660 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008661 Py_UNICODE c = '\0';
8662 Py_UNICODE fill;
Christian Heimesa612dc02008-02-24 13:08:18 +00008663 int isnumok;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008664 PyObject *v = NULL;
8665 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008666 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008667 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008668 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008669 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008670
8671 fmt++;
8672 if (*fmt == '(') {
8673 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008674 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008675 PyObject *key;
8676 int pcount = 1;
8677
8678 if (dict == NULL) {
8679 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008680 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008681 goto onError;
8682 }
8683 ++fmt;
8684 --fmtcnt;
8685 keystart = fmt;
8686 /* Skip over balanced parentheses */
8687 while (pcount > 0 && --fmtcnt >= 0) {
8688 if (*fmt == ')')
8689 --pcount;
8690 else if (*fmt == '(')
8691 ++pcount;
8692 fmt++;
8693 }
8694 keylen = fmt - keystart - 1;
8695 if (fmtcnt < 0 || pcount > 0) {
8696 PyErr_SetString(PyExc_ValueError,
8697 "incomplete format key");
8698 goto onError;
8699 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008700#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008701 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008702 then looked up since Python uses strings to hold
8703 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008704 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008705 key = PyUnicode_EncodeUTF8(keystart,
8706 keylen,
8707 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008708#else
8709 key = PyUnicode_FromUnicode(keystart, keylen);
8710#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008711 if (key == NULL)
8712 goto onError;
8713 if (args_owned) {
8714 Py_DECREF(args);
8715 args_owned = 0;
8716 }
8717 args = PyObject_GetItem(dict, key);
8718 Py_DECREF(key);
8719 if (args == NULL) {
8720 goto onError;
8721 }
8722 args_owned = 1;
8723 arglen = -1;
8724 argidx = -2;
8725 }
8726 while (--fmtcnt >= 0) {
8727 switch (c = *fmt++) {
8728 case '-': flags |= F_LJUST; continue;
8729 case '+': flags |= F_SIGN; continue;
8730 case ' ': flags |= F_BLANK; continue;
8731 case '#': flags |= F_ALT; continue;
8732 case '0': flags |= F_ZERO; continue;
8733 }
8734 break;
8735 }
8736 if (c == '*') {
8737 v = getnextarg(args, arglen, &argidx);
8738 if (v == NULL)
8739 goto onError;
Christian Heimes217cfd12007-12-02 14:31:20 +00008740 if (!PyLong_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008741 PyErr_SetString(PyExc_TypeError,
8742 "* wants int");
8743 goto onError;
8744 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008745 width = PyLong_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008746 if (width == -1 && PyErr_Occurred())
8747 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008748 if (width < 0) {
8749 flags |= F_LJUST;
8750 width = -width;
8751 }
8752 if (--fmtcnt >= 0)
8753 c = *fmt++;
8754 }
8755 else if (c >= '0' && c <= '9') {
8756 width = c - '0';
8757 while (--fmtcnt >= 0) {
8758 c = *fmt++;
8759 if (c < '0' || c > '9')
8760 break;
8761 if ((width*10) / 10 != width) {
8762 PyErr_SetString(PyExc_ValueError,
8763 "width too big");
8764 goto onError;
8765 }
8766 width = width*10 + (c - '0');
8767 }
8768 }
8769 if (c == '.') {
8770 prec = 0;
8771 if (--fmtcnt >= 0)
8772 c = *fmt++;
8773 if (c == '*') {
8774 v = getnextarg(args, arglen, &argidx);
8775 if (v == NULL)
8776 goto onError;
Christian Heimes217cfd12007-12-02 14:31:20 +00008777 if (!PyLong_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008778 PyErr_SetString(PyExc_TypeError,
8779 "* wants int");
8780 goto onError;
8781 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008782 prec = PyLong_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008783 if (prec == -1 && PyErr_Occurred())
8784 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008785 if (prec < 0)
8786 prec = 0;
8787 if (--fmtcnt >= 0)
8788 c = *fmt++;
8789 }
8790 else if (c >= '0' && c <= '9') {
8791 prec = c - '0';
8792 while (--fmtcnt >= 0) {
8793 c = Py_CHARMASK(*fmt++);
8794 if (c < '0' || c > '9')
8795 break;
8796 if ((prec*10) / 10 != prec) {
8797 PyErr_SetString(PyExc_ValueError,
8798 "prec too big");
8799 goto onError;
8800 }
8801 prec = prec*10 + (c - '0');
8802 }
8803 }
8804 } /* prec */
8805 if (fmtcnt >= 0) {
8806 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008807 if (--fmtcnt >= 0)
8808 c = *fmt++;
8809 }
8810 }
8811 if (fmtcnt < 0) {
8812 PyErr_SetString(PyExc_ValueError,
8813 "incomplete format");
8814 goto onError;
8815 }
8816 if (c != '%') {
8817 v = getnextarg(args, arglen, &argidx);
8818 if (v == NULL)
8819 goto onError;
8820 }
8821 sign = 0;
8822 fill = ' ';
8823 switch (c) {
8824
8825 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008826 pbuf = formatbuf;
8827 /* presume that buffer length is at least 1 */
8828 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008829 len = 1;
8830 break;
8831
8832 case 's':
8833 case 'r':
8834 if (PyUnicode_Check(v) && c == 's') {
8835 temp = v;
8836 Py_INCREF(temp);
8837 }
8838 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008839 if (c == 's')
Thomas Heller519a0422007-11-15 20:48:54 +00008840 temp = PyObject_Str(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008841 else
8842 temp = PyObject_Repr(v);
8843 if (temp == NULL)
8844 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008845 if (PyUnicode_Check(temp))
8846 /* nothing to do */;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008847 else {
8848 Py_DECREF(temp);
8849 PyErr_SetString(PyExc_TypeError,
8850 "%s argument has non-string str()");
8851 goto onError;
8852 }
8853 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008854 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008855 len = PyUnicode_GET_SIZE(temp);
8856 if (prec >= 0 && len > prec)
8857 len = prec;
8858 break;
8859
8860 case 'i':
8861 case 'd':
8862 case 'u':
8863 case 'o':
8864 case 'x':
8865 case 'X':
8866 if (c == 'i')
8867 c = 'd';
Christian Heimesa612dc02008-02-24 13:08:18 +00008868 isnumok = 0;
8869 if (PyNumber_Check(v)) {
8870 PyObject *iobj=NULL;
8871
8872 if (PyLong_Check(v)) {
8873 iobj = v;
8874 Py_INCREF(iobj);
8875 }
8876 else {
8877 iobj = PyNumber_Long(v);
8878 }
8879 if (iobj!=NULL) {
8880 if (PyLong_Check(iobj)) {
8881 isnumok = 1;
8882 temp = formatlong(iobj, flags, prec, c);
8883 Py_DECREF(iobj);
8884 if (!temp)
8885 goto onError;
8886 pbuf = PyUnicode_AS_UNICODE(temp);
8887 len = PyUnicode_GET_SIZE(temp);
8888 sign = 1;
8889 }
8890 else {
8891 Py_DECREF(iobj);
8892 }
8893 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008894 }
Christian Heimesa612dc02008-02-24 13:08:18 +00008895 if (!isnumok) {
8896 PyErr_Format(PyExc_TypeError,
8897 "%%%c format: a number is required, "
Martin v. Löwis5a6f4582008-04-07 03:22:07 +00008898 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
Tim Peters38fd5b62000-09-21 05:43:11 +00008899 goto onError;
Tim Peters38fd5b62000-09-21 05:43:11 +00008900 }
8901 if (flags & F_ZERO)
8902 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008903 break;
8904
8905 case 'e':
8906 case 'E':
8907 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008908 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008909 case 'g':
8910 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008911 if (c == 'F')
8912 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008913 pbuf = formatbuf;
8914 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8915 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008916 if (len < 0)
8917 goto onError;
8918 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008919 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008920 fill = '0';
8921 break;
8922
8923 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008924 pbuf = formatbuf;
8925 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008926 if (len < 0)
8927 goto onError;
8928 break;
8929
8930 default:
8931 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008932 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00008933 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008934 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008935 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00008936 (Py_ssize_t)(fmt - 1 -
8937 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008938 goto onError;
8939 }
8940 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008941 if (*pbuf == '-' || *pbuf == '+') {
8942 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008943 len--;
8944 }
8945 else if (flags & F_SIGN)
8946 sign = '+';
8947 else if (flags & F_BLANK)
8948 sign = ' ';
8949 else
8950 sign = 0;
8951 }
8952 if (width < len)
8953 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008954 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008955 reslen -= rescnt;
8956 rescnt = width + fmtcnt + 100;
8957 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008958 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008959 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008960 PyErr_NoMemory();
8961 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008962 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008963 if (_PyUnicode_Resize(&result, reslen) < 0) {
8964 Py_XDECREF(temp);
8965 goto onError;
8966 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008967 res = PyUnicode_AS_UNICODE(result)
8968 + reslen - rescnt;
8969 }
8970 if (sign) {
8971 if (fill != ' ')
8972 *res++ = sign;
8973 rescnt--;
8974 if (width > len)
8975 width--;
8976 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008977 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008978 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008979 assert(pbuf[1] == c);
8980 if (fill != ' ') {
8981 *res++ = *pbuf++;
8982 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008983 }
Tim Petersfff53252001-04-12 18:38:48 +00008984 rescnt -= 2;
8985 width -= 2;
8986 if (width < 0)
8987 width = 0;
8988 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008989 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008990 if (width > len && !(flags & F_LJUST)) {
8991 do {
8992 --rescnt;
8993 *res++ = fill;
8994 } while (--width > len);
8995 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008996 if (fill == ' ') {
8997 if (sign)
8998 *res++ = sign;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008999 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00009000 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00009001 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00009002 *res++ = *pbuf++;
9003 *res++ = *pbuf++;
9004 }
9005 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009006 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009007 res += len;
9008 rescnt -= len;
9009 while (--width >= len) {
9010 --rescnt;
9011 *res++ = ' ';
9012 }
9013 if (dict && (argidx < arglen) && c != '%') {
9014 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00009015 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009016 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009017 goto onError;
9018 }
9019 Py_XDECREF(temp);
9020 } /* '%' */
9021 } /* until end */
9022 if (argidx < arglen && !dict) {
9023 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00009024 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009025 goto onError;
9026 }
9027
Thomas Woutersa96affe2006-03-12 00:29:36 +00009028 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
9029 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009030 if (args_owned) {
9031 Py_DECREF(args);
9032 }
9033 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009034 return (PyObject *)result;
9035
9036 onError:
9037 Py_XDECREF(result);
9038 Py_DECREF(uformat);
9039 if (args_owned) {
9040 Py_DECREF(args);
9041 }
9042 return NULL;
9043}
9044
Jeremy Hylton938ace62002-07-17 16:30:39 +00009045static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009046unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9047
Tim Peters6d6c1a32001-08-02 04:15:00 +00009048static PyObject *
9049unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9050{
9051 PyObject *x = NULL;
Guido van Rossum55b4a7b2007-07-11 09:28:11 +00009052 static char *kwlist[] = {"object", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00009053 char *encoding = NULL;
9054 char *errors = NULL;
9055
Guido van Rossume023fe02001-08-30 03:12:59 +00009056 if (type != &PyUnicode_Type)
9057 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009058 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
9059 kwlist, &x, &encoding, &errors))
9060 return NULL;
9061 if (x == NULL)
9062 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00009063 if (encoding == NULL && errors == NULL)
Thomas Heller519a0422007-11-15 20:48:54 +00009064 return PyObject_Str(x);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00009065 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00009066 return PyUnicode_FromEncodedObject(x, encoding, errors);
9067}
9068
Guido van Rossume023fe02001-08-30 03:12:59 +00009069static PyObject *
9070unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9071{
Tim Petersaf90b3e2001-09-12 05:18:58 +00009072 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009073 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009074
9075 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9076 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9077 if (tmp == NULL)
9078 return NULL;
9079 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00009080 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00009081 if (pnew == NULL) {
9082 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00009083 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00009084 }
Christian Heimesb186d002008-03-18 15:15:01 +00009085 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
Tim Petersaf90b3e2001-09-12 05:18:58 +00009086 if (pnew->str == NULL) {
9087 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00009088 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00009089 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00009090 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00009091 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00009092 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9093 pnew->length = n;
9094 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00009095 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00009096 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009097}
9098
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009099PyDoc_STRVAR(unicode_doc,
Collin Winterd474ce82007-08-07 19:42:11 +00009100"str(string [, encoding[, errors]]) -> object\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009101\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009102Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009103encoding defaults to the current default string encoding.\n\
9104errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009105
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009106static PyObject *unicode_iter(PyObject *seq);
9107
Guido van Rossumd57fd912000-03-10 22:53:23 +00009108PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009109 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossum84fc66d2007-05-03 17:18:26 +00009110 "str", /* tp_name */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009111 sizeof(PyUnicodeObject), /* tp_size */
9112 0, /* tp_itemsize */
9113 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00009114 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009115 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009116 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009117 0, /* tp_setattr */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009118 0, /* tp_compare */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009119 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009120 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009121 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009122 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009123 (hashfunc) unicode_hash, /* tp_hash*/
9124 0, /* tp_call*/
9125 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009126 PyObject_GenericGetAttr, /* tp_getattro */
9127 0, /* tp_setattro */
Alexandre Vassalotti70a23712007-10-14 02:05:51 +00009128 0, /* tp_as_buffer */
Thomas Wouters27d517b2007-02-25 20:39:11 +00009129 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
9130 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009131 unicode_doc, /* tp_doc */
9132 0, /* tp_traverse */
9133 0, /* tp_clear */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009134 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009135 0, /* tp_weaklistoffset */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009136 unicode_iter, /* tp_iter */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009137 0, /* tp_iternext */
9138 unicode_methods, /* tp_methods */
9139 0, /* tp_members */
9140 0, /* tp_getset */
Guido van Rossum3172c5d2007-10-16 18:12:55 +00009141 &PyBaseObject_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009142 0, /* tp_dict */
9143 0, /* tp_descr_get */
9144 0, /* tp_descr_set */
9145 0, /* tp_dictoffset */
9146 0, /* tp_init */
9147 0, /* tp_alloc */
9148 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00009149 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009150};
9151
9152/* Initialize the Unicode implementation */
9153
Thomas Wouters78890102000-07-22 19:25:51 +00009154void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009155{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009156 int i;
9157
Thomas Wouters477c8d52006-05-27 19:21:47 +00009158 /* XXX - move this array to unicodectype.c ? */
9159 Py_UNICODE linebreak[] = {
9160 0x000A, /* LINE FEED */
9161 0x000D, /* CARRIAGE RETURN */
9162 0x001C, /* FILE SEPARATOR */
9163 0x001D, /* GROUP SEPARATOR */
9164 0x001E, /* RECORD SEPARATOR */
9165 0x0085, /* NEXT LINE */
9166 0x2028, /* LINE SEPARATOR */
9167 0x2029, /* PARAGRAPH SEPARATOR */
9168 };
9169
Fred Drakee4315f52000-05-09 19:53:39 +00009170 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009171 free_list = NULL;
9172 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009173 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009174 if (!unicode_empty)
9175 return;
9176
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009177 for (i = 0; i < 256; i++)
9178 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009179 if (PyType_Ready(&PyUnicode_Type) < 0)
9180 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009181
9182 /* initialize the linebreak bloom filter */
9183 bloom_linebreak = make_bloom_mask(
9184 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9185 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009186
9187 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009188}
9189
9190/* Finalize the Unicode implementation */
9191
Christian Heimesa156e092008-02-16 07:38:31 +00009192int
9193PyUnicode_ClearFreeList(void)
9194{
9195 int freelist_size = numfree;
9196 PyUnicodeObject *u;
9197
9198 for (u = free_list; u != NULL;) {
9199 PyUnicodeObject *v = u;
9200 u = *(PyUnicodeObject **)u;
9201 if (v->str)
Christian Heimesb186d002008-03-18 15:15:01 +00009202 PyObject_DEL(v->str);
Christian Heimesa156e092008-02-16 07:38:31 +00009203 Py_XDECREF(v->defenc);
9204 PyObject_Del(v);
9205 numfree--;
9206 }
9207 free_list = NULL;
9208 assert(numfree == 0);
9209 return freelist_size;
9210}
9211
Guido van Rossumd57fd912000-03-10 22:53:23 +00009212void
Thomas Wouters78890102000-07-22 19:25:51 +00009213_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009214{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009215 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009216
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009217 Py_XDECREF(unicode_empty);
9218 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009219
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009220 for (i = 0; i < 256; i++) {
9221 if (unicode_latin1[i]) {
9222 Py_DECREF(unicode_latin1[i]);
9223 unicode_latin1[i] = NULL;
9224 }
9225 }
Christian Heimesa156e092008-02-16 07:38:31 +00009226 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009227}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009228
Walter Dörwald16807132007-05-25 13:52:07 +00009229void
9230PyUnicode_InternInPlace(PyObject **p)
9231{
9232 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9233 PyObject *t;
9234 if (s == NULL || !PyUnicode_Check(s))
9235 Py_FatalError(
9236 "PyUnicode_InternInPlace: unicode strings only please!");
9237 /* If it's a subclass, we don't really know what putting
9238 it in the interned dict might do. */
9239 if (!PyUnicode_CheckExact(s))
9240 return;
9241 if (PyUnicode_CHECK_INTERNED(s))
9242 return;
9243 if (interned == NULL) {
9244 interned = PyDict_New();
9245 if (interned == NULL) {
9246 PyErr_Clear(); /* Don't leave an exception */
9247 return;
9248 }
9249 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009250 /* It might be that the GetItem call fails even
9251 though the key is present in the dictionary,
9252 namely when this happens during a stack overflow. */
9253 Py_ALLOW_RECURSION
Walter Dörwald16807132007-05-25 13:52:07 +00009254 t = PyDict_GetItem(interned, (PyObject *)s);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009255 Py_END_ALLOW_RECURSION
9256
Walter Dörwald16807132007-05-25 13:52:07 +00009257 if (t) {
9258 Py_INCREF(t);
9259 Py_DECREF(*p);
9260 *p = t;
9261 return;
9262 }
9263
Martin v. Löwis5b222132007-06-10 09:51:05 +00009264 PyThreadState_GET()->recursion_critical = 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009265 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9266 PyErr_Clear();
Martin v. Löwis5b222132007-06-10 09:51:05 +00009267 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009268 return;
9269 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009270 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009271 /* The two references in interned are not counted by refcnt.
9272 The deallocator will take care of this */
Christian Heimes90aa7642007-12-19 02:45:37 +00009273 Py_REFCNT(s) -= 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009274 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
9275}
9276
9277void
9278PyUnicode_InternImmortal(PyObject **p)
9279{
9280 PyUnicode_InternInPlace(p);
9281 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9282 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9283 Py_INCREF(*p);
9284 }
9285}
9286
9287PyObject *
9288PyUnicode_InternFromString(const char *cp)
9289{
9290 PyObject *s = PyUnicode_FromString(cp);
9291 if (s == NULL)
9292 return NULL;
9293 PyUnicode_InternInPlace(&s);
9294 return s;
9295}
9296
9297void _Py_ReleaseInternedUnicodeStrings(void)
9298{
9299 PyObject *keys;
9300 PyUnicodeObject *s;
9301 Py_ssize_t i, n;
9302 Py_ssize_t immortal_size = 0, mortal_size = 0;
9303
9304 if (interned == NULL || !PyDict_Check(interned))
9305 return;
9306 keys = PyDict_Keys(interned);
9307 if (keys == NULL || !PyList_Check(keys)) {
9308 PyErr_Clear();
9309 return;
9310 }
9311
9312 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9313 detector, interned unicode strings are not forcibly deallocated;
9314 rather, we give them their stolen references back, and then clear
9315 and DECREF the interned dict. */
9316
9317 n = PyList_GET_SIZE(keys);
9318 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
9319 n);
9320 for (i = 0; i < n; i++) {
9321 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9322 switch (s->state) {
9323 case SSTATE_NOT_INTERNED:
9324 /* XXX Shouldn't happen */
9325 break;
9326 case SSTATE_INTERNED_IMMORTAL:
Christian Heimes90aa7642007-12-19 02:45:37 +00009327 Py_REFCNT(s) += 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009328 immortal_size += s->length;
9329 break;
9330 case SSTATE_INTERNED_MORTAL:
Christian Heimes90aa7642007-12-19 02:45:37 +00009331 Py_REFCNT(s) += 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009332 mortal_size += s->length;
9333 break;
9334 default:
9335 Py_FatalError("Inconsistent interned string state.");
9336 }
9337 s->state = SSTATE_NOT_INTERNED;
9338 }
9339 fprintf(stderr, "total size of all interned strings: "
9340 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9341 "mortal/immortal\n", mortal_size, immortal_size);
9342 Py_DECREF(keys);
9343 PyDict_Clear(interned);
9344 Py_DECREF(interned);
9345 interned = NULL;
9346}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009347
9348
9349/********************* Unicode Iterator **************************/
9350
9351typedef struct {
9352 PyObject_HEAD
Guido van Rossum49d6b072006-08-17 21:11:47 +00009353 Py_ssize_t it_index;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009354 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
9355} unicodeiterobject;
9356
9357static void
9358unicodeiter_dealloc(unicodeiterobject *it)
9359{
9360 _PyObject_GC_UNTRACK(it);
9361 Py_XDECREF(it->it_seq);
9362 PyObject_GC_Del(it);
9363}
9364
9365static int
9366unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9367{
9368 Py_VISIT(it->it_seq);
9369 return 0;
9370}
9371
9372static PyObject *
9373unicodeiter_next(unicodeiterobject *it)
9374{
9375 PyUnicodeObject *seq;
9376 PyObject *item;
9377
9378 assert(it != NULL);
9379 seq = it->it_seq;
9380 if (seq == NULL)
9381 return NULL;
9382 assert(PyUnicode_Check(seq));
9383
9384 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009385 item = PyUnicode_FromUnicode(
9386 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009387 if (item != NULL)
9388 ++it->it_index;
9389 return item;
9390 }
9391
9392 Py_DECREF(seq);
9393 it->it_seq = NULL;
9394 return NULL;
9395}
9396
9397static PyObject *
9398unicodeiter_len(unicodeiterobject *it)
9399{
9400 Py_ssize_t len = 0;
9401 if (it->it_seq)
9402 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
Christian Heimes217cfd12007-12-02 14:31:20 +00009403 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009404}
9405
9406PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9407
9408static PyMethodDef unicodeiter_methods[] = {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009409 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
9410 length_hint_doc},
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009411 {NULL, NULL} /* sentinel */
9412};
9413
9414PyTypeObject PyUnicodeIter_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009415 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Christian Heimesf83be4e2007-11-28 09:44:38 +00009416 "str_iterator", /* tp_name */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009417 sizeof(unicodeiterobject), /* tp_basicsize */
9418 0, /* tp_itemsize */
9419 /* methods */
Guido van Rossum49d6b072006-08-17 21:11:47 +00009420 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009421 0, /* tp_print */
9422 0, /* tp_getattr */
9423 0, /* tp_setattr */
9424 0, /* tp_compare */
9425 0, /* tp_repr */
9426 0, /* tp_as_number */
9427 0, /* tp_as_sequence */
9428 0, /* tp_as_mapping */
9429 0, /* tp_hash */
9430 0, /* tp_call */
9431 0, /* tp_str */
9432 PyObject_GenericGetAttr, /* tp_getattro */
9433 0, /* tp_setattro */
9434 0, /* tp_as_buffer */
9435 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9436 0, /* tp_doc */
9437 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9438 0, /* tp_clear */
9439 0, /* tp_richcompare */
9440 0, /* tp_weaklistoffset */
9441 PyObject_SelfIter, /* tp_iter */
9442 (iternextfunc)unicodeiter_next, /* tp_iternext */
9443 unicodeiter_methods, /* tp_methods */
9444 0,
9445};
9446
9447static PyObject *
9448unicode_iter(PyObject *seq)
9449{
9450 unicodeiterobject *it;
9451
9452 if (!PyUnicode_Check(seq)) {
9453 PyErr_BadInternalCall();
9454 return NULL;
9455 }
9456 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9457 if (it == NULL)
9458 return NULL;
9459 it->it_index = 0;
9460 Py_INCREF(seq);
9461 it->it_seq = (PyUnicodeObject *)seq;
9462 _PyObject_GC_TRACK(it);
9463 return (PyObject *)it;
9464}
9465
Martin v. Löwis5b222132007-06-10 09:51:05 +00009466size_t
9467Py_UNICODE_strlen(const Py_UNICODE *u)
9468{
9469 int res = 0;
9470 while(*u++)
9471 res++;
9472 return res;
9473}
9474
9475Py_UNICODE*
9476Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9477{
9478 Py_UNICODE *u = s1;
9479 while ((*u++ = *s2++));
9480 return s1;
9481}
9482
9483Py_UNICODE*
9484Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9485{
9486 Py_UNICODE *u = s1;
9487 while ((*u++ = *s2++))
9488 if (n-- == 0)
9489 break;
9490 return s1;
9491}
9492
9493int
9494Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9495{
9496 while (*s1 && *s2 && *s1 == *s2)
9497 s1++, s2++;
9498 if (*s1 && *s2)
9499 return (*s1 < *s2) ? -1 : +1;
9500 if (*s1)
9501 return 1;
9502 if (*s2)
9503 return -1;
9504 return 0;
9505}
9506
9507Py_UNICODE*
9508Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9509{
9510 const Py_UNICODE *p;
9511 for (p = s; *p; p++)
9512 if (*p == c)
9513 return (Py_UNICODE*)p;
9514 return NULL;
9515}
9516
9517
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009518#ifdef __cplusplus
9519}
9520#endif
9521
9522
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009523/*
9524Local variables:
9525c-basic-offset: 4
9526indent-tabs-mode: nil
9527End:
9528*/