blob: dcad80896a15c3b8a8addd1c9451d4110993e4c5 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Eric Smith8c663262007-08-25 02:26:07 +000049#include "formatter_unicode.h"
50
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000051#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000052#include <windows.h>
53#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000054
Guido van Rossumd57fd912000-03-10 22:53:23 +000055/* Limit for the Unicode object free list */
56
Christian Heimes2202f872008-02-06 14:31:34 +000057#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000058
59/* Limit for the Unicode object free list stay alive optimization.
60
61 The implementation will keep allocated Unicode memory intact for
62 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000063 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000064
Christian Heimes2202f872008-02-06 14:31:34 +000065 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000066 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000067 malloc()-overhead) bytes of unused garbage.
68
69 Setting the limit to 0 effectively turns the feature off.
70
Guido van Rossumfd4b9572000-04-10 13:51:10 +000071 Note: This is an experimental feature ! If you get core dumps when
72 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000073
74*/
75
Guido van Rossumfd4b9572000-04-10 13:51:10 +000076#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000077
78/* Endianness switches; defaults to little endian */
79
80#ifdef WORDS_BIGENDIAN
81# define BYTEORDER_IS_BIG_ENDIAN
82#else
83# define BYTEORDER_IS_LITTLE_ENDIAN
84#endif
85
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000086/* --- Globals ------------------------------------------------------------
87
88 The globals are initialized by the _PyUnicode_Init() API and should
89 not be used before calling that API.
90
91*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000092
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000093
94#ifdef __cplusplus
95extern "C" {
96#endif
97
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0, the string deallocation
   function deletes the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
106static PyObject *interned;
107
Guido van Rossumd57fd912000-03-10 22:53:23 +0000108/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000109static PyUnicodeObject *free_list;
110static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000111
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000112/* The empty Unicode object is shared to improve performance. */
113static PyUnicodeObject *unicode_empty;
114
115/* Single character Unicode strings in the Latin-1 range are being
116 shared as well. */
117static PyUnicodeObject *unicode_latin1[256];
118
Fred Drakee4315f52000-05-09 19:53:39 +0000119/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000120 parameter; it is fixed to "utf-8". Always use the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000121 PyUnicode_GetDefaultEncoding() API to access this global.
122
Alexandre Vassalotti3d2fd7f2007-10-16 00:26:33 +0000123 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000124 hard coded default!
125*/
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000126static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000127
Christian Heimes190d79e2008-01-30 11:58:22 +0000128/* Fast detection of the most frequent whitespace characters */
const unsigned char _Py_ascii_whitespace[] = {
    /* One flag per ASCII code point (128 entries, 16 per row);
       a 1 marks a character that is treated as whitespace. */

    /* 0x00 - 0x0F: 0x09 HORIZONTAL TABULATION, 0x0A LINE FEED,
       0x0B VERTICAL TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
158
/* Fast detection of line break characters in the ASCII range:
   one flag per code point (128 entries, 16 per row). */
static unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x0F: 0x0A LINE FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
    /* 0x10 - 0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
184
185
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000186Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000187PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000188{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000189#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000190 return 0x10FFFF;
191#else
192 /* This is actually an illegal character, so it should
193 not be passed to unichr. */
194 return 0xFFFF;
195#endif
196}
197
Thomas Wouters477c8d52006-05-27 19:21:47 +0000198/* --- Bloom Filters ----------------------------------------------------- */
199
/* Support for simple "bloom filters" over Unicode characters.
   To keep things simple, we use a single bitmask and take the 5 least
   significant bits of each Unicode character as the bit index. */
203
204/* the linebreak mask is set up by Unicode_Init below */
205
/* Integer type that holds a bloom mask; only bits 0..31 are used. */
#define BLOOM_MASK unsigned long

/* Bloom mask of the line break characters (set up during Unicode
   initialization, not at definition time). */
static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by the low 5 bits of `ch` is set in `mask`.
   The shifted constant is unsigned long so that bit 31 can be tested
   without overflowing a signed int (undefined behaviour), and `mask`
   is parenthesized so that compound expressions can be passed safely. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Line break test: ASCII characters go through the lookup table, all other
   characters are pre-filtered by the bloom mask before the exact
   Py_UNICODE_ISLINEBREAK() check. */
#define BLOOM_LINEBREAK(ch) \
    ((ch) < 128U ? ascii_linebreak[(ch)] : \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000215
216Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
217{
218 /* calculate simple bloom-style bitmask for a given unicode string */
219
220 long mask;
221 Py_ssize_t i;
222
223 mask = 0;
224 for (i = 0; i < len; i++)
225 mask |= (1 << (ptr[i] & 0x1F));
226
227 return mask;
228}
229
230Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
231{
232 Py_ssize_t i;
233
234 for (i = 0; i < setlen; i++)
235 if (set[i] == chr)
236 return 1;
237
238 return 0;
239}
240
/* Membership test: cheap bloom pre-filter followed by the exact search.
   The whole expansion is parenthesized so the macro can be negated or
   combined with other operators without changing its meaning. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
243
Guido van Rossumd57fd912000-03-10 22:53:23 +0000244/* --- Unicode Object ----------------------------------------------------- */
245
246static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000247int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000248 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000249{
250 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000251
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000252 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000254 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000255
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000256 /* Resizing shared object (unicode_empty or single character
257 objects) in-place is not allowed. Use PyUnicode_Resize()
258 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000259
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000260 if (unicode == unicode_empty ||
261 (unicode->length == 1 &&
262 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000263 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000265 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 return -1;
267 }
268
Thomas Wouters477c8d52006-05-27 19:21:47 +0000269 /* We allocate one more byte to make sure the string is Ux0000 terminated.
270 The overallocation is also used by fastsearch, which assumes that it's
271 safe to look at str[length] (without making any assumptions about what
272 it contains). */
273
Guido van Rossumd57fd912000-03-10 22:53:23 +0000274 oldstr = unicode->str;
275 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
276 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000277 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000278 PyErr_NoMemory();
279 return -1;
280 }
281 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000282 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000283
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000284 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000285 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000286 if (unicode->defenc) {
287 Py_DECREF(unicode->defenc);
288 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000289 }
290 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000291
Guido van Rossumd57fd912000-03-10 22:53:23 +0000292 return 0;
293}
294
/* We allocate one more Py_UNICODE element than requested to make sure
   the string is U+0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could be further enhanced by ensuring that the
   free list never shrinks below a size of 1.

*/
303
304static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000305PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000306{
307 register PyUnicodeObject *unicode;
308
Thomas Wouters477c8d52006-05-27 19:21:47 +0000309 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000310 if (length == 0 && unicode_empty != NULL) {
311 Py_INCREF(unicode_empty);
312 return unicode_empty;
313 }
314
315 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000316 if (free_list) {
317 unicode = free_list;
318 free_list = *(PyUnicodeObject **)unicode;
319 numfree--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000320 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000321 /* Keep-Alive optimization: we only upsize the buffer,
322 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000323 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000324 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000325 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000326 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000327 }
328 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000329 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000330 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000331 }
332 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000333 }
334 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000335 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000336 if (unicode == NULL)
337 return NULL;
338 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
339 }
340
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000341 if (!unicode->str) {
342 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000343 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000344 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000345 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000346 * the caller fails before initializing str -- unicode_resize()
347 * reads str[0], and the Keep-Alive optimization can keep memory
348 * allocated for str alive across a call to unicode_dealloc(unicode).
349 * We don't want unicode_resize to read uninitialized memory in
350 * that case.
351 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000352 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000353 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000354 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000355 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000356 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000357 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000358 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000359
360 onError:
361 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000362 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000363 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000364}
365
366static
Guido van Rossum9475a232001-10-05 20:51:39 +0000367void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000368{
Walter Dörwald16807132007-05-25 13:52:07 +0000369 switch (PyUnicode_CHECK_INTERNED(unicode)) {
370 case SSTATE_NOT_INTERNED:
371 break;
372
373 case SSTATE_INTERNED_MORTAL:
374 /* revive dead object temporarily for DelItem */
Christian Heimes90aa7642007-12-19 02:45:37 +0000375 Py_REFCNT(unicode) = 3;
Walter Dörwald16807132007-05-25 13:52:07 +0000376 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
377 Py_FatalError(
378 "deletion of interned unicode string failed");
379 break;
380
381 case SSTATE_INTERNED_IMMORTAL:
382 Py_FatalError("Immortal interned unicode string died.");
383
384 default:
385 Py_FatalError("Inconsistent interned unicode string state.");
386 }
387
Guido van Rossum604ddf82001-12-06 20:03:56 +0000388 if (PyUnicode_CheckExact(unicode) &&
Christian Heimes2202f872008-02-06 14:31:34 +0000389 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000390 /* Keep-Alive optimization */
391 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000392 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000393 unicode->str = NULL;
394 unicode->length = 0;
395 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000396 if (unicode->defenc) {
397 Py_DECREF(unicode->defenc);
398 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000399 }
400 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000401 *(PyUnicodeObject **)unicode = free_list;
402 free_list = unicode;
403 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000404 }
405 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000406 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000407 Py_XDECREF(unicode->defenc);
Christian Heimes90aa7642007-12-19 02:45:37 +0000408 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000409 }
410}
411
Martin v. Löwis18e16552006-02-15 17:27:45 +0000412int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000413{
414 register PyUnicodeObject *v;
415
416 /* Argument checks */
417 if (unicode == NULL) {
418 PyErr_BadInternalCall();
419 return -1;
420 }
421 v = (PyUnicodeObject *)*unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000422 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000423 PyErr_BadInternalCall();
424 return -1;
425 }
426
427 /* Resizing unicode_empty and single character objects is not
428 possible since these are being shared. We simply return a fresh
429 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000430 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000431 (v == unicode_empty || v->length == 1)) {
432 PyUnicodeObject *w = _PyUnicode_New(length);
433 if (w == NULL)
434 return -1;
435 Py_UNICODE_COPY(w->str, v->str,
436 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000437 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000438 *unicode = (PyObject *)w;
439 return 0;
440 }
441
442 /* Note that we don't have to modify *unicode for unshared Unicode
443 objects, since we can modify them in-place. */
444 return unicode_resize(v, length);
445}
446
447/* Internal API for use in unicodeobject.c only ! */
448#define _PyUnicode_Resize(unicodevar, length) \
449 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
450
Guido van Rossumd57fd912000-03-10 22:53:23 +0000451PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000452 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000453{
454 PyUnicodeObject *unicode;
455
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000456 /* If the Unicode data is known at construction time, we can apply
457 some optimizations which share commonly used objects. */
458 if (u != NULL) {
459
460 /* Optimization for empty strings */
461 if (size == 0 && unicode_empty != NULL) {
462 Py_INCREF(unicode_empty);
463 return (PyObject *)unicode_empty;
464 }
465
466 /* Single character Unicode objects in the Latin-1 range are
467 shared when using this constructor */
468 if (size == 1 && *u < 256) {
469 unicode = unicode_latin1[*u];
470 if (!unicode) {
471 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000472 if (!unicode)
473 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000474 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000475 unicode_latin1[*u] = unicode;
476 }
477 Py_INCREF(unicode);
478 return (PyObject *)unicode;
479 }
480 }
Tim Petersced69f82003-09-16 20:30:58 +0000481
Guido van Rossumd57fd912000-03-10 22:53:23 +0000482 unicode = _PyUnicode_New(size);
483 if (!unicode)
484 return NULL;
485
486 /* Copy the Unicode data into the new object */
487 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000488 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000489
490 return (PyObject *)unicode;
491}
492
Walter Dörwaldd2034312007-05-18 16:29:38 +0000493PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000494{
495 PyUnicodeObject *unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000496 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000497 some optimizations which share commonly used objects.
498 Also, this means the input must be UTF-8, so fall back to the
499 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000500 if (u != NULL) {
501
502 /* Optimization for empty strings */
503 if (size == 0 && unicode_empty != NULL) {
504 Py_INCREF(unicode_empty);
505 return (PyObject *)unicode_empty;
506 }
507
Martin v. Löwis9c121062007-08-05 20:26:11 +0000508 /* Single characters are shared when using this constructor.
509 Restrict to ASCII, since the input must be UTF-8. */
510 if (size == 1 && Py_CHARMASK(*u) < 128) {
Guido van Rossum00058aa2007-07-19 18:21:28 +0000511 unicode = unicode_latin1[Py_CHARMASK(*u)];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000512 if (!unicode) {
513 unicode = _PyUnicode_New(1);
514 if (!unicode)
515 return NULL;
Guido van Rossum00058aa2007-07-19 18:21:28 +0000516 unicode->str[0] = Py_CHARMASK(*u);
517 unicode_latin1[Py_CHARMASK(*u)] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000518 }
519 Py_INCREF(unicode);
520 return (PyObject *)unicode;
521 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000522
523 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000524 }
525
Walter Dörwald55507312007-05-18 13:12:10 +0000526 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000527 if (!unicode)
528 return NULL;
529
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000530 return (PyObject *)unicode;
531}
532
Walter Dörwaldd2034312007-05-18 16:29:38 +0000533PyObject *PyUnicode_FromString(const char *u)
534{
535 size_t size = strlen(u);
536 if (size > PY_SSIZE_T_MAX) {
537 PyErr_SetString(PyExc_OverflowError, "input too long");
538 return NULL;
539 }
540
541 return PyUnicode_FromStringAndSize(u, size);
542}
543
Guido van Rossumd57fd912000-03-10 22:53:23 +0000544#ifdef HAVE_WCHAR_H
545
546PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000547 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000548{
549 PyUnicodeObject *unicode;
550
551 if (w == NULL) {
552 PyErr_BadInternalCall();
553 return NULL;
554 }
555
556 unicode = _PyUnicode_New(size);
557 if (!unicode)
558 return NULL;
559
560 /* Copy the wchar_t data into the new object */
561#ifdef HAVE_USABLE_WCHAR_T
562 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000563#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000564 {
565 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000566 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000567 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000568 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000569 *u++ = *w++;
570 }
571#endif
572
573 return (PyObject *)unicode;
574}
575
Walter Dörwald346737f2007-05-31 10:44:43 +0000576static void
577makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
578{
579 *fmt++ = '%';
580 if (width) {
581 if (zeropad)
582 *fmt++ = '0';
583 fmt += sprintf(fmt, "%d", width);
584 }
585 if (precision)
586 fmt += sprintf(fmt, ".%d", precision);
587 if (longflag)
588 *fmt++ = 'l';
589 else if (size_tflag) {
590 char *f = PY_FORMAT_SIZE_T;
591 while (*f)
592 *fmt++ = *f++;
593 }
594 *fmt++ = c;
595 *fmt = '\0';
596}
597
/* Append the 0-terminated char string `text` to the output position `s`,
   advancing `s`.  Relies on the variables `s` and `copy` being declared
   in the scope where the macro is used. */
#define appendstring(text) {copy = text; while (*copy) *s++ = *copy++;}
599
600PyObject *
601PyUnicode_FromFormatV(const char *format, va_list vargs)
602{
603 va_list count;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000604 Py_ssize_t callcount = 0;
605 PyObject **callresults = NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +0000606 PyObject **callresult = NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000607 Py_ssize_t n = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000608 int width = 0;
609 int precision = 0;
610 int zeropad;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000611 const char* f;
612 Py_UNICODE *s;
613 PyObject *string;
614 /* used by sprintf */
615 char buffer[21];
Walter Dörwald346737f2007-05-31 10:44:43 +0000616 /* use abuffer instead of buffer, if we need more space
617 * (which can happen if there's a format specifier with width). */
618 char *abuffer = NULL;
619 char *realbuffer;
620 Py_ssize_t abuffersize = 0;
621 char fmt[60]; /* should be enough for %0width.precisionld */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000622 const char *copy;
623
624#ifdef VA_LIST_IS_ARRAY
625 Py_MEMCPY(count, vargs, sizeof(va_list));
626#else
627#ifdef __va_copy
628 __va_copy(count, vargs);
629#else
630 count = vargs;
631#endif
632#endif
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000633 /* step 1: count the number of %S/%R format specifications
Thomas Heller519a0422007-11-15 20:48:54 +0000634 * (we call PyObject_Str()/PyObject_Repr() for these objects
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000635 * once during step 3 and put the result in an array) */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000636 for (f = format; *f; f++) {
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000637 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000638 ++callcount;
639 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000640 /* step 2: allocate memory for the results of
Thomas Heller519a0422007-11-15 20:48:54 +0000641 * PyObject_Str()/PyObject_Repr() calls */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000642 if (callcount) {
643 callresults = PyMem_Malloc(sizeof(PyObject *)*callcount);
644 if (!callresults) {
645 PyErr_NoMemory();
646 return NULL;
647 }
648 callresult = callresults;
649 }
650 /* step 3: figure out how large a buffer we need */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000651 for (f = format; *f; f++) {
652 if (*f == '%') {
653 const char* p = f;
Walter Dörwald346737f2007-05-31 10:44:43 +0000654 width = 0;
Guido van Rossumdaa251c2007-10-25 23:47:33 +0000655 while (ISDIGIT(*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000656 width = (width*10) + *f++ - '0';
Guido van Rossumdaa251c2007-10-25 23:47:33 +0000657 while (*++f && *f != '%' && !ISALPHA(*f))
Walter Dörwaldd2034312007-05-18 16:29:38 +0000658 ;
659
660 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
661 * they don't affect the amount of space we reserve.
662 */
663 if ((*f == 'l' || *f == 'z') &&
664 (f[1] == 'd' || f[1] == 'u'))
Eric Smithddd25822007-08-27 11:33:42 +0000665 ++f;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000666
667 switch (*f) {
668 case 'c':
669 (void)va_arg(count, int);
670 /* fall through... */
671 case '%':
672 n++;
673 break;
674 case 'd': case 'u': case 'i': case 'x':
675 (void) va_arg(count, int);
676 /* 20 bytes is enough to hold a 64-bit
677 integer. Decimal takes the most space.
Walter Dörwald346737f2007-05-31 10:44:43 +0000678 This isn't enough for octal.
679 If a width is specified we need more
680 (which we allocate later). */
681 if (width < 20)
682 width = 20;
683 n += width;
684 if (abuffersize < width)
685 abuffersize = width;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000686 break;
687 case 's':
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000688 {
689 /* UTF-8 */
690 unsigned char*s;
691 s = va_arg(count, unsigned char*);
692 while (*s) {
693 if (*s < 128) {
694 n++; s++;
695 } else if (*s < 0xc0) {
696 /* invalid UTF-8 */
697 n++; s++;
698 } else if (*s < 0xc0) {
699 n++;
700 s++; if(!*s)break;
701 s++;
702 } else if (*s < 0xe0) {
703 n++;
704 s++; if(!*s)break;
705 s++; if(!*s)break;
706 s++;
707 } else {
708 #ifdef Py_UNICODE_WIDE
709 n++;
710 #else
711 n+=2;
712 #endif
713 s++; if(!*s)break;
714 s++; if(!*s)break;
715 s++; if(!*s)break;
716 s++;
717 }
718 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000719 break;
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000720 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000721 case 'U':
722 {
723 PyObject *obj = va_arg(count, PyObject *);
724 assert(obj && PyUnicode_Check(obj));
725 n += PyUnicode_GET_SIZE(obj);
726 break;
727 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000728 case 'V':
729 {
730 PyObject *obj = va_arg(count, PyObject *);
731 const char *str = va_arg(count, const char *);
732 assert(obj || str);
733 assert(!obj || PyUnicode_Check(obj));
734 if (obj)
735 n += PyUnicode_GET_SIZE(obj);
736 else
737 n += strlen(str);
738 break;
739 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000740 case 'S':
741 {
742 PyObject *obj = va_arg(count, PyObject *);
743 PyObject *str;
744 assert(obj);
Thomas Heller519a0422007-11-15 20:48:54 +0000745 str = PyObject_Str(obj);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000746 if (!str)
747 goto fail;
748 n += PyUnicode_GET_SIZE(str);
749 /* Remember the str and switch to the next slot */
750 *callresult++ = str;
751 break;
752 }
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000753 case 'R':
754 {
755 PyObject *obj = va_arg(count, PyObject *);
756 PyObject *repr;
757 assert(obj);
758 repr = PyObject_Repr(obj);
759 if (!repr)
760 goto fail;
761 n += PyUnicode_GET_SIZE(repr);
762 /* Remember the repr and switch to the next slot */
763 *callresult++ = repr;
764 break;
765 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000766 case 'p':
767 (void) va_arg(count, int);
768 /* maximum 64-bit pointer representation:
769 * 0xffffffffffffffff
770 * so 19 characters is enough.
771 * XXX I count 18 -- what's the extra for?
772 */
773 n += 19;
774 break;
775 default:
776 /* if we stumble upon an unknown
777 formatting code, copy the rest of
778 the format string to the output
779 string. (we cannot just skip the
780 code, since there's no way to know
781 what's in the argument list) */
782 n += strlen(p);
783 goto expand;
784 }
785 } else
786 n++;
787 }
788 expand:
Walter Dörwald346737f2007-05-31 10:44:43 +0000789 if (abuffersize > 20) {
790 abuffer = PyMem_Malloc(abuffersize);
791 if (!abuffer) {
792 PyErr_NoMemory();
793 goto fail;
794 }
795 realbuffer = abuffer;
796 }
797 else
798 realbuffer = buffer;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000799 /* step 4: fill the buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000800 /* Since we've analyzed how much space we need for the worst case,
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000801 we don't have to resize the string.
802 There can be no errors beyond this point. */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000803 string = PyUnicode_FromUnicode(NULL, n);
804 if (!string)
Walter Dörwald346737f2007-05-31 10:44:43 +0000805 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000806
807 s = PyUnicode_AS_UNICODE(string);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000808 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000809
810 for (f = format; *f; f++) {
811 if (*f == '%') {
812 const char* p = f++;
813 int longflag = 0;
814 int size_tflag = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000815 zeropad = (*f == '0');
816 /* parse the width.precision part */
817 width = 0;
Guido van Rossumdaa251c2007-10-25 23:47:33 +0000818 while (ISDIGIT(*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000819 width = (width*10) + *f++ - '0';
820 precision = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000821 if (*f == '.') {
822 f++;
Guido van Rossumdaa251c2007-10-25 23:47:33 +0000823 while (ISDIGIT(*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000824 precision = (precision*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000825 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000826 /* handle the long flag, but only for %ld and %lu.
827 others can be added when necessary. */
828 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
829 longflag = 1;
830 ++f;
831 }
832 /* handle the size_t flag. */
833 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
834 size_tflag = 1;
835 ++f;
836 }
837
838 switch (*f) {
839 case 'c':
840 *s++ = va_arg(vargs, int);
841 break;
842 case 'd':
Walter Dörwald346737f2007-05-31 10:44:43 +0000843 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000844 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000845 sprintf(realbuffer, fmt, va_arg(vargs, long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000846 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000847 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000848 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000849 sprintf(realbuffer, fmt, va_arg(vargs, int));
850 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000851 break;
852 case 'u':
Walter Dörwald346737f2007-05-31 10:44:43 +0000853 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000854 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000855 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000856 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000857 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000858 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000859 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
860 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000861 break;
862 case 'i':
Walter Dörwald346737f2007-05-31 10:44:43 +0000863 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
864 sprintf(realbuffer, fmt, va_arg(vargs, int));
865 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000866 break;
867 case 'x':
Walter Dörwald346737f2007-05-31 10:44:43 +0000868 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
869 sprintf(realbuffer, fmt, va_arg(vargs, int));
870 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000871 break;
872 case 's':
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000873 {
874 /* Parameter must be UTF-8 encoded.
875 In case of encoding errors, use
876 the replacement character. */
877 PyObject *u;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000878 p = va_arg(vargs, char*);
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000879 u = PyUnicode_DecodeUTF8(p, strlen(p),
880 "replace");
881 if (!u)
882 goto fail;
883 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
884 PyUnicode_GET_SIZE(u));
885 s += PyUnicode_GET_SIZE(u);
886 Py_DECREF(u);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000887 break;
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000888 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000889 case 'U':
890 {
891 PyObject *obj = va_arg(vargs, PyObject *);
Walter Dörwald5c2fab62007-05-24 19:51:02 +0000892 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
893 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
894 s += size;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000895 break;
896 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000897 case 'V':
898 {
899 PyObject *obj = va_arg(vargs, PyObject *);
900 const char *str = va_arg(vargs, const char *);
901 if (obj) {
902 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
903 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
904 s += size;
905 } else {
906 appendstring(str);
907 }
908 break;
909 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000910 case 'S':
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000911 case 'R':
912 {
Guido van Rossum755114a2007-06-13 01:04:27 +0000913 Py_UNICODE *ucopy;
914 Py_ssize_t usize;
915 Py_ssize_t upos;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000916 /* unused, since we already have the result */
917 (void) va_arg(vargs, PyObject *);
Guido van Rossum755114a2007-06-13 01:04:27 +0000918 ucopy = PyUnicode_AS_UNICODE(*callresult);
919 usize = PyUnicode_GET_SIZE(*callresult);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000920 for (upos = 0; upos<usize;)
921 *s++ = ucopy[upos++];
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000922 /* We're done with the unicode()/repr() => forget it */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000923 Py_DECREF(*callresult);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000924 /* switch to next unicode()/repr() result */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000925 ++callresult;
926 break;
927 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000928 case 'p':
929 sprintf(buffer, "%p", va_arg(vargs, void*));
930 /* %p is ill-defined: ensure leading 0x. */
931 if (buffer[1] == 'X')
932 buffer[1] = 'x';
933 else if (buffer[1] != 'x') {
934 memmove(buffer+2, buffer, strlen(buffer)+1);
935 buffer[0] = '0';
936 buffer[1] = 'x';
937 }
938 appendstring(buffer);
939 break;
940 case '%':
941 *s++ = '%';
942 break;
943 default:
944 appendstring(p);
945 goto end;
946 }
947 } else
948 *s++ = *f;
949 }
950
951 end:
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000952 if (callresults)
953 PyMem_Free(callresults);
Walter Dörwald346737f2007-05-31 10:44:43 +0000954 if (abuffer)
955 PyMem_Free(abuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000956 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
957 return string;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000958 fail:
959 if (callresults) {
960 PyObject **callresult2 = callresults;
Guido van Rossum307fa8c2007-07-16 20:46:27 +0000961 while (callresult2 < callresult) {
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000962 Py_DECREF(*callresult2);
963 ++callresult2;
964 }
965 PyMem_Free(callresults);
966 }
Walter Dörwald346737f2007-05-31 10:44:43 +0000967 if (abuffer)
968 PyMem_Free(abuffer);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000969 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000970}
971
972#undef appendstring
973
974PyObject *
975PyUnicode_FromFormat(const char *format, ...)
976{
977 PyObject* ret;
978 va_list vargs;
979
980#ifdef HAVE_STDARG_PROTOTYPES
981 va_start(vargs, format);
982#else
983 va_start(vargs);
984#endif
985 ret = PyUnicode_FromFormatV(format, vargs);
986 va_end(vargs);
987 return ret;
988}
989
Martin v. Löwis18e16552006-02-15 17:27:45 +0000990Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
991 wchar_t *w,
992 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000993{
994 if (unicode == NULL) {
995 PyErr_BadInternalCall();
996 return -1;
997 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000998
999 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001000 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001001 size = PyUnicode_GET_SIZE(unicode) + 1;
1002
Guido van Rossumd57fd912000-03-10 22:53:23 +00001003#ifdef HAVE_USABLE_WCHAR_T
1004 memcpy(w, unicode->str, size * sizeof(wchar_t));
1005#else
1006 {
1007 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001008 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001009 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +00001010 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001011 *w++ = *u++;
1012 }
1013#endif
1014
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001015 if (size > PyUnicode_GET_SIZE(unicode))
1016 return PyUnicode_GET_SIZE(unicode);
1017 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00001018 return size;
1019}
1020
1021#endif
1022
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001023PyObject *PyUnicode_FromOrdinal(int ordinal)
1024{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001025 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001026
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001027 if (ordinal < 0 || ordinal > 0x10ffff) {
1028 PyErr_SetString(PyExc_ValueError,
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001029 "chr() arg not in range(0x110000)");
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001030 return NULL;
1031 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001032
1033#ifndef Py_UNICODE_WIDE
1034 if (ordinal > 0xffff) {
1035 ordinal -= 0x10000;
1036 s[0] = 0xD800 | (ordinal >> 10);
1037 s[1] = 0xDC00 | (ordinal & 0x3FF);
1038 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001039 }
1040#endif
1041
Hye-Shik Chang40574832004-04-06 07:24:51 +00001042 s[0] = (Py_UNICODE)ordinal;
1043 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001044}
1045
Guido van Rossumd57fd912000-03-10 22:53:23 +00001046PyObject *PyUnicode_FromObject(register PyObject *obj)
1047{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001048 /* XXX Perhaps we should make this API an alias of
Thomas Heller519a0422007-11-15 20:48:54 +00001049 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001050 if (PyUnicode_CheckExact(obj)) {
1051 Py_INCREF(obj);
1052 return obj;
1053 }
1054 if (PyUnicode_Check(obj)) {
1055 /* For a Unicode subtype that's not a Unicode object,
1056 return a true Unicode object with the same data. */
1057 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1058 PyUnicode_GET_SIZE(obj));
1059 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001060 PyErr_Format(PyExc_TypeError,
1061 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001062 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001063 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001064}
1065
1066PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1067 const char *encoding,
1068 const char *errors)
1069{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001070 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001071 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001072 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001073
Guido van Rossumd57fd912000-03-10 22:53:23 +00001074 if (obj == NULL) {
1075 PyErr_BadInternalCall();
1076 return NULL;
1077 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001078
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001079 if (PyUnicode_Check(obj)) {
1080 PyErr_SetString(PyExc_TypeError,
1081 "decoding Unicode is not supported");
1082 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001083 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001084
1085 /* Coerce object */
1086 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001087 s = PyString_AS_STRING(obj);
1088 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001089 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001090 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1091 /* Overwrite the error message with something more useful in
1092 case of a TypeError. */
1093 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001094 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001095 "coercing to Unicode: need string or buffer, "
1096 "%.80s found",
Christian Heimes90aa7642007-12-19 02:45:37 +00001097 Py_TYPE(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001098 goto onError;
1099 }
Tim Petersced69f82003-09-16 20:30:58 +00001100
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001101 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001102 if (len == 0) {
1103 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001104 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001105 }
Tim Petersced69f82003-09-16 20:30:58 +00001106 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001107 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001108
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001109 return v;
1110
1111 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001112 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001113}
1114
1115PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001116 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001117 const char *encoding,
1118 const char *errors)
1119{
1120 PyObject *buffer = NULL, *unicode;
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001121 Py_buffer info;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001122 char lower[20]; /* Enough for any encoding name we recognize */
1123 char *l;
1124 const char *e;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001125
1126 if (encoding == NULL)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001127 encoding = PyUnicode_GetDefaultEncoding();
1128
1129 /* Convert encoding to lower case and replace '_' with '-' in order to
1130 catch e.g. UTF_8 */
1131 e = encoding;
1132 l = lower;
1133 while (*e && l < &lower[(sizeof lower) - 2]) {
1134 if (ISUPPER(*e)) {
1135 *l++ = TOLOWER(*e++);
1136 }
1137 else if (*e == '_') {
1138 *l++ = '-';
1139 e++;
1140 }
1141 else {
1142 *l++ = *e++;
1143 }
1144 }
1145 *l = '\0';
Fred Drakee4315f52000-05-09 19:53:39 +00001146
1147 /* Shortcuts for common default encodings */
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001148 if (strcmp(lower, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001149 return PyUnicode_DecodeUTF8(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001150 else if ((strcmp(lower, "latin-1") == 0) ||
1151 (strcmp(lower, "iso-8859-1") == 0))
Fred Drakee4315f52000-05-09 19:53:39 +00001152 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001153#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001154 else if (strcmp(lower, "mbcs") == 0)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001155 return PyUnicode_DecodeMBCS(s, size, errors);
1156#endif
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001157 else if (strcmp(lower, "ascii") == 0)
Fred Drakee4315f52000-05-09 19:53:39 +00001158 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001159 else if (strcmp(lower, "utf-16") == 0)
1160 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1161 else if (strcmp(lower, "utf-32") == 0)
1162 return PyUnicode_DecodeUTF32(s, size, errors, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001163
1164 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001165 buffer = NULL;
1166 if (PyBuffer_FillInfo(&info, (void *)s, size, 1, PyBUF_SIMPLE) < 0)
1167 goto onError;
1168 buffer = PyMemoryView_FromMemory(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001169 if (buffer == NULL)
1170 goto onError;
1171 unicode = PyCodec_Decode(buffer, encoding, errors);
1172 if (unicode == NULL)
1173 goto onError;
1174 if (!PyUnicode_Check(unicode)) {
1175 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001176 "decoder did not return an unicode object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001177 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001178 Py_DECREF(unicode);
1179 goto onError;
1180 }
1181 Py_DECREF(buffer);
1182 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001183
Guido van Rossumd57fd912000-03-10 22:53:23 +00001184 onError:
1185 Py_XDECREF(buffer);
1186 return NULL;
1187}
1188
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001189PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1190 const char *encoding,
1191 const char *errors)
1192{
1193 PyObject *v;
1194
1195 if (!PyUnicode_Check(unicode)) {
1196 PyErr_BadArgument();
1197 goto onError;
1198 }
1199
1200 if (encoding == NULL)
1201 encoding = PyUnicode_GetDefaultEncoding();
1202
1203 /* Decode via the codec registry */
1204 v = PyCodec_Decode(unicode, encoding, errors);
1205 if (v == NULL)
1206 goto onError;
1207 return v;
1208
1209 onError:
1210 return NULL;
1211}
1212
Guido van Rossumd57fd912000-03-10 22:53:23 +00001213PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001214 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001215 const char *encoding,
1216 const char *errors)
1217{
1218 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001219
Guido van Rossumd57fd912000-03-10 22:53:23 +00001220 unicode = PyUnicode_FromUnicode(s, size);
1221 if (unicode == NULL)
1222 return NULL;
1223 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1224 Py_DECREF(unicode);
1225 return v;
1226}
1227
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001228PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1229 const char *encoding,
1230 const char *errors)
1231{
1232 PyObject *v;
1233
1234 if (!PyUnicode_Check(unicode)) {
1235 PyErr_BadArgument();
1236 goto onError;
1237 }
1238
1239 if (encoding == NULL)
1240 encoding = PyUnicode_GetDefaultEncoding();
1241
1242 /* Encode via the codec registry */
1243 v = PyCodec_Encode(unicode, encoding, errors);
1244 if (v == NULL)
1245 goto onError;
1246 return v;
1247
1248 onError:
1249 return NULL;
1250}
1251
Guido van Rossumd57fd912000-03-10 22:53:23 +00001252PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1253 const char *encoding,
1254 const char *errors)
1255{
1256 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001257
Guido van Rossumd57fd912000-03-10 22:53:23 +00001258 if (!PyUnicode_Check(unicode)) {
1259 PyErr_BadArgument();
1260 goto onError;
1261 }
Fred Drakee4315f52000-05-09 19:53:39 +00001262
Tim Petersced69f82003-09-16 20:30:58 +00001263 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001264 encoding = PyUnicode_GetDefaultEncoding();
1265
1266 /* Shortcuts for common default encodings */
1267 if (errors == NULL) {
1268 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001269 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001270 else if (strcmp(encoding, "latin-1") == 0)
1271 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001272#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1273 else if (strcmp(encoding, "mbcs") == 0)
1274 return PyUnicode_AsMBCSString(unicode);
1275#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001276 else if (strcmp(encoding, "ascii") == 0)
1277 return PyUnicode_AsASCIIString(unicode);
1278 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001279
1280 /* Encode via the codec registry */
1281 v = PyCodec_Encode(unicode, encoding, errors);
1282 if (v == NULL)
1283 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00001284 assert(PyString_Check(v));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001285 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001286
Guido van Rossumd57fd912000-03-10 22:53:23 +00001287 onError:
1288 return NULL;
1289}
1290
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001291PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1292 const char *errors)
1293{
1294 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001295 if (v)
1296 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001297 if (errors != NULL)
1298 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001299 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001300 PyUnicode_GET_SIZE(unicode),
1301 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001302 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001303 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001304 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001305 return v;
1306}
1307
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001308PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001309PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001310 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001311 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1312}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001313
Christian Heimes5894ba72007-11-04 11:43:14 +00001314PyObject*
1315PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1316{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001317 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1318 can be undefined. If it is case, decode using UTF-8. The following assumes
1319 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1320 bootstrapping process where the codecs aren't ready yet.
1321 */
1322 if (Py_FileSystemDefaultEncoding) {
1323#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001324 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001325 return PyUnicode_DecodeMBCS(s, size, "replace");
1326 }
1327#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001328 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001329 return PyUnicode_DecodeUTF8(s, size, "replace");
1330 }
1331#endif
1332 return PyUnicode_Decode(s, size,
1333 Py_FileSystemDefaultEncoding,
1334 "replace");
1335 }
1336 else {
1337 return PyUnicode_DecodeUTF8(s, size, "replace");
1338 }
1339}
1340
Martin v. Löwis5b222132007-06-10 09:51:05 +00001341char*
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001342PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001343{
Christian Heimesf3863112007-11-22 07:46:41 +00001344 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001345 if (!PyUnicode_Check(unicode)) {
1346 PyErr_BadArgument();
1347 return NULL;
1348 }
Christian Heimesf3863112007-11-22 07:46:41 +00001349 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1350 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001351 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001352 if (psize != NULL)
Christian Heimesf3863112007-11-22 07:46:41 +00001353 *psize = PyString_GET_SIZE(bytes);
1354 return PyString_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001355}
1356
1357char*
1358PyUnicode_AsString(PyObject *unicode)
1359{
1360 return PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001361}
1362
Guido van Rossumd57fd912000-03-10 22:53:23 +00001363Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1364{
1365 if (!PyUnicode_Check(unicode)) {
1366 PyErr_BadArgument();
1367 goto onError;
1368 }
1369 return PyUnicode_AS_UNICODE(unicode);
1370
1371 onError:
1372 return NULL;
1373}
1374
Martin v. Löwis18e16552006-02-15 17:27:45 +00001375Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001376{
1377 if (!PyUnicode_Check(unicode)) {
1378 PyErr_BadArgument();
1379 goto onError;
1380 }
1381 return PyUnicode_GET_SIZE(unicode);
1382
1383 onError:
1384 return -1;
1385}
1386
Thomas Wouters78890102000-07-22 19:25:51 +00001387const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001388{
1389 return unicode_default_encoding;
1390}
1391
1392int PyUnicode_SetDefaultEncoding(const char *encoding)
1393{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001394 if (strcmp(encoding, unicode_default_encoding) != 0) {
1395 PyErr_Format(PyExc_ValueError,
1396 "Can only set default encoding to %s",
1397 unicode_default_encoding);
1398 return -1;
1399 }
Fred Drakee4315f52000-05-09 19:53:39 +00001400 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001401}
1402
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001403/* error handling callback helper:
1404 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001405 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001406 and adjust various state variables.
1407 return 0 on success, -1 on error
1408*/
1409
1410static
1411int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1412 const char *encoding, const char *reason,
Walter Dörwalda651d3d2007-08-30 15:29:21 +00001413 const char **input, const char **inend, Py_ssize_t *startinpos,
1414 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001415 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001416{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001417 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001418
1419 PyObject *restuple = NULL;
1420 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001421 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001422 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001423 Py_ssize_t requiredsize;
1424 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001425 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001426 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001427 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001428 int res = -1;
1429
1430 if (*errorHandler == NULL) {
1431 *errorHandler = PyCodec_LookupError(errors);
1432 if (*errorHandler == NULL)
1433 goto onError;
1434 }
1435
1436 if (*exceptionObject == NULL) {
1437 *exceptionObject = PyUnicodeDecodeError_Create(
Walter Dörwalde78178e2007-07-30 13:31:40 +00001438 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001439 if (*exceptionObject == NULL)
1440 goto onError;
1441 }
1442 else {
1443 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1444 goto onError;
1445 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1446 goto onError;
1447 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1448 goto onError;
1449 }
1450
1451 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1452 if (restuple == NULL)
1453 goto onError;
1454 if (!PyTuple_Check(restuple)) {
1455 PyErr_Format(PyExc_TypeError, &argparse[4]);
1456 goto onError;
1457 }
1458 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1459 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001460
1461 /* Copy back the bytes variables, which might have been modified by the
1462 callback */
1463 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1464 if (!inputobj)
1465 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00001466 if (!PyString_Check(inputobj)) {
Walter Dörwalde78178e2007-07-30 13:31:40 +00001467 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
1468 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001469 *input = PyString_AS_STRING(inputobj);
1470 insize = PyString_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001471 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001472 /* we can DECREF safely, as the exception has another reference,
1473 so the object won't go away. */
1474 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001475
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001476 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001477 newpos = insize+newpos;
1478 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001479 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001480 goto onError;
1481 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001482
1483 /* need more space? (at least enough for what we
1484 have+the replacement+the rest of the string (starting
1485 at the new input position), so we won't have to check space
1486 when there are no errors in the rest of the string) */
1487 repptr = PyUnicode_AS_UNICODE(repunicode);
1488 repsize = PyUnicode_GET_SIZE(repunicode);
1489 requiredsize = *outpos + repsize + insize-newpos;
1490 if (requiredsize > outsize) {
1491 if (requiredsize<2*outsize)
1492 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001493 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001494 goto onError;
1495 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1496 }
1497 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001498 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001499 Py_UNICODE_COPY(*outptr, repptr, repsize);
1500 *outptr += repsize;
1501 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001502
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001503 /* we made it! */
1504 res = 0;
1505
1506 onError:
1507 Py_XDECREF(restuple);
1508 return res;
1509}
1510
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001511/* --- UTF-7 Codec -------------------------------------------------------- */
1512
1513/* see RFC2152 for details */
1514
/* Classification of the 128 ASCII characters for the UTF-7 codec.

   Indicates whether a UTF-7 character is special, i.e. cannot be directly
   encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
static
char utf7_special[128] = {
    /* 0x00 - 0x0f */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */
    2, 3, 3, 3, 3, 3, 3, 0,  0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f */
    3, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f */
    3, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 3, 3, 3, 1, 1,
};
1533
/* Helper macros for the UTF-7 codec.  They are #undef'ed again once the
   codec functions have been defined. */

/* SPECIAL(c, encodeO, encodeWS) is true when code unit c may not be
   written directly and has to go into a base64 shift sequence.  encodeO
   and encodeWS additionally force RFC2152 Set O characters (class 3)
   and whitespace (class 2) to be treated as special.  The argument c is
   evaluated several times, so it must not have side effects.

   Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* B64(n): base64 digit for the low six bits of n. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
/* B64CHAR(c): true when c is one of the 64 base64 digits. */
#define B64CHAR(c) \
    (ISALNUM(c) || (c) == '+' || (c) == '/')
/* UB64(c): six-bit value of base64 digit c; only meaningful when
   B64CHAR(c) holds.  Digits '0'-'9' map to 52-61 via (c) + 4. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE(out, ch, bits): while at least six of the `bits` low-order bits
   of accumulator ch are pending, write the topmost six as a base64 digit
   through out (which is advanced) and decrease bits by six.  Fewer than
   six bits are left pending afterwards.
   Wrapped in do { } while (0) so that the macro behaves like a single
   statement, e.g. inside an unbraced if/else. */
#define ENCODE(out, ch, bits) \
    do { \
        while ((bits) >= 6) { \
            *(out)++ = B64((ch) >> ((bits)-6)); \
            (bits) -= 6; \
        } \
    } while (0)

/* DECODE(out, ch, bits, surrogate): while at least 16 bits are pending in
   accumulator ch, extract the topmost 16 as one Py_UNICODE and decrease
   bits by 16.  A code unit in the low-surrogate range 0xDC00-0xDFFF sets
   `surrogate`, stores a message in the caller's `errmsg` variable and
   jumps to the caller's `utf7Error` label; the code unit that follows
   such an error is dropped and clears the flag again.  All other code
   units are stored through out.
   The macro therefore requires `errmsg` and `utf7Error` to exist in the
   function that uses it. */
#define DECODE(out, ch, bits, surrogate) \
    do { \
        while ((bits) >= 16) { \
            Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits)-16)) & 0xffff); \
            (bits) -= 16; \
            if (surrogate) { \
                /* An error has already been generated for the preceding \
                   surrogate code unit, so this unit is skipped without \
                   any further checking. */ \
                (surrogate) = 0; \
            } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
                /* A code unit from the low-surrogate range: code pairs \
                   are not supported by this decoder. */ \
                (surrogate) = 1; \
                errmsg = "code pairs are not supported"; \
                goto utf7Error; \
            } else { \
                *(out)++ = outCh; \
            } \
        } \
    } while (0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001576
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001577PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001578 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001579 const char *errors)
1580{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001581 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1582}
1583
/* Decode `size` bytes of UTF-7 data at `s` into a new Unicode object.

   errors    -- error handling scheme name, passed on unchanged to
                unicode_decode_call_errorhandler().
   consumed  -- if not NULL, an input that ends inside a shift sequence is
                not reported as an error; instead *consumed receives the
                number of input bytes that were decoded.

   Returns the new object, or NULL if allocation, the error handler or the
   final resize fails.

   The DECODE() macro used below may store a message in `errmsg` and jump
   to the `utf7Error` label. */
PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
                                       Py_ssize_t size,
                                       const char *errors,
                                       Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyUnicodeObject *unicode;
    Py_UNICODE *p;
    const char *errmsg = "";
    int inShift = 0;                /* non-zero while inside a '+' shift sequence */
    unsigned int bitsleft = 0;      /* number of pending bits in charsleft */
    unsigned long charsleft = 0;    /* accumulator for decoded base64 bits */
    int surrogate = 0;              /* set by DECODE() after a surrogate error */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* One output code unit per input byte is allocated up front; the
       object is cut back to the real length at the end. */
    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return (PyObject *)unicode;
    }

    p = unicode->str;
    e = s + size;

    while (s < e) {
        Py_UNICODE ch;
        /* Re-entry point used after the "unterminated shift sequence"
           handler below left input to process. */
        restart:
        ch = *s;

        if (inShift) {
            if ((ch == '-') || !B64CHAR(ch)) {
                /* Any non-base64 character (or an explicit '-') ends the
                   shift sequence. */
                inShift = 0;
                s++;

                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
                if (bitsleft >= 6) {
                    /* The shift sequence has a partial character in it. If
                       bitsleft < 6 then we could just classify it as padding
                       but that is not the case here */

                    errmsg = "partial character in shift sequence";
                    goto utf7Error;
                }
                /* According to RFC2152 the remaining bits should be zero. We
                   choose to signal an error/insert a replacement character
                   here to indicate the potential of a misencoded character. */

                /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
                if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
                    errmsg = "non-zero padding bits in shift sequence";
                    goto utf7Error;
                }

                if (ch == '-') {
                    /* The terminating '-' itself is absorbed.  If another
                       '-' follows, a literal '-' is emitted now and inShift
                       is set again, so that the next iteration takes this
                       same branch and consumes that second '-'. */
                    if ((s < e) && (*(s) == '-')) {
                        *p++ = '-';
                        inShift = 1;
                    }
                } else if (SPECIAL(ch,0,0)) {
                    errmsg = "unexpected special character";
                    goto utf7Error;
                } else {
                    /* The character that implicitly ended the shift
                       sequence is ordinary output. */
                    *p++ = ch;
                }
            } else {
                /* Another base64 digit: shift its six bits into the
                   accumulator and emit every complete 16-bit unit. */
                charsleft = (charsleft << 6) | UB64(ch);
                bitsleft += 6;
                s++;
                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
            }
        }
        else if ( ch == '+' ) {
            /* Remember where the sequence started; used for error
               reporting and for *consumed. */
            startinpos = s-starts;
            s++;
            if (s < e && *s == '-') {
                /* "+-" encodes a literal '+' */
                s++;
                *p++ = '+';
            } else
            {
                inShift = 1;
                bitsleft = 0;
            }
        }
        else if (SPECIAL(ch,0,0)) {
            startinpos = s-starts;
            errmsg = "unexpected special character";
            s++;
            goto utf7Error;
        }
        else {
            /* directly encoded character */
            *p++ = ch;
            s++;
        }
        continue;
    utf7Error:
        /* The reported error span runs from startinpos up to the current
           input position.  The helper receives the addresses of the input
           and output cursors (s, p) and of the output object. */
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                (PyObject **)&unicode, &outpos, &p))
            goto onError;
    }

    /* Input ended inside a shift sequence.  Without a `consumed` pointer
       this is reported as an error. */
    if (inShift && !consumed) {
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        endinpos = size;
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf7", "unterminated shift sequence",
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                (PyObject **)&unicode, &outpos, &p))
            goto onError;
        /* Jump back into the loop body if input is left after the
           handler call. */
        if (s < e)
            goto restart;
    }
    if (consumed) {
        /* With an unfinished shift sequence, report startinpos --
           normally the offset of the '+' that opened it -- so that only
           the input before the sequence counts as consumed. */
        if(inShift)
            *consumed = startinpos;
        else
            *consumed = s-starts;
    }

    /* Cut the object back to the number of code units actually written. */
    if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)unicode;

onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_DECREF(unicode);
    return NULL;
}
1729
1730
1731PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001732 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001733 int encodeSetO,
1734 int encodeWhiteSpace,
1735 const char *errors)
1736{
Guido van Rossum98297ee2007-11-06 21:34:58 +00001737 PyObject *v, *result;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001738 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001739 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001740 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001741 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001742 unsigned int bitsleft = 0;
1743 unsigned long charsleft = 0;
1744 char * out;
1745 char * start;
1746
1747 if (size == 0)
Christian Heimesf3863112007-11-22 07:46:41 +00001748 return PyString_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001749
Walter Dörwald51ab4142007-05-05 14:43:36 +00001750 v = PyBytes_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001751 if (v == NULL)
1752 return NULL;
1753
Walter Dörwald51ab4142007-05-05 14:43:36 +00001754 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001755 for (;i < size; ++i) {
1756 Py_UNICODE ch = s[i];
1757
1758 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001759 if (ch == '+') {
1760 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001761 *out++ = '-';
1762 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1763 charsleft = ch;
1764 bitsleft = 16;
1765 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001766 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001767 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001768 } else {
1769 *out++ = (char) ch;
1770 }
1771 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001772 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1773 *out++ = B64(charsleft << (6-bitsleft));
1774 charsleft = 0;
1775 bitsleft = 0;
1776 /* Characters not in the BASE64 set implicitly unshift the sequence
1777 so no '-' is required, except if the character is itself a '-' */
1778 if (B64CHAR(ch) || ch == '-') {
1779 *out++ = '-';
1780 }
1781 inShift = 0;
1782 *out++ = (char) ch;
1783 } else {
1784 bitsleft += 16;
1785 charsleft = (charsleft << 16) | ch;
1786 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1787
1788 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001789 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001790 or '-' then the shift sequence will be terminated implicitly and we
1791 don't have to insert a '-'. */
1792
1793 if (bitsleft == 0) {
1794 if (i + 1 < size) {
1795 Py_UNICODE ch2 = s[i+1];
1796
1797 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001798
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001799 } else if (B64CHAR(ch2) || ch2 == '-') {
1800 *out++ = '-';
1801 inShift = 0;
1802 } else {
1803 inShift = 0;
1804 }
1805
1806 }
1807 else {
1808 *out++ = '-';
1809 inShift = 0;
1810 }
1811 }
Tim Petersced69f82003-09-16 20:30:58 +00001812 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001813 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001814 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001815 if (bitsleft) {
1816 *out++= B64(charsleft << (6-bitsleft) );
1817 *out++ = '-';
1818 }
1819
Guido van Rossum98297ee2007-11-06 21:34:58 +00001820 result = PyString_FromStringAndSize(PyBytes_AS_STRING(v), out - start);
1821 Py_DECREF(v);
1822 return result;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001823}
1824
/* The helper macros above are only meant for the UTF-7 codec functions;
   remove them so the names do not stay defined for the rest of the file. */
#undef SPECIAL
#undef B64
#undef B64CHAR
#undef UB64
#undef ENCODE
#undef DECODE
1831
Guido van Rossumd57fd912000-03-10 22:53:23 +00001832/* --- UTF-8 Codec -------------------------------------------------------- */
1833
/* Map UTF-8 encoded prefix byte to sequence length.  zero means
   illegal prefix.  see RFC 2279 for details.
   The table is only read, so it is declared const. */
static
const char utf8_code_length[256] = {
    /* 0x00-0x7F: single-byte (ASCII) characters */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xBF: continuation bytes, illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xDF: start of a two-byte sequence */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF: start of a three-byte sequence */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xF7: four bytes; 0xF8-0xFB: five; 0xFC-0xFD: six;
       0xFE and 0xFF are illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1855
Guido van Rossumd57fd912000-03-10 22:53:23 +00001856PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001857 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001858 const char *errors)
1859{
Walter Dörwald69652032004-09-07 20:24:22 +00001860 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1861}
1862
1863PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001864 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001865 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001866 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001867{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001868 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001869 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001870 Py_ssize_t startinpos;
1871 Py_ssize_t endinpos;
1872 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001873 const char *e;
1874 PyUnicodeObject *unicode;
1875 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001876 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001877 PyObject *errorHandler = NULL;
1878 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001879
1880 /* Note: size will always be longer than the resulting Unicode
1881 character count */
1882 unicode = _PyUnicode_New(size);
1883 if (!unicode)
1884 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001885 if (size == 0) {
1886 if (consumed)
1887 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001888 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001889 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001890
1891 /* Unpack UTF-8 encoded data */
1892 p = unicode->str;
1893 e = s + size;
1894
1895 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001896 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001897
1898 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001899 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001900 s++;
1901 continue;
1902 }
1903
1904 n = utf8_code_length[ch];
1905
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001906 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001907 if (consumed)
1908 break;
1909 else {
1910 errmsg = "unexpected end of data";
1911 startinpos = s-starts;
1912 endinpos = size;
1913 goto utf8Error;
1914 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001915 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001916
1917 switch (n) {
1918
1919 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001920 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001921 startinpos = s-starts;
1922 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001923 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001924
1925 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001926 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001927 startinpos = s-starts;
1928 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001929 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001930
1931 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001932 if ((s[1] & 0xc0) != 0x80) {
1933 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001934 startinpos = s-starts;
1935 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001936 goto utf8Error;
1937 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001938 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001939 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001940 startinpos = s-starts;
1941 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001942 errmsg = "illegal encoding";
1943 goto utf8Error;
1944 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001945 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001946 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001947 break;
1948
1949 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001950 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001951 (s[2] & 0xc0) != 0x80) {
1952 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001953 startinpos = s-starts;
1954 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001955 goto utf8Error;
1956 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001957 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001958 if (ch < 0x0800) {
1959 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001960 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001961
1962 XXX For wide builds (UCS-4) we should probably try
1963 to recombine the surrogates into a single code
1964 unit.
1965 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001966 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001967 startinpos = s-starts;
1968 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001969 goto utf8Error;
1970 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001971 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001972 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001973 break;
1974
1975 case 4:
1976 if ((s[1] & 0xc0) != 0x80 ||
1977 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001978 (s[3] & 0xc0) != 0x80) {
1979 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001980 startinpos = s-starts;
1981 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001982 goto utf8Error;
1983 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001984 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1985 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1986 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001987 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001988 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001989 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001990 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001991 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001992 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001993 startinpos = s-starts;
1994 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001995 goto utf8Error;
1996 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001997#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001998 *p++ = (Py_UNICODE)ch;
1999#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002000 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002001
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002002 /* translate from 10000..10FFFF to 0..FFFF */
2003 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002004
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002005 /* high surrogate = top 10 bits added to D800 */
2006 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002007
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002008 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002009 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002010#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002011 break;
2012
2013 default:
2014 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002015 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002016 startinpos = s-starts;
2017 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002018 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002019 }
2020 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002021 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002022
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002023 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002024 outpos = p-PyUnicode_AS_UNICODE(unicode);
2025 if (unicode_decode_call_errorhandler(
2026 errors, &errorHandler,
2027 "utf8", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002028 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002029 (PyObject **)&unicode, &outpos, &p))
2030 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002031 }
Walter Dörwald69652032004-09-07 20:24:22 +00002032 if (consumed)
2033 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002034
2035 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002036 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002037 goto onError;
2038
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002039 Py_XDECREF(errorHandler);
2040 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002041 return (PyObject *)unicode;
2042
2043onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002044 Py_XDECREF(errorHandler);
2045 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002046 Py_DECREF(unicode);
2047 return NULL;
2048}
2049
Tim Peters602f7402002-04-27 18:03:26 +00002050/* Allocation strategy: if the string is short, convert into a stack buffer
2051 and allocate exactly as much space needed at the end. Else allocate the
2052 maximum possible needed (4 result bytes per Unicode character), and return
2053 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002054*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002055PyObject *
2056PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002057 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00002058 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002059{
Tim Peters602f7402002-04-27 18:03:26 +00002060#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002061
Guido van Rossum98297ee2007-11-06 21:34:58 +00002062 Py_ssize_t i; /* index into s of next input byte */
2063 PyObject *result; /* result string object */
2064 char *p; /* next free byte in output buffer */
2065 Py_ssize_t nallocated; /* number of result bytes allocated */
2066 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002067 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002068
Tim Peters602f7402002-04-27 18:03:26 +00002069 assert(s != NULL);
2070 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002071
Tim Peters602f7402002-04-27 18:03:26 +00002072 if (size <= MAX_SHORT_UNICHARS) {
2073 /* Write into the stack buffer; nallocated can't overflow.
2074 * At the end, we'll allocate exactly as much heap space as it
2075 * turns out we need.
2076 */
2077 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002078 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002079 p = stackbuf;
2080 }
2081 else {
2082 /* Overallocate on the heap, and give the excess back at the end. */
2083 nallocated = size * 4;
2084 if (nallocated / 4 != size) /* overflow! */
2085 return PyErr_NoMemory();
Guido van Rossum98297ee2007-11-06 21:34:58 +00002086 result = PyString_FromStringAndSize(NULL, nallocated);
2087 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002088 return NULL;
Guido van Rossum98297ee2007-11-06 21:34:58 +00002089 p = PyString_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002090 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002091
Tim Peters602f7402002-04-27 18:03:26 +00002092 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002093 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002094
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002095 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002096 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002097 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002098
Guido van Rossumd57fd912000-03-10 22:53:23 +00002099 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002100 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002101 *p++ = (char)(0xc0 | (ch >> 6));
2102 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002103 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002104 else {
Tim Peters602f7402002-04-27 18:03:26 +00002105 /* Encode UCS2 Unicode ordinals */
2106 if (ch < 0x10000) {
2107 /* Special case: check for high surrogate */
2108 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2109 Py_UCS4 ch2 = s[i];
2110 /* Check for low surrogate and combine the two to
2111 form a UCS4 value */
2112 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002113 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002114 i++;
2115 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002116 }
Tim Peters602f7402002-04-27 18:03:26 +00002117 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002118 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002119 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002120 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2121 *p++ = (char)(0x80 | (ch & 0x3f));
2122 continue;
2123 }
2124encodeUCS4:
2125 /* Encode UCS4 Unicode ordinals */
2126 *p++ = (char)(0xf0 | (ch >> 18));
2127 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2128 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2129 *p++ = (char)(0x80 | (ch & 0x3f));
2130 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002131 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002132
Guido van Rossum98297ee2007-11-06 21:34:58 +00002133 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002134 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002135 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002136 assert(nneeded <= nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002137 result = PyString_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002138 }
2139 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002140 /* Cut back to size actually needed. */
Guido van Rossum98297ee2007-11-06 21:34:58 +00002141 nneeded = p - PyString_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002142 assert(nneeded <= nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002143 _PyString_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002144 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002145 return result;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002146
Tim Peters602f7402002-04-27 18:03:26 +00002147#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002148}
2149
Guido van Rossumd57fd912000-03-10 22:53:23 +00002150PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2151{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002152 if (!PyUnicode_Check(unicode)) {
2153 PyErr_BadArgument();
2154 return NULL;
2155 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002156 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2157 PyUnicode_GET_SIZE(unicode),
2158 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002159}
2160
Walter Dörwald41980ca2007-08-16 21:55:45 +00002161/* --- UTF-32 Codec ------------------------------------------------------- */
2162
2163PyObject *
2164PyUnicode_DecodeUTF32(const char *s,
2165 Py_ssize_t size,
2166 const char *errors,
2167 int *byteorder)
2168{
2169 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2170}
2171
2172PyObject *
2173PyUnicode_DecodeUTF32Stateful(const char *s,
2174 Py_ssize_t size,
2175 const char *errors,
2176 int *byteorder,
2177 Py_ssize_t *consumed)
2178{
2179 const char *starts = s;
2180 Py_ssize_t startinpos;
2181 Py_ssize_t endinpos;
2182 Py_ssize_t outpos;
2183 PyUnicodeObject *unicode;
2184 Py_UNICODE *p;
2185#ifndef Py_UNICODE_WIDE
2186 int i, pairs;
2187#else
2188 const int pairs = 0;
2189#endif
2190 const unsigned char *q, *e;
2191 int bo = 0; /* assume native ordering by default */
2192 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002193 /* Offsets from q for retrieving bytes in the right order. */
2194#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2195 int iorder[] = {0, 1, 2, 3};
2196#else
2197 int iorder[] = {3, 2, 1, 0};
2198#endif
2199 PyObject *errorHandler = NULL;
2200 PyObject *exc = NULL;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002201 /* On narrow builds we split characters outside the BMP into two
2202 codepoints => count how much extra space we need. */
2203#ifndef Py_UNICODE_WIDE
2204 for (i = pairs = 0; i < size/4; i++)
2205 if (((Py_UCS4 *)s)[i] >= 0x10000)
2206 pairs++;
2207#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00002208
2209 /* This might be one to much, because of a BOM */
2210 unicode = _PyUnicode_New((size+3)/4+pairs);
2211 if (!unicode)
2212 return NULL;
2213 if (size == 0)
2214 return (PyObject *)unicode;
2215
2216 /* Unpack UTF-32 encoded data */
2217 p = unicode->str;
2218 q = (unsigned char *)s;
2219 e = q + size;
2220
2221 if (byteorder)
2222 bo = *byteorder;
2223
2224 /* Check for BOM marks (U+FEFF) in the input and adjust current
2225 byte order setting accordingly. In native mode, the leading BOM
2226 mark is skipped, in all other modes, it is copied to the output
2227 stream as-is (giving a ZWNBSP character). */
2228 if (bo == 0) {
2229 if (size >= 4) {
2230 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2231 (q[iorder[1]] << 8) | q[iorder[0]];
2232#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2233 if (bom == 0x0000FEFF) {
2234 q += 4;
2235 bo = -1;
2236 }
2237 else if (bom == 0xFFFE0000) {
2238 q += 4;
2239 bo = 1;
2240 }
2241#else
2242 if (bom == 0x0000FEFF) {
2243 q += 4;
2244 bo = 1;
2245 }
2246 else if (bom == 0xFFFE0000) {
2247 q += 4;
2248 bo = -1;
2249 }
2250#endif
2251 }
2252 }
2253
2254 if (bo == -1) {
2255 /* force LE */
2256 iorder[0] = 0;
2257 iorder[1] = 1;
2258 iorder[2] = 2;
2259 iorder[3] = 3;
2260 }
2261 else if (bo == 1) {
2262 /* force BE */
2263 iorder[0] = 3;
2264 iorder[1] = 2;
2265 iorder[2] = 1;
2266 iorder[3] = 0;
2267 }
2268
2269 while (q < e) {
2270 Py_UCS4 ch;
2271 /* remaining bytes at the end? (size should be divisible by 4) */
2272 if (e-q<4) {
2273 if (consumed)
2274 break;
2275 errmsg = "truncated data";
2276 startinpos = ((const char *)q)-starts;
2277 endinpos = ((const char *)e)-starts;
2278 goto utf32Error;
2279 /* The remaining input chars are ignored if the callback
2280 chooses to skip the input */
2281 }
2282 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2283 (q[iorder[1]] << 8) | q[iorder[0]];
2284
2285 if (ch >= 0x110000)
2286 {
2287 errmsg = "codepoint not in range(0x110000)";
2288 startinpos = ((const char *)q)-starts;
2289 endinpos = startinpos+4;
2290 goto utf32Error;
2291 }
2292#ifndef Py_UNICODE_WIDE
2293 if (ch >= 0x10000)
2294 {
2295 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2296 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2297 }
2298 else
2299#endif
2300 *p++ = ch;
2301 q += 4;
2302 continue;
2303 utf32Error:
2304 outpos = p-PyUnicode_AS_UNICODE(unicode);
2305 if (unicode_decode_call_errorhandler(
2306 errors, &errorHandler,
2307 "utf32", errmsg,
2308 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2309 (PyObject **)&unicode, &outpos, &p))
2310 goto onError;
2311 }
2312
2313 if (byteorder)
2314 *byteorder = bo;
2315
2316 if (consumed)
2317 *consumed = (const char *)q-starts;
2318
2319 /* Adjust length */
2320 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2321 goto onError;
2322
2323 Py_XDECREF(errorHandler);
2324 Py_XDECREF(exc);
2325 return (PyObject *)unicode;
2326
2327onError:
2328 Py_DECREF(unicode);
2329 Py_XDECREF(errorHandler);
2330 Py_XDECREF(exc);
2331 return NULL;
2332}
2333
2334PyObject *
2335PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2336 Py_ssize_t size,
2337 const char *errors,
2338 int byteorder)
2339{
Guido van Rossum98297ee2007-11-06 21:34:58 +00002340 PyObject *v, *result;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002341 unsigned char *p;
2342#ifndef Py_UNICODE_WIDE
2343 int i, pairs;
2344#else
2345 const int pairs = 0;
2346#endif
2347 /* Offsets from p for storing byte pairs in the right order. */
2348#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2349 int iorder[] = {0, 1, 2, 3};
2350#else
2351 int iorder[] = {3, 2, 1, 0};
2352#endif
2353
2354#define STORECHAR(CH) \
2355 do { \
2356 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2357 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2358 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2359 p[iorder[0]] = (CH) & 0xff; \
2360 p += 4; \
2361 } while(0)
2362
2363 /* In narrow builds we can output surrogate pairs as one codepoint,
2364 so we need less space. */
2365#ifndef Py_UNICODE_WIDE
2366 for (i = pairs = 0; i < size-1; i++)
2367 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2368 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2369 pairs++;
2370#endif
2371 v = PyBytes_FromStringAndSize(NULL,
2372 4 * (size - pairs + (byteorder == 0)));
2373 if (v == NULL)
2374 return NULL;
2375
2376 p = (unsigned char *)PyBytes_AS_STRING(v);
2377 if (byteorder == 0)
2378 STORECHAR(0xFEFF);
2379 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002380 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002381
2382 if (byteorder == -1) {
2383 /* force LE */
2384 iorder[0] = 0;
2385 iorder[1] = 1;
2386 iorder[2] = 2;
2387 iorder[3] = 3;
2388 }
2389 else if (byteorder == 1) {
2390 /* force BE */
2391 iorder[0] = 3;
2392 iorder[1] = 2;
2393 iorder[2] = 1;
2394 iorder[3] = 0;
2395 }
2396
2397 while (size-- > 0) {
2398 Py_UCS4 ch = *s++;
2399#ifndef Py_UNICODE_WIDE
2400 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2401 Py_UCS4 ch2 = *s;
2402 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2403 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2404 s++;
2405 size--;
2406 }
2407 }
2408#endif
2409 STORECHAR(ch);
2410 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002411
2412 done:
Christian Heimes90aa7642007-12-19 02:45:37 +00002413 result = PyString_FromStringAndSize(PyBytes_AS_STRING(v), Py_SIZE(v));
Guido van Rossum98297ee2007-11-06 21:34:58 +00002414 Py_DECREF(v);
2415 return result;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002416#undef STORECHAR
2417}
2418
2419PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2420{
2421 if (!PyUnicode_Check(unicode)) {
2422 PyErr_BadArgument();
2423 return NULL;
2424 }
2425 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2426 PyUnicode_GET_SIZE(unicode),
2427 NULL,
2428 0);
2429}
2430
Guido van Rossumd57fd912000-03-10 22:53:23 +00002431/* --- UTF-16 Codec ------------------------------------------------------- */
2432
Tim Peters772747b2001-08-09 22:21:55 +00002433PyObject *
2434PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002435 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002436 const char *errors,
2437 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002438{
Walter Dörwald69652032004-09-07 20:24:22 +00002439 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2440}
2441
2442PyObject *
2443PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002444 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002445 const char *errors,
2446 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002447 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002448{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002449 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002450 Py_ssize_t startinpos;
2451 Py_ssize_t endinpos;
2452 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002453 PyUnicodeObject *unicode;
2454 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002455 const unsigned char *q, *e;
2456 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002457 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002458 /* Offsets from q for retrieving byte pairs in the right order. */
2459#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2460 int ihi = 1, ilo = 0;
2461#else
2462 int ihi = 0, ilo = 1;
2463#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002464 PyObject *errorHandler = NULL;
2465 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002466
2467 /* Note: size will always be longer than the resulting Unicode
2468 character count */
2469 unicode = _PyUnicode_New(size);
2470 if (!unicode)
2471 return NULL;
2472 if (size == 0)
2473 return (PyObject *)unicode;
2474
2475 /* Unpack UTF-16 encoded data */
2476 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002477 q = (unsigned char *)s;
2478 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002479
2480 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002481 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002482
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002483 /* Check for BOM marks (U+FEFF) in the input and adjust current
2484 byte order setting accordingly. In native mode, the leading BOM
2485 mark is skipped, in all other modes, it is copied to the output
2486 stream as-is (giving a ZWNBSP character). */
2487 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002488 if (size >= 2) {
2489 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002490#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002491 if (bom == 0xFEFF) {
2492 q += 2;
2493 bo = -1;
2494 }
2495 else if (bom == 0xFFFE) {
2496 q += 2;
2497 bo = 1;
2498 }
Tim Petersced69f82003-09-16 20:30:58 +00002499#else
Walter Dörwald69652032004-09-07 20:24:22 +00002500 if (bom == 0xFEFF) {
2501 q += 2;
2502 bo = 1;
2503 }
2504 else if (bom == 0xFFFE) {
2505 q += 2;
2506 bo = -1;
2507 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002508#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002509 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002510 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002511
Tim Peters772747b2001-08-09 22:21:55 +00002512 if (bo == -1) {
2513 /* force LE */
2514 ihi = 1;
2515 ilo = 0;
2516 }
2517 else if (bo == 1) {
2518 /* force BE */
2519 ihi = 0;
2520 ilo = 1;
2521 }
2522
2523 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002524 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002525 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002526 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002527 if (consumed)
2528 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002529 errmsg = "truncated data";
2530 startinpos = ((const char *)q)-starts;
2531 endinpos = ((const char *)e)-starts;
2532 goto utf16Error;
2533 /* The remaining input chars are ignored if the callback
2534 chooses to skip the input */
2535 }
2536 ch = (q[ihi] << 8) | q[ilo];
2537
Tim Peters772747b2001-08-09 22:21:55 +00002538 q += 2;
2539
Guido van Rossumd57fd912000-03-10 22:53:23 +00002540 if (ch < 0xD800 || ch > 0xDFFF) {
2541 *p++ = ch;
2542 continue;
2543 }
2544
2545 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002546 if (q >= e) {
2547 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002548 startinpos = (((const char *)q)-2)-starts;
2549 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002550 goto utf16Error;
2551 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002552 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002553 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2554 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002555 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002556#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002557 *p++ = ch;
2558 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002559#else
2560 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002561#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002562 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002563 }
2564 else {
2565 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002566 startinpos = (((const char *)q)-4)-starts;
2567 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002568 goto utf16Error;
2569 }
2570
Guido van Rossumd57fd912000-03-10 22:53:23 +00002571 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002572 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002573 startinpos = (((const char *)q)-2)-starts;
2574 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002575 /* Fall through to report the error */
2576
2577 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002578 outpos = p-PyUnicode_AS_UNICODE(unicode);
2579 if (unicode_decode_call_errorhandler(
2580 errors, &errorHandler,
2581 "utf16", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002582 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002583 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002584 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002585 }
2586
2587 if (byteorder)
2588 *byteorder = bo;
2589
Walter Dörwald69652032004-09-07 20:24:22 +00002590 if (consumed)
2591 *consumed = (const char *)q-starts;
2592
Guido van Rossumd57fd912000-03-10 22:53:23 +00002593 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002594 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002595 goto onError;
2596
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002597 Py_XDECREF(errorHandler);
2598 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002599 return (PyObject *)unicode;
2600
2601onError:
2602 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002603 Py_XDECREF(errorHandler);
2604 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002605 return NULL;
2606}
2607
Tim Peters772747b2001-08-09 22:21:55 +00002608PyObject *
2609PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002610 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002611 const char *errors,
2612 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002613{
Guido van Rossum98297ee2007-11-06 21:34:58 +00002614 PyObject *v, *result;
Tim Peters772747b2001-08-09 22:21:55 +00002615 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002616#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00002617 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002618#else
2619 const int pairs = 0;
2620#endif
Tim Peters772747b2001-08-09 22:21:55 +00002621 /* Offsets from p for storing byte pairs in the right order. */
2622#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2623 int ihi = 1, ilo = 0;
2624#else
2625 int ihi = 0, ilo = 1;
2626#endif
2627
2628#define STORECHAR(CH) \
2629 do { \
2630 p[ihi] = ((CH) >> 8) & 0xff; \
2631 p[ilo] = (CH) & 0xff; \
2632 p += 2; \
2633 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002634
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002635#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002636 for (i = pairs = 0; i < size; i++)
2637 if (s[i] >= 0x10000)
2638 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002639#endif
Walter Dörwald3cc34522007-05-04 10:48:27 +00002640 v = PyBytes_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002641 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002642 if (v == NULL)
2643 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002644
Walter Dörwald3cc34522007-05-04 10:48:27 +00002645 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002646 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002647 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002648 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002649 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00002650
2651 if (byteorder == -1) {
2652 /* force LE */
2653 ihi = 1;
2654 ilo = 0;
2655 }
2656 else if (byteorder == 1) {
2657 /* force BE */
2658 ihi = 0;
2659 ilo = 1;
2660 }
2661
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002662 while (size-- > 0) {
2663 Py_UNICODE ch = *s++;
2664 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002665#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002666 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002667 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2668 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002669 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002670#endif
Tim Peters772747b2001-08-09 22:21:55 +00002671 STORECHAR(ch);
2672 if (ch2)
2673 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002674 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002675
2676 done:
Christian Heimes90aa7642007-12-19 02:45:37 +00002677 result = PyString_FromStringAndSize(PyBytes_AS_STRING(v), Py_SIZE(v));
Guido van Rossum98297ee2007-11-06 21:34:58 +00002678 Py_DECREF(v);
2679 return result;
Tim Peters772747b2001-08-09 22:21:55 +00002680#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002681}
2682
2683PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2684{
2685 if (!PyUnicode_Check(unicode)) {
2686 PyErr_BadArgument();
2687 return NULL;
2688 }
2689 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2690 PyUnicode_GET_SIZE(unicode),
2691 NULL,
2692 0);
2693}
2694
2695/* --- Unicode Escape Codec ----------------------------------------------- */
2696
Fredrik Lundh06d12682001-01-24 07:59:11 +00002697static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002698
Guido van Rossumd57fd912000-03-10 22:53:23 +00002699PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002700 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002701 const char *errors)
2702{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002703 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002704 Py_ssize_t startinpos;
2705 Py_ssize_t endinpos;
2706 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002707 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002708 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002709 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002710 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002711 char* message;
2712 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002713 PyObject *errorHandler = NULL;
2714 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002715
Guido van Rossumd57fd912000-03-10 22:53:23 +00002716 /* Escaped strings will always be longer than the resulting
2717 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002718 length after conversion to the true value.
2719 (but if the error callback returns a long replacement string
2720 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002721 v = _PyUnicode_New(size);
2722 if (v == NULL)
2723 goto onError;
2724 if (size == 0)
2725 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002726
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002727 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002728 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002729
Guido van Rossumd57fd912000-03-10 22:53:23 +00002730 while (s < end) {
2731 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002732 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002733 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002734
2735 /* Non-escape characters are interpreted as Unicode ordinals */
2736 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002737 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002738 continue;
2739 }
2740
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002741 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002742 /* \ - Escapes */
2743 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002744 c = *s++;
2745 if (s > end)
2746 c = '\0'; /* Invalid after \ */
2747 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002748
2749 /* \x escapes */
2750 case '\n': break;
2751 case '\\': *p++ = '\\'; break;
2752 case '\'': *p++ = '\''; break;
2753 case '\"': *p++ = '\"'; break;
2754 case 'b': *p++ = '\b'; break;
2755 case 'f': *p++ = '\014'; break; /* FF */
2756 case 't': *p++ = '\t'; break;
2757 case 'n': *p++ = '\n'; break;
2758 case 'r': *p++ = '\r'; break;
2759 case 'v': *p++ = '\013'; break; /* VT */
2760 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2761
2762 /* \OOO (octal) escapes */
2763 case '0': case '1': case '2': case '3':
2764 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002765 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002766 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002767 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002768 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002769 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002770 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002771 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002772 break;
2773
Fredrik Lundhccc74732001-02-18 22:13:49 +00002774 /* hex escapes */
2775 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002776 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002777 digits = 2;
2778 message = "truncated \\xXX escape";
2779 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002780
Fredrik Lundhccc74732001-02-18 22:13:49 +00002781 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002782 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002783 digits = 4;
2784 message = "truncated \\uXXXX escape";
2785 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002786
Fredrik Lundhccc74732001-02-18 22:13:49 +00002787 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002788 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002789 digits = 8;
2790 message = "truncated \\UXXXXXXXX escape";
2791 hexescape:
2792 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002793 outpos = p-PyUnicode_AS_UNICODE(v);
2794 if (s+digits>end) {
2795 endinpos = size;
2796 if (unicode_decode_call_errorhandler(
2797 errors, &errorHandler,
2798 "unicodeescape", "end of string in escape sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002799 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002800 (PyObject **)&v, &outpos, &p))
2801 goto onError;
2802 goto nextByte;
2803 }
2804 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002805 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00002806 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002807 endinpos = (s+i+1)-starts;
2808 if (unicode_decode_call_errorhandler(
2809 errors, &errorHandler,
2810 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002811 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002812 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002813 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002814 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002815 }
2816 chr = (chr<<4) & ~0xF;
2817 if (c >= '0' && c <= '9')
2818 chr += c - '0';
2819 else if (c >= 'a' && c <= 'f')
2820 chr += 10 + c - 'a';
2821 else
2822 chr += 10 + c - 'A';
2823 }
2824 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002825 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002826 /* _decoding_error will have already written into the
2827 target buffer. */
2828 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002829 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002830 /* when we get here, chr is a 32-bit unicode character */
2831 if (chr <= 0xffff)
2832 /* UCS-2 character */
2833 *p++ = (Py_UNICODE) chr;
2834 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002835 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002836 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002837#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002838 *p++ = chr;
2839#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002840 chr -= 0x10000L;
2841 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002842 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002843#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002844 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002845 endinpos = s-starts;
2846 outpos = p-PyUnicode_AS_UNICODE(v);
2847 if (unicode_decode_call_errorhandler(
2848 errors, &errorHandler,
2849 "unicodeescape", "illegal Unicode character",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002850 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002851 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002852 goto onError;
2853 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002854 break;
2855
2856 /* \N{name} */
2857 case 'N':
2858 message = "malformed \\N character escape";
2859 if (ucnhash_CAPI == NULL) {
2860 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002861 PyObject *m, *api;
Christian Heimes072c0f12008-01-03 23:01:04 +00002862 m = PyImport_ImportModuleNoBlock("unicodedata");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002863 if (m == NULL)
2864 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002865 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002866 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002867 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002868 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002869 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002870 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002871 if (ucnhash_CAPI == NULL)
2872 goto ucnhashError;
2873 }
2874 if (*s == '{') {
2875 const char *start = s+1;
2876 /* look for the closing brace */
2877 while (*s != '}' && s < end)
2878 s++;
2879 if (s > start && s < end && *s == '}') {
2880 /* found a name. look it up in the unicode database */
2881 message = "unknown Unicode character name";
2882 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002883 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002884 goto store;
2885 }
2886 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002887 endinpos = s-starts;
2888 outpos = p-PyUnicode_AS_UNICODE(v);
2889 if (unicode_decode_call_errorhandler(
2890 errors, &errorHandler,
2891 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002892 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002893 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002894 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002895 break;
2896
2897 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002898 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002899 message = "\\ at end of string";
2900 s--;
2901 endinpos = s-starts;
2902 outpos = p-PyUnicode_AS_UNICODE(v);
2903 if (unicode_decode_call_errorhandler(
2904 errors, &errorHandler,
2905 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002906 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002907 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002908 goto onError;
2909 }
2910 else {
2911 *p++ = '\\';
2912 *p++ = (unsigned char)s[-1];
2913 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002914 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002915 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002916 nextByte:
2917 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002918 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002919 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002920 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002921 Py_XDECREF(errorHandler);
2922 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002923 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002924
Fredrik Lundhccc74732001-02-18 22:13:49 +00002925ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002926 PyErr_SetString(
2927 PyExc_UnicodeError,
2928 "\\N escapes not supported (can't load unicodedata module)"
2929 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002930 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002931 Py_XDECREF(errorHandler);
2932 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002933 return NULL;
2934
Fredrik Lundhccc74732001-02-18 22:13:49 +00002935onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002936 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002937 Py_XDECREF(errorHandler);
2938 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002939 return NULL;
2940}
2941
2942/* Return a Unicode-Escape string version of the Unicode object.
2943
2944 If quotes is true, the string is enclosed in u"" or u'' quotes as
2945 appropriate.
2946
2947*/
2948
Thomas Wouters477c8d52006-05-27 19:21:47 +00002949Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2950 Py_ssize_t size,
2951 Py_UNICODE ch)
2952{
2953 /* like wcschr, but doesn't stop at NULL characters */
2954
2955 while (size-- > 0) {
2956 if (*s == ch)
2957 return s;
2958 s++;
2959 }
2960
2961 return NULL;
2962}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002963
Walter Dörwald79e913e2007-05-12 11:08:06 +00002964static const char *hexdigits = "0123456789abcdef";
2965
2966PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2967 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002968{
Guido van Rossum98297ee2007-11-06 21:34:58 +00002969 PyObject *repr, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002970 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002971
Thomas Wouters89f507f2006-12-13 04:49:30 +00002972 /* XXX(nnorwitz): rather than over-allocating, it would be
2973 better to choose a different scheme. Perhaps scan the
2974 first N-chars of the string and allocate based on that size.
2975 */
2976 /* Initial allocation is based on the longest-possible unichr
2977 escape.
2978
2979 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2980 unichr, so in this case it's the longest unichr escape. In
2981 narrow (UTF-16) builds this is five chars per source unichr
2982 since there are two unichrs in the surrogate pair, so in narrow
2983 (UTF-16) builds it's not the longest unichr escape.
2984
2985 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2986 so in the narrow (UTF-16) build case it's the longest unichr
2987 escape.
2988 */
2989
Walter Dörwald79e913e2007-05-12 11:08:06 +00002990 repr = PyBytes_FromStringAndSize(NULL,
Thomas Wouters89f507f2006-12-13 04:49:30 +00002991#ifdef Py_UNICODE_WIDE
2992 + 10*size
2993#else
2994 + 6*size
2995#endif
2996 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002997 if (repr == NULL)
2998 return NULL;
2999
Walter Dörwald79e913e2007-05-12 11:08:06 +00003000 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003001
Guido van Rossumd57fd912000-03-10 22:53:23 +00003002 while (size-- > 0) {
3003 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003004
Walter Dörwald79e913e2007-05-12 11:08:06 +00003005 /* Escape backslashes */
3006 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003007 *p++ = '\\';
3008 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003009 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003010 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003011
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003012#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003013 /* Map 21-bit characters to '\U00xxxxxx' */
3014 else if (ch >= 0x10000) {
3015 *p++ = '\\';
3016 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003017 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3018 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3019 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3020 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3021 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3022 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3023 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3024 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003025 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003026 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003027#else
3028 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003029 else if (ch >= 0xD800 && ch < 0xDC00) {
3030 Py_UNICODE ch2;
3031 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003032
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003033 ch2 = *s++;
3034 size--;
3035 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3036 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3037 *p++ = '\\';
3038 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003039 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3040 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3041 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3042 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3043 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3044 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3045 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3046 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003047 continue;
3048 }
3049 /* Fall through: isolated surrogates are copied as-is */
3050 s--;
3051 size++;
3052 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003053#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003054
Guido van Rossumd57fd912000-03-10 22:53:23 +00003055 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003056 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003057 *p++ = '\\';
3058 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003059 *p++ = hexdigits[(ch >> 12) & 0x000F];
3060 *p++ = hexdigits[(ch >> 8) & 0x000F];
3061 *p++ = hexdigits[(ch >> 4) & 0x000F];
3062 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003063 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003064
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003065 /* Map special whitespace to '\t', \n', '\r' */
3066 else if (ch == '\t') {
3067 *p++ = '\\';
3068 *p++ = 't';
3069 }
3070 else if (ch == '\n') {
3071 *p++ = '\\';
3072 *p++ = 'n';
3073 }
3074 else if (ch == '\r') {
3075 *p++ = '\\';
3076 *p++ = 'r';
3077 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003078
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003079 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003080 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003081 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003082 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003083 *p++ = hexdigits[(ch >> 4) & 0x000F];
3084 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003085 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003086
Guido van Rossumd57fd912000-03-10 22:53:23 +00003087 /* Copy everything else as-is */
3088 else
3089 *p++ = (char) ch;
3090 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003091
Guido van Rossum98297ee2007-11-06 21:34:58 +00003092 result = PyString_FromStringAndSize(PyBytes_AS_STRING(repr),
3093 p - PyBytes_AS_STRING(repr));
3094 Py_DECREF(repr);
3095 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003096}
3097
Guido van Rossumd57fd912000-03-10 22:53:23 +00003098PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3099{
Walter Dörwald79e913e2007-05-12 11:08:06 +00003100 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003101 if (!PyUnicode_Check(unicode)) {
3102 PyErr_BadArgument();
3103 return NULL;
3104 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003105 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3106 PyUnicode_GET_SIZE(unicode));
3107
3108 if (!s)
3109 return NULL;
3110 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
3111 PyBytes_GET_SIZE(s));
3112 Py_DECREF(s);
3113 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003114}
3115
3116/* --- Raw Unicode Escape Codec ------------------------------------------- */
3117
3118PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003119 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003120 const char *errors)
3121{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003122 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003123 Py_ssize_t startinpos;
3124 Py_ssize_t endinpos;
3125 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003126 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003127 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003128 const char *end;
3129 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003130 PyObject *errorHandler = NULL;
3131 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003132
Guido van Rossumd57fd912000-03-10 22:53:23 +00003133 /* Escaped strings will always be longer than the resulting
3134 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003135 length after conversion to the true value. (But decoding error
3136 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003137 v = _PyUnicode_New(size);
3138 if (v == NULL)
3139 goto onError;
3140 if (size == 0)
3141 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003142 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003143 end = s + size;
3144 while (s < end) {
3145 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00003146 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003147 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003148 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003149
3150 /* Non-escape characters are interpreted as Unicode ordinals */
3151 if (*s != '\\') {
3152 *p++ = (unsigned char)*s++;
3153 continue;
3154 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003155 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003156
3157 /* \u-escapes are only interpreted iff the number of leading
3158 backslashes if odd */
3159 bs = s;
3160 for (;s < end;) {
3161 if (*s != '\\')
3162 break;
3163 *p++ = (unsigned char)*s++;
3164 }
3165 if (((s - bs) & 1) == 0 ||
3166 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003167 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003168 continue;
3169 }
3170 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003171 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003172 s++;
3173
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003174 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003175 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003176 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003177 c = (unsigned char)*s;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003178 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003179 endinpos = s-starts;
3180 if (unicode_decode_call_errorhandler(
3181 errors, &errorHandler,
3182 "rawunicodeescape", "truncated \\uXXXX",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003183 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003184 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003185 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003186 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003187 }
3188 x = (x<<4) & ~0xF;
3189 if (c >= '0' && c <= '9')
3190 x += c - '0';
3191 else if (c >= 'a' && c <= 'f')
3192 x += 10 + c - 'a';
3193 else
3194 x += 10 + c - 'A';
3195 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003196#ifndef Py_UNICODE_WIDE
3197 if (x > 0x10000) {
3198 if (unicode_decode_call_errorhandler(
3199 errors, &errorHandler,
3200 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003201 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003202 (PyObject **)&v, &outpos, &p))
3203 goto onError;
3204 }
3205#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003206 *p++ = x;
3207 nextByte:
3208 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003209 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003210 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003211 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003212 Py_XDECREF(errorHandler);
3213 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003214 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003215
Guido van Rossumd57fd912000-03-10 22:53:23 +00003216 onError:
3217 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003218 Py_XDECREF(errorHandler);
3219 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003220 return NULL;
3221}
3222
3223PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003224 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003225{
Guido van Rossum98297ee2007-11-06 21:34:58 +00003226 PyObject *repr, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003227 char *p;
3228 char *q;
3229
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003230#ifdef Py_UNICODE_WIDE
Walter Dörwald711005d2007-05-12 12:03:26 +00003231 repr = PyBytes_FromStringAndSize(NULL, 10 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003232#else
Walter Dörwald711005d2007-05-12 12:03:26 +00003233 repr = PyBytes_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003234#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003235 if (repr == NULL)
3236 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003237 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003238 goto done;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003239
Walter Dörwald711005d2007-05-12 12:03:26 +00003240 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003241 while (size-- > 0) {
3242 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003243#ifdef Py_UNICODE_WIDE
3244 /* Map 32-bit characters to '\Uxxxxxxxx' */
3245 if (ch >= 0x10000) {
3246 *p++ = '\\';
3247 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003248 *p++ = hexdigits[(ch >> 28) & 0xf];
3249 *p++ = hexdigits[(ch >> 24) & 0xf];
3250 *p++ = hexdigits[(ch >> 20) & 0xf];
3251 *p++ = hexdigits[(ch >> 16) & 0xf];
3252 *p++ = hexdigits[(ch >> 12) & 0xf];
3253 *p++ = hexdigits[(ch >> 8) & 0xf];
3254 *p++ = hexdigits[(ch >> 4) & 0xf];
3255 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003256 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003257 else
3258#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003259 /* Map 16-bit characters to '\uxxxx' */
3260 if (ch >= 256) {
3261 *p++ = '\\';
3262 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003263 *p++ = hexdigits[(ch >> 12) & 0xf];
3264 *p++ = hexdigits[(ch >> 8) & 0xf];
3265 *p++ = hexdigits[(ch >> 4) & 0xf];
3266 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003267 }
3268 /* Copy everything else as-is */
3269 else
3270 *p++ = (char) ch;
3271 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003272 size = p - q;
3273
3274 done:
3275 result = PyString_FromStringAndSize(PyBytes_AS_STRING(repr), size);
3276 Py_DECREF(repr);
3277 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003278}
3279
3280PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3281{
Walter Dörwald711005d2007-05-12 12:03:26 +00003282 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003283 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003284 PyErr_BadArgument();
3285 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003286 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003287 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3288 PyUnicode_GET_SIZE(unicode));
3289
3290 if (!s)
3291 return NULL;
3292 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
3293 PyBytes_GET_SIZE(s));
3294 Py_DECREF(s);
3295 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003296}
3297
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003298/* --- Unicode Internal Codec ------------------------------------------- */
3299
3300PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003301 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003302 const char *errors)
3303{
3304 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003305 Py_ssize_t startinpos;
3306 Py_ssize_t endinpos;
3307 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003308 PyUnicodeObject *v;
3309 Py_UNICODE *p;
3310 const char *end;
3311 const char *reason;
3312 PyObject *errorHandler = NULL;
3313 PyObject *exc = NULL;
3314
Neal Norwitzd43069c2006-01-08 01:12:10 +00003315#ifdef Py_UNICODE_WIDE
3316 Py_UNICODE unimax = PyUnicode_GetMax();
3317#endif
3318
Thomas Wouters89f507f2006-12-13 04:49:30 +00003319 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003320 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3321 if (v == NULL)
3322 goto onError;
3323 if (PyUnicode_GetSize((PyObject *)v) == 0)
3324 return (PyObject *)v;
3325 p = PyUnicode_AS_UNICODE(v);
3326 end = s + size;
3327
3328 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00003329 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003330 /* We have to sanity check the raw data, otherwise doom looms for
3331 some malformed UCS-4 data. */
3332 if (
3333 #ifdef Py_UNICODE_WIDE
3334 *p > unimax || *p < 0 ||
3335 #endif
3336 end-s < Py_UNICODE_SIZE
3337 )
3338 {
3339 startinpos = s - starts;
3340 if (end-s < Py_UNICODE_SIZE) {
3341 endinpos = end-starts;
3342 reason = "truncated input";
3343 }
3344 else {
3345 endinpos = s - starts + Py_UNICODE_SIZE;
3346 reason = "illegal code point (> 0x10FFFF)";
3347 }
3348 outpos = p - PyUnicode_AS_UNICODE(v);
3349 if (unicode_decode_call_errorhandler(
3350 errors, &errorHandler,
3351 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003352 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003353 (PyObject **)&v, &outpos, &p)) {
3354 goto onError;
3355 }
3356 }
3357 else {
3358 p++;
3359 s += Py_UNICODE_SIZE;
3360 }
3361 }
3362
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003363 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003364 goto onError;
3365 Py_XDECREF(errorHandler);
3366 Py_XDECREF(exc);
3367 return (PyObject *)v;
3368
3369 onError:
3370 Py_XDECREF(v);
3371 Py_XDECREF(errorHandler);
3372 Py_XDECREF(exc);
3373 return NULL;
3374}
3375
Guido van Rossumd57fd912000-03-10 22:53:23 +00003376/* --- Latin-1 Codec ------------------------------------------------------ */
3377
3378PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003379 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003380 const char *errors)
3381{
3382 PyUnicodeObject *v;
3383 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003384
Guido van Rossumd57fd912000-03-10 22:53:23 +00003385 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003386 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003387 Py_UNICODE r = *(unsigned char*)s;
3388 return PyUnicode_FromUnicode(&r, 1);
3389 }
3390
Guido van Rossumd57fd912000-03-10 22:53:23 +00003391 v = _PyUnicode_New(size);
3392 if (v == NULL)
3393 goto onError;
3394 if (size == 0)
3395 return (PyObject *)v;
3396 p = PyUnicode_AS_UNICODE(v);
3397 while (size-- > 0)
3398 *p++ = (unsigned char)*s++;
3399 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003400
Guido van Rossumd57fd912000-03-10 22:53:23 +00003401 onError:
3402 Py_XDECREF(v);
3403 return NULL;
3404}
3405
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003406/* create or adjust a UnicodeEncodeError */
3407static void make_encode_exception(PyObject **exceptionObject,
3408 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003409 const Py_UNICODE *unicode, Py_ssize_t size,
3410 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003411 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003412{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003413 if (*exceptionObject == NULL) {
3414 *exceptionObject = PyUnicodeEncodeError_Create(
3415 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003416 }
3417 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003418 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3419 goto onError;
3420 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3421 goto onError;
3422 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3423 goto onError;
3424 return;
3425 onError:
3426 Py_DECREF(*exceptionObject);
3427 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003428 }
3429}
3430
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003431/* raises a UnicodeEncodeError */
3432static void raise_encode_exception(PyObject **exceptionObject,
3433 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003434 const Py_UNICODE *unicode, Py_ssize_t size,
3435 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003436 const char *reason)
3437{
3438 make_encode_exception(exceptionObject,
3439 encoding, unicode, size, startpos, endpos, reason);
3440 if (*exceptionObject != NULL)
3441 PyCodec_StrictErrors(*exceptionObject);
3442}
3443
3444/* error handling callback helper:
3445 build arguments, call the callback and check the arguments,
3446 put the result into newpos and return the replacement string, which
3447 has to be freed by the caller */
3448static PyObject *unicode_encode_call_errorhandler(const char *errors,
3449 PyObject **errorHandler,
3450 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003451 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3452 Py_ssize_t startpos, Py_ssize_t endpos,
3453 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003454{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003455 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003456
3457 PyObject *restuple;
3458 PyObject *resunicode;
3459
3460 if (*errorHandler == NULL) {
3461 *errorHandler = PyCodec_LookupError(errors);
3462 if (*errorHandler == NULL)
3463 return NULL;
3464 }
3465
3466 make_encode_exception(exceptionObject,
3467 encoding, unicode, size, startpos, endpos, reason);
3468 if (*exceptionObject == NULL)
3469 return NULL;
3470
3471 restuple = PyObject_CallFunctionObjArgs(
3472 *errorHandler, *exceptionObject, NULL);
3473 if (restuple == NULL)
3474 return NULL;
3475 if (!PyTuple_Check(restuple)) {
3476 PyErr_Format(PyExc_TypeError, &argparse[4]);
3477 Py_DECREF(restuple);
3478 return NULL;
3479 }
3480 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3481 &resunicode, newpos)) {
3482 Py_DECREF(restuple);
3483 return NULL;
3484 }
3485 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003486 *newpos = size+*newpos;
3487 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003488 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003489 Py_DECREF(restuple);
3490 return NULL;
3491 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003492 Py_INCREF(resunicode);
3493 Py_DECREF(restuple);
3494 return resunicode;
3495}
3496
3497static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003498 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003499 const char *errors,
3500 int limit)
3501{
3502 /* output object */
3503 PyObject *res;
3504 /* pointers to the beginning and end+1 of input */
3505 const Py_UNICODE *startp = p;
3506 const Py_UNICODE *endp = p + size;
3507 /* pointer to the beginning of the unencodable characters */
3508 /* const Py_UNICODE *badp = NULL; */
3509 /* pointer into the output */
3510 char *str;
3511 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003512 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003513 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3514 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003515 PyObject *errorHandler = NULL;
3516 PyObject *exc = NULL;
Guido van Rossum98297ee2007-11-06 21:34:58 +00003517 PyObject *result = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003518 /* the following variable is used for caching string comparisons
3519 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3520 int known_errorHandler = -1;
3521
3522 /* allocate enough for a simple encoding without
3523 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00003524 if (size == 0)
3525 return PyString_FromStringAndSize(NULL, 0);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003526 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003527 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003528 return NULL;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003529 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003530 ressize = size;
3531
3532 while (p<endp) {
3533 Py_UNICODE c = *p;
3534
3535 /* can we encode this? */
3536 if (c<limit) {
3537 /* no overflow check, because we know that the space is enough */
3538 *str++ = (char)c;
3539 ++p;
3540 }
3541 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003542 Py_ssize_t unicodepos = p-startp;
3543 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003544 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003545 Py_ssize_t repsize;
3546 Py_ssize_t newpos;
3547 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003548 Py_UNICODE *uni2;
3549 /* startpos for collecting unencodable chars */
3550 const Py_UNICODE *collstart = p;
3551 const Py_UNICODE *collend = p;
3552 /* find all unecodable characters */
3553 while ((collend < endp) && ((*collend)>=limit))
3554 ++collend;
3555 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3556 if (known_errorHandler==-1) {
3557 if ((errors==NULL) || (!strcmp(errors, "strict")))
3558 known_errorHandler = 1;
3559 else if (!strcmp(errors, "replace"))
3560 known_errorHandler = 2;
3561 else if (!strcmp(errors, "ignore"))
3562 known_errorHandler = 3;
3563 else if (!strcmp(errors, "xmlcharrefreplace"))
3564 known_errorHandler = 4;
3565 else
3566 known_errorHandler = 0;
3567 }
3568 switch (known_errorHandler) {
3569 case 1: /* strict */
3570 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3571 goto onError;
3572 case 2: /* replace */
3573 while (collstart++<collend)
3574 *str++ = '?'; /* fall through */
3575 case 3: /* ignore */
3576 p = collend;
3577 break;
3578 case 4: /* xmlcharrefreplace */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003579 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003580 /* determine replacement size (temporarily (mis)uses p) */
3581 for (p = collstart, repsize = 0; p < collend; ++p) {
3582 if (*p<10)
3583 repsize += 2+1+1;
3584 else if (*p<100)
3585 repsize += 2+2+1;
3586 else if (*p<1000)
3587 repsize += 2+3+1;
3588 else if (*p<10000)
3589 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003590#ifndef Py_UNICODE_WIDE
3591 else
3592 repsize += 2+5+1;
3593#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003594 else if (*p<100000)
3595 repsize += 2+5+1;
3596 else if (*p<1000000)
3597 repsize += 2+6+1;
3598 else
3599 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003600#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003601 }
3602 requiredsize = respos+repsize+(endp-collend);
3603 if (requiredsize > ressize) {
3604 if (requiredsize<2*ressize)
3605 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003606 if (PyBytes_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003607 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003608 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003609 ressize = requiredsize;
3610 }
3611 /* generate replacement (temporarily (mis)uses p) */
3612 for (p = collstart; p < collend; ++p) {
3613 str += sprintf(str, "&#%d;", (int)*p);
3614 }
3615 p = collend;
3616 break;
3617 default:
3618 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3619 encoding, reason, startp, size, &exc,
3620 collstart-startp, collend-startp, &newpos);
3621 if (repunicode == NULL)
3622 goto onError;
3623 /* need more space? (at least enough for what we
3624 have+the replacement+the rest of the string, so
3625 we won't have to check space for encodable characters) */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003626 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003627 repsize = PyUnicode_GET_SIZE(repunicode);
3628 requiredsize = respos+repsize+(endp-collend);
3629 if (requiredsize > ressize) {
3630 if (requiredsize<2*ressize)
3631 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003632 if (PyBytes_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003633 Py_DECREF(repunicode);
3634 goto onError;
3635 }
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003636 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003637 ressize = requiredsize;
3638 }
3639 /* check if there is anything unencodable in the replacement
3640 and copy it to the output */
3641 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3642 c = *uni2;
3643 if (c >= limit) {
3644 raise_encode_exception(&exc, encoding, startp, size,
3645 unicodepos, unicodepos+1, reason);
3646 Py_DECREF(repunicode);
3647 goto onError;
3648 }
3649 *str = (char)c;
3650 }
3651 p = startp + newpos;
3652 Py_DECREF(repunicode);
3653 }
3654 }
3655 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003656 result = PyString_FromStringAndSize(PyBytes_AS_STRING(res),
3657 str - PyBytes_AS_STRING(res));
3658 onError:
3659 Py_DECREF(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003660 Py_XDECREF(errorHandler);
3661 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003662 return result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003663}
3664
Guido van Rossumd57fd912000-03-10 22:53:23 +00003665PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003666 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003667 const char *errors)
3668{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003669 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003670}
3671
3672PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3673{
3674 if (!PyUnicode_Check(unicode)) {
3675 PyErr_BadArgument();
3676 return NULL;
3677 }
3678 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3679 PyUnicode_GET_SIZE(unicode),
3680 NULL);
3681}
3682
3683/* --- 7-bit ASCII Codec -------------------------------------------------- */
3684
Guido van Rossumd57fd912000-03-10 22:53:23 +00003685PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003686 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003687 const char *errors)
3688{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003689 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003690 PyUnicodeObject *v;
3691 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003692 Py_ssize_t startinpos;
3693 Py_ssize_t endinpos;
3694 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003695 const char *e;
3696 PyObject *errorHandler = NULL;
3697 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003698
Guido van Rossumd57fd912000-03-10 22:53:23 +00003699 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003700 if (size == 1 && *(unsigned char*)s < 128) {
3701 Py_UNICODE r = *(unsigned char*)s;
3702 return PyUnicode_FromUnicode(&r, 1);
3703 }
Tim Petersced69f82003-09-16 20:30:58 +00003704
Guido van Rossumd57fd912000-03-10 22:53:23 +00003705 v = _PyUnicode_New(size);
3706 if (v == NULL)
3707 goto onError;
3708 if (size == 0)
3709 return (PyObject *)v;
3710 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003711 e = s + size;
3712 while (s < e) {
3713 register unsigned char c = (unsigned char)*s;
3714 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003715 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003716 ++s;
3717 }
3718 else {
3719 startinpos = s-starts;
3720 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003721 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003722 if (unicode_decode_call_errorhandler(
3723 errors, &errorHandler,
3724 "ascii", "ordinal not in range(128)",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003725 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003726 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003727 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003728 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003729 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00003730 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003731 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003732 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003733 Py_XDECREF(errorHandler);
3734 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003735 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003736
Guido van Rossumd57fd912000-03-10 22:53:23 +00003737 onError:
3738 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003739 Py_XDECREF(errorHandler);
3740 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003741 return NULL;
3742}
3743
Guido van Rossumd57fd912000-03-10 22:53:23 +00003744PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003745 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003746 const char *errors)
3747{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003748 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003749}
3750
3751PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3752{
3753 if (!PyUnicode_Check(unicode)) {
3754 PyErr_BadArgument();
3755 return NULL;
3756 }
3757 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3758 PyUnicode_GET_SIZE(unicode),
3759 NULL);
3760}
3761
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003762#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003763
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003764/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003765
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003766#if SIZEOF_INT < SIZEOF_SSIZE_T
3767#define NEED_RETRY
3768#endif
3769
3770/* XXX This code is limited to "true" double-byte encodings, as
3771 a) it assumes an incomplete character consists of a single byte, and
3772 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3773 encodings, see IsDBCSLeadByteEx documentation. */
3774
/* Return non-zero if the byte at s[offset] should be treated as a DBCS
   lead byte, 0 otherwise.

   A byte that IsDBCSLeadByte() accepts is only counted when the character
   found by CharPrev() immediately before it does not swallow it: either
   there is no previous character, the previous character does not start
   with a lead byte, or the previous character is exactly two bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *here = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*here))
        return 0;

    before = CharPrev(s, here);
    if (before == here)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return here - before == 2;
}
3785
3786/*
3787 * Decode MBCS string into unicode object. If 'final' is set, converts
3788 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3789 */
3790static int decode_mbcs(PyUnicodeObject **v,
3791 const char *s, /* MBCS string */
3792 int size, /* sizeof MBCS string */
3793 int final)
3794{
3795 Py_UNICODE *p;
3796 Py_ssize_t n = 0;
3797 int usize = 0;
3798
3799 assert(size >= 0);
3800
3801 /* Skip trailing lead-byte unless 'final' is set */
3802 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3803 --size;
3804
3805 /* First get the size of the result */
3806 if (size > 0) {
3807 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3808 if (usize == 0) {
3809 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3810 return -1;
3811 }
3812 }
3813
3814 if (*v == NULL) {
3815 /* Create unicode object */
3816 *v = _PyUnicode_New(usize);
3817 if (*v == NULL)
3818 return -1;
3819 }
3820 else {
3821 /* Extend unicode object */
3822 n = PyUnicode_GET_SIZE(*v);
3823 if (_PyUnicode_Resize(v, n + usize) < 0)
3824 return -1;
3825 }
3826
3827 /* Do the conversion */
3828 if (size > 0) {
3829 p = PyUnicode_AS_UNICODE(*v) + n;
3830 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3831 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3832 return -1;
3833 }
3834 }
3835
3836 return size;
3837}
3838
3839PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3840 Py_ssize_t size,
3841 const char *errors,
3842 Py_ssize_t *consumed)
3843{
3844 PyUnicodeObject *v = NULL;
3845 int done;
3846
3847 if (consumed)
3848 *consumed = 0;
3849
3850#ifdef NEED_RETRY
3851 retry:
3852 if (size > INT_MAX)
3853 done = decode_mbcs(&v, s, INT_MAX, 0);
3854 else
3855#endif
3856 done = decode_mbcs(&v, s, (int)size, !consumed);
3857
3858 if (done < 0) {
3859 Py_XDECREF(v);
3860 return NULL;
3861 }
3862
3863 if (consumed)
3864 *consumed += done;
3865
3866#ifdef NEED_RETRY
3867 if (size > INT_MAX) {
3868 s += done;
3869 size -= done;
3870 goto retry;
3871 }
3872#endif
3873
3874 return (PyObject *)v;
3875}
3876
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003877PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003878 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003879 const char *errors)
3880{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003881 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3882}
3883
3884/*
3885 * Convert unicode into string object (MBCS).
3886 * Returns 0 if succeed, -1 otherwise.
3887 */
3888static int encode_mbcs(PyObject **repr,
3889 const Py_UNICODE *p, /* unicode */
3890 int size) /* size of unicode */
3891{
3892 int mbcssize = 0;
3893 Py_ssize_t n = 0;
3894
3895 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003896
3897 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003898 if (size > 0) {
3899 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3900 if (mbcssize == 0) {
3901 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3902 return -1;
3903 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003904 }
3905
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003906 if (*repr == NULL) {
3907 /* Create string object */
Guido van Rossum98297ee2007-11-06 21:34:58 +00003908 *repr = PyString_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003909 if (*repr == NULL)
3910 return -1;
3911 }
3912 else {
3913 /* Extend string object */
Guido van Rossum98297ee2007-11-06 21:34:58 +00003914 n = PyString_Size(*repr);
3915 if (_PyString_Resize(repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003916 return -1;
3917 }
3918
3919 /* Do the conversion */
3920 if (size > 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00003921 char *s = PyString_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003922 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3923 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3924 return -1;
3925 }
3926 }
3927
3928 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003929}
3930
3931PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003932 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003933 const char *errors)
3934{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003935 PyObject *repr = NULL;
3936 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003937
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003938#ifdef NEED_RETRY
3939 retry:
3940 if (size > INT_MAX)
3941 ret = encode_mbcs(&repr, p, INT_MAX);
3942 else
3943#endif
3944 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003945
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003946 if (ret < 0) {
3947 Py_XDECREF(repr);
3948 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003949 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003950
3951#ifdef NEED_RETRY
3952 if (size > INT_MAX) {
3953 p += INT_MAX;
3954 size -= INT_MAX;
3955 goto retry;
3956 }
3957#endif
3958
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003959 return repr;
3960}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003961
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003962PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3963{
3964 if (!PyUnicode_Check(unicode)) {
3965 PyErr_BadArgument();
3966 return NULL;
3967 }
3968 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3969 PyUnicode_GET_SIZE(unicode),
3970 NULL);
3971}
3972
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003973#undef NEED_RETRY
3974
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003975#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003976
Guido van Rossumd57fd912000-03-10 22:53:23 +00003977/* --- Character Mapping Codec -------------------------------------------- */
3978
Guido van Rossumd57fd912000-03-10 22:53:23 +00003979PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003980 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003981 PyObject *mapping,
3982 const char *errors)
3983{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003984 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003985 Py_ssize_t startinpos;
3986 Py_ssize_t endinpos;
3987 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003988 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003989 PyUnicodeObject *v;
3990 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003991 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003992 PyObject *errorHandler = NULL;
3993 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003994 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003995 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003996
Guido van Rossumd57fd912000-03-10 22:53:23 +00003997 /* Default to Latin-1 */
3998 if (mapping == NULL)
3999 return PyUnicode_DecodeLatin1(s, size, errors);
4000
4001 v = _PyUnicode_New(size);
4002 if (v == NULL)
4003 goto onError;
4004 if (size == 0)
4005 return (PyObject *)v;
4006 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004007 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004008 if (PyUnicode_CheckExact(mapping)) {
4009 mapstring = PyUnicode_AS_UNICODE(mapping);
4010 maplen = PyUnicode_GET_SIZE(mapping);
4011 while (s < e) {
4012 unsigned char ch = *s;
4013 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004014
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004015 if (ch < maplen)
4016 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004017
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004018 if (x == 0xfffe) {
4019 /* undefined mapping */
4020 outpos = p-PyUnicode_AS_UNICODE(v);
4021 startinpos = s-starts;
4022 endinpos = startinpos+1;
4023 if (unicode_decode_call_errorhandler(
4024 errors, &errorHandler,
4025 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00004026 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004027 (PyObject **)&v, &outpos, &p)) {
4028 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00004029 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004030 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00004031 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004032 *p++ = x;
4033 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004034 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004035 }
4036 else {
4037 while (s < e) {
4038 unsigned char ch = *s;
4039 PyObject *w, *x;
4040
4041 /* Get mapping (char ordinal -> integer, Unicode char or None) */
Christian Heimes217cfd12007-12-02 14:31:20 +00004042 w = PyLong_FromLong((long)ch);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004043 if (w == NULL)
4044 goto onError;
4045 x = PyObject_GetItem(mapping, w);
4046 Py_DECREF(w);
4047 if (x == NULL) {
4048 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4049 /* No mapping found means: mapping is undefined. */
4050 PyErr_Clear();
4051 x = Py_None;
4052 Py_INCREF(x);
4053 } else
4054 goto onError;
4055 }
4056
4057 /* Apply mapping */
Christian Heimes217cfd12007-12-02 14:31:20 +00004058 if (PyLong_Check(x)) {
4059 long value = PyLong_AS_LONG(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004060 if (value < 0 || value > 65535) {
4061 PyErr_SetString(PyExc_TypeError,
4062 "character mapping must be in range(65536)");
4063 Py_DECREF(x);
4064 goto onError;
4065 }
4066 *p++ = (Py_UNICODE)value;
4067 }
4068 else if (x == Py_None) {
4069 /* undefined mapping */
4070 outpos = p-PyUnicode_AS_UNICODE(v);
4071 startinpos = s-starts;
4072 endinpos = startinpos+1;
4073 if (unicode_decode_call_errorhandler(
4074 errors, &errorHandler,
4075 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00004076 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004077 (PyObject **)&v, &outpos, &p)) {
4078 Py_DECREF(x);
4079 goto onError;
4080 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00004081 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004082 continue;
4083 }
4084 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004085 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004086
4087 if (targetsize == 1)
4088 /* 1-1 mapping */
4089 *p++ = *PyUnicode_AS_UNICODE(x);
4090
4091 else if (targetsize > 1) {
4092 /* 1-n mapping */
4093 if (targetsize > extrachars) {
4094 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004095 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4096 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004097 (targetsize << 2);
4098 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00004099 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004100 if (_PyUnicode_Resize(&v,
4101 PyUnicode_GET_SIZE(v) + needed) < 0) {
4102 Py_DECREF(x);
4103 goto onError;
4104 }
4105 p = PyUnicode_AS_UNICODE(v) + oldpos;
4106 }
4107 Py_UNICODE_COPY(p,
4108 PyUnicode_AS_UNICODE(x),
4109 targetsize);
4110 p += targetsize;
4111 extrachars -= targetsize;
4112 }
4113 /* 1-0 mapping: skip the character */
4114 }
4115 else {
4116 /* wrong return value */
4117 PyErr_SetString(PyExc_TypeError,
4118 "character mapping must return integer, None or unicode");
4119 Py_DECREF(x);
4120 goto onError;
4121 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004122 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004123 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004124 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004125 }
4126 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004127 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004128 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004129 Py_XDECREF(errorHandler);
4130 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004131 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004132
Guido van Rossumd57fd912000-03-10 22:53:23 +00004133 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004134 Py_XDECREF(errorHandler);
4135 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004136 Py_XDECREF(v);
4137 return NULL;
4138}
4139
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004140/* Charmap encoding: the lookup table */
4141
4142struct encoding_map{
4143 PyObject_HEAD
4144 unsigned char level1[32];
4145 int count2, count3;
4146 unsigned char level23[1];
4147};
4148
4149static PyObject*
4150encoding_map_size(PyObject *obj, PyObject* args)
4151{
4152 struct encoding_map *map = (struct encoding_map*)obj;
Christian Heimes217cfd12007-12-02 14:31:20 +00004153 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004154 128*map->count3);
4155}
4156
4157static PyMethodDef encoding_map_methods[] = {
4158 {"size", encoding_map_size, METH_NOARGS,
4159 PyDoc_STR("Return the size (in bytes) of this object") },
4160 { 0 }
4161};
4162
4163static void
4164encoding_map_dealloc(PyObject* o)
4165{
4166 PyObject_FREE(o);
4167}
4168
4169static PyTypeObject EncodingMapType = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004170 PyVarObject_HEAD_INIT(NULL, 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004171 "EncodingMap", /*tp_name*/
4172 sizeof(struct encoding_map), /*tp_basicsize*/
4173 0, /*tp_itemsize*/
4174 /* methods */
4175 encoding_map_dealloc, /*tp_dealloc*/
4176 0, /*tp_print*/
4177 0, /*tp_getattr*/
4178 0, /*tp_setattr*/
4179 0, /*tp_compare*/
4180 0, /*tp_repr*/
4181 0, /*tp_as_number*/
4182 0, /*tp_as_sequence*/
4183 0, /*tp_as_mapping*/
4184 0, /*tp_hash*/
4185 0, /*tp_call*/
4186 0, /*tp_str*/
4187 0, /*tp_getattro*/
4188 0, /*tp_setattro*/
4189 0, /*tp_as_buffer*/
4190 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4191 0, /*tp_doc*/
4192 0, /*tp_traverse*/
4193 0, /*tp_clear*/
4194 0, /*tp_richcompare*/
4195 0, /*tp_weaklistoffset*/
4196 0, /*tp_iter*/
4197 0, /*tp_iternext*/
4198 encoding_map_methods, /*tp_methods*/
4199 0, /*tp_members*/
4200 0, /*tp_getset*/
4201 0, /*tp_base*/
4202 0, /*tp_dict*/
4203 0, /*tp_descr_get*/
4204 0, /*tp_descr_set*/
4205 0, /*tp_dictoffset*/
4206 0, /*tp_init*/
4207 0, /*tp_alloc*/
4208 0, /*tp_new*/
4209 0, /*tp_free*/
4210 0, /*tp_is_gc*/
4211};
4212
4213PyObject*
4214PyUnicode_BuildEncodingMap(PyObject* string)
4215{
4216 Py_UNICODE *decode;
4217 PyObject *result;
4218 struct encoding_map *mresult;
4219 int i;
4220 int need_dict = 0;
4221 unsigned char level1[32];
4222 unsigned char level2[512];
4223 unsigned char *mlevel1, *mlevel2, *mlevel3;
4224 int count2 = 0, count3 = 0;
4225
4226 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4227 PyErr_BadArgument();
4228 return NULL;
4229 }
4230 decode = PyUnicode_AS_UNICODE(string);
4231 memset(level1, 0xFF, sizeof level1);
4232 memset(level2, 0xFF, sizeof level2);
4233
4234 /* If there isn't a one-to-one mapping of NULL to \0,
4235 or if there are non-BMP characters, we need to use
4236 a mapping dictionary. */
4237 if (decode[0] != 0)
4238 need_dict = 1;
4239 for (i = 1; i < 256; i++) {
4240 int l1, l2;
4241 if (decode[i] == 0
4242 #ifdef Py_UNICODE_WIDE
4243 || decode[i] > 0xFFFF
4244 #endif
4245 ) {
4246 need_dict = 1;
4247 break;
4248 }
4249 if (decode[i] == 0xFFFE)
4250 /* unmapped character */
4251 continue;
4252 l1 = decode[i] >> 11;
4253 l2 = decode[i] >> 7;
4254 if (level1[l1] == 0xFF)
4255 level1[l1] = count2++;
4256 if (level2[l2] == 0xFF)
4257 level2[l2] = count3++;
4258 }
4259
4260 if (count2 >= 0xFF || count3 >= 0xFF)
4261 need_dict = 1;
4262
4263 if (need_dict) {
4264 PyObject *result = PyDict_New();
4265 PyObject *key, *value;
4266 if (!result)
4267 return NULL;
4268 for (i = 0; i < 256; i++) {
4269 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00004270 key = PyLong_FromLong(decode[i]);
4271 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004272 if (!key || !value)
4273 goto failed1;
4274 if (PyDict_SetItem(result, key, value) == -1)
4275 goto failed1;
4276 Py_DECREF(key);
4277 Py_DECREF(value);
4278 }
4279 return result;
4280 failed1:
4281 Py_XDECREF(key);
4282 Py_XDECREF(value);
4283 Py_DECREF(result);
4284 return NULL;
4285 }
4286
4287 /* Create a three-level trie */
4288 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4289 16*count2 + 128*count3 - 1);
4290 if (!result)
4291 return PyErr_NoMemory();
4292 PyObject_Init(result, &EncodingMapType);
4293 mresult = (struct encoding_map*)result;
4294 mresult->count2 = count2;
4295 mresult->count3 = count3;
4296 mlevel1 = mresult->level1;
4297 mlevel2 = mresult->level23;
4298 mlevel3 = mresult->level23 + 16*count2;
4299 memcpy(mlevel1, level1, 32);
4300 memset(mlevel2, 0xFF, 16*count2);
4301 memset(mlevel3, 0, 128*count3);
4302 count3 = 0;
4303 for (i = 1; i < 256; i++) {
4304 int o1, o2, o3, i2, i3;
4305 if (decode[i] == 0xFFFE)
4306 /* unmapped character */
4307 continue;
4308 o1 = decode[i]>>11;
4309 o2 = (decode[i]>>7) & 0xF;
4310 i2 = 16*mlevel1[o1] + o2;
4311 if (mlevel2[i2] == 0xFF)
4312 mlevel2[i2] = count3++;
4313 o3 = decode[i] & 0x7F;
4314 i3 = 128*mlevel2[i2] + o3;
4315 mlevel3[i3] = i;
4316 }
4317 return result;
4318}
4319
4320static int
4321encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4322{
4323 struct encoding_map *map = (struct encoding_map*)mapping;
4324 int l1 = c>>11;
4325 int l2 = (c>>7) & 0xF;
4326 int l3 = c & 0x7F;
4327 int i;
4328
4329#ifdef Py_UNICODE_WIDE
4330 if (c > 0xFFFF) {
4331 return -1;
4332 }
4333#endif
4334 if (c == 0)
4335 return 0;
4336 /* level 1*/
4337 i = map->level1[l1];
4338 if (i == 0xFF) {
4339 return -1;
4340 }
4341 /* level 2*/
4342 i = map->level23[16*i+l2];
4343 if (i == 0xFF) {
4344 return -1;
4345 }
4346 /* level 3 */
4347 i = map->level23[16*map->count2 + 128*i + l3];
4348 if (i == 0) {
4349 return -1;
4350 }
4351 return i;
4352}
4353
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004354/* Lookup the character ch in the mapping. If the character
4355 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004356 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004357static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004358{
Christian Heimes217cfd12007-12-02 14:31:20 +00004359 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004360 PyObject *x;
4361
4362 if (w == NULL)
4363 return NULL;
4364 x = PyObject_GetItem(mapping, w);
4365 Py_DECREF(w);
4366 if (x == NULL) {
4367 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4368 /* No mapping found means: mapping is undefined. */
4369 PyErr_Clear();
4370 x = Py_None;
4371 Py_INCREF(x);
4372 return x;
4373 } else
4374 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004375 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004376 else if (x == Py_None)
4377 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00004378 else if (PyLong_Check(x)) {
4379 long value = PyLong_AS_LONG(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004380 if (value < 0 || value > 255) {
4381 PyErr_SetString(PyExc_TypeError,
4382 "character mapping must be in range(256)");
4383 Py_DECREF(x);
4384 return NULL;
4385 }
4386 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004387 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004388 else if (PyString_Check(x))
4389 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004390 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004391 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00004392 PyErr_Format(PyExc_TypeError,
Christian Heimesf3863112007-11-22 07:46:41 +00004393 "character mapping must return integer, bytes or None, not %.400s",
Walter Dörwald580ceed2007-05-09 10:39:19 +00004394 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004395 Py_DECREF(x);
4396 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004397 }
4398}
4399
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004400static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00004401charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004402{
Guido van Rossum98297ee2007-11-06 21:34:58 +00004403 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004404 /* exponentially overallocate to minimize reallocations */
4405 if (requiredsize < 2*outsize)
4406 requiredsize = 2*outsize;
Guido van Rossum98297ee2007-11-06 21:34:58 +00004407 if (_PyString_Resize(outobj, requiredsize))
Walter Dörwald827b0552007-05-12 13:23:53 +00004408 return -1;
Walter Dörwald827b0552007-05-12 13:23:53 +00004409 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004410}
4411
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an error occurred and an exception is set */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004415/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00004416 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004417 space is available. Return a new reference to the object that
4418 was put in the output buffer, or Py_None, if the mapping was undefined
4419 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004420 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004421static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004422charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004423 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004424{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004425 PyObject *rep;
4426 char *outstart;
Guido van Rossum98297ee2007-11-06 21:34:58 +00004427 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004428
Christian Heimes90aa7642007-12-19 02:45:37 +00004429 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004430 int res = encoding_map_lookup(c, mapping);
4431 Py_ssize_t requiredsize = *outpos+1;
4432 if (res == -1)
4433 return enc_FAILED;
Guido van Rossum98297ee2007-11-06 21:34:58 +00004434 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004435 if (charmapencode_resize(outobj, outpos, requiredsize))
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004436 return enc_EXCEPTION;
Guido van Rossum98297ee2007-11-06 21:34:58 +00004437 outstart = PyString_AS_STRING(*outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004438 outstart[(*outpos)++] = (char)res;
4439 return enc_SUCCESS;
4440 }
4441
4442 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004443 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004444 return enc_EXCEPTION;
4445 else if (rep==Py_None) {
4446 Py_DECREF(rep);
4447 return enc_FAILED;
4448 } else {
Christian Heimes217cfd12007-12-02 14:31:20 +00004449 if (PyLong_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004450 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004451 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004452 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004453 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004454 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004455 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004456 outstart = PyString_AS_STRING(*outobj);
Christian Heimes217cfd12007-12-02 14:31:20 +00004457 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004458 }
4459 else {
4460 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004461 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4462 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004463 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004464 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004465 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004466 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004467 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004468 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004469 memcpy(outstart + *outpos, repchars, repsize);
4470 *outpos += repsize;
4471 }
4472 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004473 Py_DECREF(rep);
4474 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004475}
4476
4477/* handle an error in PyUnicode_EncodeCharmap
4478 Return 0 on success, -1 on error */
4479static
4480int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004481 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004482 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004483 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004484 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004485{
4486 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004487 Py_ssize_t repsize;
4488 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004489 Py_UNICODE *uni2;
4490 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004491 Py_ssize_t collstartpos = *inpos;
4492 Py_ssize_t collendpos = *inpos+1;
4493 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004494 char *encoding = "charmap";
4495 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004496 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004497
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004498 /* find all unencodable characters */
4499 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004500 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00004501 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004502 int res = encoding_map_lookup(p[collendpos], mapping);
4503 if (res != -1)
4504 break;
4505 ++collendpos;
4506 continue;
4507 }
4508
4509 rep = charmapencode_lookup(p[collendpos], mapping);
4510 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004511 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004512 else if (rep!=Py_None) {
4513 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004514 break;
4515 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004516 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004517 ++collendpos;
4518 }
4519 /* cache callback name lookup
4520 * (if not done yet, i.e. it's the first error) */
4521 if (*known_errorHandler==-1) {
4522 if ((errors==NULL) || (!strcmp(errors, "strict")))
4523 *known_errorHandler = 1;
4524 else if (!strcmp(errors, "replace"))
4525 *known_errorHandler = 2;
4526 else if (!strcmp(errors, "ignore"))
4527 *known_errorHandler = 3;
4528 else if (!strcmp(errors, "xmlcharrefreplace"))
4529 *known_errorHandler = 4;
4530 else
4531 *known_errorHandler = 0;
4532 }
4533 switch (*known_errorHandler) {
4534 case 1: /* strict */
4535 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4536 return -1;
4537 case 2: /* replace */
4538 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4539 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004540 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004541 return -1;
4542 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004543 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004544 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4545 return -1;
4546 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004547 }
4548 /* fall through */
4549 case 3: /* ignore */
4550 *inpos = collendpos;
4551 break;
4552 case 4: /* xmlcharrefreplace */
4553 /* generate replacement (temporarily (mis)uses p) */
4554 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4555 char buffer[2+29+1+1];
4556 char *cp;
4557 sprintf(buffer, "&#%d;", (int)p[collpos]);
4558 for (cp = buffer; *cp; ++cp) {
4559 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004560 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004561 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004562 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004563 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4564 return -1;
4565 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004566 }
4567 }
4568 *inpos = collendpos;
4569 break;
4570 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004571 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004572 encoding, reason, p, size, exceptionObject,
4573 collstartpos, collendpos, &newpos);
4574 if (repunicode == NULL)
4575 return -1;
4576 /* generate replacement */
4577 repsize = PyUnicode_GET_SIZE(repunicode);
4578 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4579 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004580 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004581 return -1;
4582 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004583 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004584 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004585 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4586 return -1;
4587 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004588 }
4589 *inpos = newpos;
4590 Py_DECREF(repunicode);
4591 }
4592 return 0;
4593}
4594
Guido van Rossumd57fd912000-03-10 22:53:23 +00004595PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004596 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004597 PyObject *mapping,
4598 const char *errors)
4599{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004600 /* output object */
4601 PyObject *res = NULL;
4602 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004603 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004604 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004605 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004606 PyObject *errorHandler = NULL;
4607 PyObject *exc = NULL;
4608 /* the following variable is used for caching string comparisons
4609 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4610 * 3=ignore, 4=xmlcharrefreplace */
4611 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004612
4613 /* Default to Latin-1 */
4614 if (mapping == NULL)
4615 return PyUnicode_EncodeLatin1(p, size, errors);
4616
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004617 /* allocate enough for a simple encoding without
4618 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004619 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004620 if (res == NULL)
4621 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004622 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004623 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004624
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004625 while (inpos<size) {
4626 /* try to encode it */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004627 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004628 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004629 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004630 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004631 if (charmap_encoding_error(p, size, &inpos, mapping,
4632 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004633 &known_errorHandler, &errorHandler, errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004634 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004635 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004636 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004637 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004638 else
4639 /* done with this character => adjust input position */
4640 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004641 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004642
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004643 /* Resize if we allocated to much */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004644 if (respos<PyString_GET_SIZE(res))
4645 _PyString_Resize(&res, respos);
4646
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004647 Py_XDECREF(exc);
4648 Py_XDECREF(errorHandler);
4649 return res;
4650
4651 onError:
4652 Py_XDECREF(res);
4653 Py_XDECREF(exc);
4654 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004655 return NULL;
4656}
4657
4658PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4659 PyObject *mapping)
4660{
4661 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4662 PyErr_BadArgument();
4663 return NULL;
4664 }
4665 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4666 PyUnicode_GET_SIZE(unicode),
4667 mapping,
4668 NULL);
4669}
4670
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004671/* create or adjust a UnicodeTranslateError */
4672static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004673 const Py_UNICODE *unicode, Py_ssize_t size,
4674 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004675 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004676{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004677 if (*exceptionObject == NULL) {
4678 *exceptionObject = PyUnicodeTranslateError_Create(
4679 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004680 }
4681 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004682 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4683 goto onError;
4684 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4685 goto onError;
4686 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4687 goto onError;
4688 return;
4689 onError:
4690 Py_DECREF(*exceptionObject);
4691 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004692 }
4693}
4694
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004695/* raises a UnicodeTranslateError */
4696static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004697 const Py_UNICODE *unicode, Py_ssize_t size,
4698 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004699 const char *reason)
4700{
4701 make_translate_exception(exceptionObject,
4702 unicode, size, startpos, endpos, reason);
4703 if (*exceptionObject != NULL)
4704 PyCodec_StrictErrors(*exceptionObject);
4705}
4706
4707/* error handling callback helper:
4708 build arguments, call the callback and check the arguments,
4709 put the result into newpos and return the replacement string, which
4710 has to be freed by the caller */
4711static PyObject *unicode_translate_call_errorhandler(const char *errors,
4712 PyObject **errorHandler,
4713 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004714 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4715 Py_ssize_t startpos, Py_ssize_t endpos,
4716 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004717{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004718 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004719
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004720 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004721 PyObject *restuple;
4722 PyObject *resunicode;
4723
4724 if (*errorHandler == NULL) {
4725 *errorHandler = PyCodec_LookupError(errors);
4726 if (*errorHandler == NULL)
4727 return NULL;
4728 }
4729
4730 make_translate_exception(exceptionObject,
4731 unicode, size, startpos, endpos, reason);
4732 if (*exceptionObject == NULL)
4733 return NULL;
4734
4735 restuple = PyObject_CallFunctionObjArgs(
4736 *errorHandler, *exceptionObject, NULL);
4737 if (restuple == NULL)
4738 return NULL;
4739 if (!PyTuple_Check(restuple)) {
4740 PyErr_Format(PyExc_TypeError, &argparse[4]);
4741 Py_DECREF(restuple);
4742 return NULL;
4743 }
4744 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004745 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004746 Py_DECREF(restuple);
4747 return NULL;
4748 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004749 if (i_newpos<0)
4750 *newpos = size+i_newpos;
4751 else
4752 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004753 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004754 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004755 Py_DECREF(restuple);
4756 return NULL;
4757 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004758 Py_INCREF(resunicode);
4759 Py_DECREF(restuple);
4760 return resunicode;
4761}
4762
4763/* Lookup the character ch in the mapping and put the result in result,
4764 which must be decrefed by the caller.
4765 Return 0 on success, -1 on error */
4766static
4767int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4768{
Christian Heimes217cfd12007-12-02 14:31:20 +00004769 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004770 PyObject *x;
4771
4772 if (w == NULL)
4773 return -1;
4774 x = PyObject_GetItem(mapping, w);
4775 Py_DECREF(w);
4776 if (x == NULL) {
4777 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4778 /* No mapping found means: use 1:1 mapping. */
4779 PyErr_Clear();
4780 *result = NULL;
4781 return 0;
4782 } else
4783 return -1;
4784 }
4785 else if (x == Py_None) {
4786 *result = x;
4787 return 0;
4788 }
Christian Heimes217cfd12007-12-02 14:31:20 +00004789 else if (PyLong_Check(x)) {
4790 long value = PyLong_AS_LONG(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004791 long max = PyUnicode_GetMax();
4792 if (value < 0 || value > max) {
4793 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00004794 "character mapping must be in range(0x%x)", max+1);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004795 Py_DECREF(x);
4796 return -1;
4797 }
4798 *result = x;
4799 return 0;
4800 }
4801 else if (PyUnicode_Check(x)) {
4802 *result = x;
4803 return 0;
4804 }
4805 else {
4806 /* wrong return value */
4807 PyErr_SetString(PyExc_TypeError,
4808 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004809 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004810 return -1;
4811 }
4812}
4813/* ensure that *outobj is at least requiredsize characters long,
4814if not reallocate and adjust various state variables.
4815Return 0 on success, -1 on error */
4816static
Walter Dörwald4894c302003-10-24 14:25:28 +00004817int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004818 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004819{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004820 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004821 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004822 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004823 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004824 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004825 if (requiredsize < 2 * oldsize)
4826 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004827 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004828 return -1;
4829 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004830 }
4831 return 0;
4832}
4833/* lookup the character, put the result in the output string and adjust
4834 various state variables. Return a new reference to the object that
4835 was put in the output buffer in *result, or Py_None, if the mapping was
4836 undefined (in which case no character was written).
4837 The called must decref result.
4838 Return 0 on success, -1 on error. */
4839static
Walter Dörwald4894c302003-10-24 14:25:28 +00004840int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004841 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004842 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004843{
Walter Dörwald4894c302003-10-24 14:25:28 +00004844 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004845 return -1;
4846 if (*res==NULL) {
4847 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004848 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004849 }
4850 else if (*res==Py_None)
4851 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00004852 else if (PyLong_Check(*res)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004853 /* no overflow check, because we know that the space is enough */
Christian Heimes217cfd12007-12-02 14:31:20 +00004854 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004855 }
4856 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004857 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004858 if (repsize==1) {
4859 /* no overflow check, because we know that the space is enough */
4860 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4861 }
4862 else if (repsize!=0) {
4863 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004864 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004865 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004866 repsize - 1;
4867 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004868 return -1;
4869 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4870 *outp += repsize;
4871 }
4872 }
4873 else
4874 return -1;
4875 return 0;
4876}
4877
4878PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004879 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004880 PyObject *mapping,
4881 const char *errors)
4882{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004883 /* output object */
4884 PyObject *res = NULL;
4885 /* pointers to the beginning and end+1 of input */
4886 const Py_UNICODE *startp = p;
4887 const Py_UNICODE *endp = p + size;
4888 /* pointer into the output */
4889 Py_UNICODE *str;
4890 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004891 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004892 char *reason = "character maps to <undefined>";
4893 PyObject *errorHandler = NULL;
4894 PyObject *exc = NULL;
4895 /* the following variable is used for caching string comparisons
4896 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4897 * 3=ignore, 4=xmlcharrefreplace */
4898 int known_errorHandler = -1;
4899
Guido van Rossumd57fd912000-03-10 22:53:23 +00004900 if (mapping == NULL) {
4901 PyErr_BadArgument();
4902 return NULL;
4903 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004904
4905 /* allocate enough for a simple 1:1 translation without
4906 replacements, if we need more, we'll resize */
4907 res = PyUnicode_FromUnicode(NULL, size);
4908 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004909 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004910 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004911 return res;
4912 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004913
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004914 while (p<endp) {
4915 /* try to encode it */
4916 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004917 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004918 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004919 goto onError;
4920 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004921 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004922 if (x!=Py_None) /* it worked => adjust input pointer */
4923 ++p;
4924 else { /* untranslatable character */
4925 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004926 Py_ssize_t repsize;
4927 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004928 Py_UNICODE *uni2;
4929 /* startpos for collecting untranslatable chars */
4930 const Py_UNICODE *collstart = p;
4931 const Py_UNICODE *collend = p+1;
4932 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004933
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004934 /* find all untranslatable characters */
4935 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004936 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004937 goto onError;
4938 Py_XDECREF(x);
4939 if (x!=Py_None)
4940 break;
4941 ++collend;
4942 }
4943 /* cache callback name lookup
4944 * (if not done yet, i.e. it's the first error) */
4945 if (known_errorHandler==-1) {
4946 if ((errors==NULL) || (!strcmp(errors, "strict")))
4947 known_errorHandler = 1;
4948 else if (!strcmp(errors, "replace"))
4949 known_errorHandler = 2;
4950 else if (!strcmp(errors, "ignore"))
4951 known_errorHandler = 3;
4952 else if (!strcmp(errors, "xmlcharrefreplace"))
4953 known_errorHandler = 4;
4954 else
4955 known_errorHandler = 0;
4956 }
4957 switch (known_errorHandler) {
4958 case 1: /* strict */
4959 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4960 goto onError;
4961 case 2: /* replace */
4962 /* No need to check for space, this is a 1:1 replacement */
4963 for (coll = collstart; coll<collend; ++coll)
4964 *str++ = '?';
4965 /* fall through */
4966 case 3: /* ignore */
4967 p = collend;
4968 break;
4969 case 4: /* xmlcharrefreplace */
4970 /* generate replacement (temporarily (mis)uses p) */
4971 for (p = collstart; p < collend; ++p) {
4972 char buffer[2+29+1+1];
4973 char *cp;
4974 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004975 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004976 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4977 goto onError;
4978 for (cp = buffer; *cp; ++cp)
4979 *str++ = *cp;
4980 }
4981 p = collend;
4982 break;
4983 default:
4984 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4985 reason, startp, size, &exc,
4986 collstart-startp, collend-startp, &newpos);
4987 if (repunicode == NULL)
4988 goto onError;
4989 /* generate replacement */
4990 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004991 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004992 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4993 Py_DECREF(repunicode);
4994 goto onError;
4995 }
4996 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4997 *str++ = *uni2;
4998 p = startp + newpos;
4999 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005000 }
5001 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005002 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005003 /* Resize if we allocated to much */
5004 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005005 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00005006 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005007 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005008 }
5009 Py_XDECREF(exc);
5010 Py_XDECREF(errorHandler);
5011 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005012
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005013 onError:
5014 Py_XDECREF(res);
5015 Py_XDECREF(exc);
5016 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005017 return NULL;
5018}
5019
5020PyObject *PyUnicode_Translate(PyObject *str,
5021 PyObject *mapping,
5022 const char *errors)
5023{
5024 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005025
Guido van Rossumd57fd912000-03-10 22:53:23 +00005026 str = PyUnicode_FromObject(str);
5027 if (str == NULL)
5028 goto onError;
5029 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
5030 PyUnicode_GET_SIZE(str),
5031 mapping,
5032 errors);
5033 Py_DECREF(str);
5034 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005035
Guido van Rossumd57fd912000-03-10 22:53:23 +00005036 onError:
5037 Py_XDECREF(str);
5038 return NULL;
5039}
Tim Petersced69f82003-09-16 20:30:58 +00005040
Guido van Rossum9e896b32000-04-05 20:11:21 +00005041/* --- Decimal Encoder ---------------------------------------------------- */
5042
5043int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005044 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00005045 char *output,
5046 const char *errors)
5047{
5048 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005049 PyObject *errorHandler = NULL;
5050 PyObject *exc = NULL;
5051 const char *encoding = "decimal";
5052 const char *reason = "invalid decimal Unicode string";
5053 /* the following variable is used for caching string comparisons
5054 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5055 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005056
5057 if (output == NULL) {
5058 PyErr_BadArgument();
5059 return -1;
5060 }
5061
5062 p = s;
5063 end = s + length;
5064 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005065 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005066 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005067 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005068 Py_ssize_t repsize;
5069 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005070 Py_UNICODE *uni2;
5071 Py_UNICODE *collstart;
5072 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005073
Guido van Rossum9e896b32000-04-05 20:11:21 +00005074 if (Py_UNICODE_ISSPACE(ch)) {
5075 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005076 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005077 continue;
5078 }
5079 decimal = Py_UNICODE_TODECIMAL(ch);
5080 if (decimal >= 0) {
5081 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005082 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005083 continue;
5084 }
Guido van Rossumba477042000-04-06 18:18:10 +00005085 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00005086 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005087 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005088 continue;
5089 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005090 /* All other characters are considered unencodable */
5091 collstart = p;
5092 collend = p+1;
5093 while (collend < end) {
5094 if ((0 < *collend && *collend < 256) ||
5095 !Py_UNICODE_ISSPACE(*collend) ||
5096 Py_UNICODE_TODECIMAL(*collend))
5097 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005098 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005099 /* cache callback name lookup
5100 * (if not done yet, i.e. it's the first error) */
5101 if (known_errorHandler==-1) {
5102 if ((errors==NULL) || (!strcmp(errors, "strict")))
5103 known_errorHandler = 1;
5104 else if (!strcmp(errors, "replace"))
5105 known_errorHandler = 2;
5106 else if (!strcmp(errors, "ignore"))
5107 known_errorHandler = 3;
5108 else if (!strcmp(errors, "xmlcharrefreplace"))
5109 known_errorHandler = 4;
5110 else
5111 known_errorHandler = 0;
5112 }
5113 switch (known_errorHandler) {
5114 case 1: /* strict */
5115 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5116 goto onError;
5117 case 2: /* replace */
5118 for (p = collstart; p < collend; ++p)
5119 *output++ = '?';
5120 /* fall through */
5121 case 3: /* ignore */
5122 p = collend;
5123 break;
5124 case 4: /* xmlcharrefreplace */
5125 /* generate replacement (temporarily (mis)uses p) */
5126 for (p = collstart; p < collend; ++p)
5127 output += sprintf(output, "&#%d;", (int)*p);
5128 p = collend;
5129 break;
5130 default:
5131 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5132 encoding, reason, s, length, &exc,
5133 collstart-s, collend-s, &newpos);
5134 if (repunicode == NULL)
5135 goto onError;
5136 /* generate replacement */
5137 repsize = PyUnicode_GET_SIZE(repunicode);
5138 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5139 Py_UNICODE ch = *uni2;
5140 if (Py_UNICODE_ISSPACE(ch))
5141 *output++ = ' ';
5142 else {
5143 decimal = Py_UNICODE_TODECIMAL(ch);
5144 if (decimal >= 0)
5145 *output++ = '0' + decimal;
5146 else if (0 < ch && ch < 256)
5147 *output++ = (char)ch;
5148 else {
5149 Py_DECREF(repunicode);
5150 raise_encode_exception(&exc, encoding,
5151 s, length, collstart-s, collend-s, reason);
5152 goto onError;
5153 }
5154 }
5155 }
5156 p = s + newpos;
5157 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005158 }
5159 }
5160 /* 0-terminate the output string */
5161 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005162 Py_XDECREF(exc);
5163 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005164 return 0;
5165
5166 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005167 Py_XDECREF(exc);
5168 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005169 return -1;
5170}
5171
Guido van Rossumd57fd912000-03-10 22:53:23 +00005172/* --- Helpers ------------------------------------------------------------ */
5173
Eric Smith8c663262007-08-25 02:26:07 +00005174#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005175#include "stringlib/fastsearch.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005176#include "stringlib/count.h"
Christian Heimes9cd17752007-11-18 19:35:23 +00005177/* Include _ParseTupleFinds from find.h */
5178#define FROM_UNICODE
Thomas Wouters477c8d52006-05-27 19:21:47 +00005179#include "stringlib/find.h"
5180#include "stringlib/partition.h"
5181
/* helper macro to fixup start/end slice values

   Operates on variables named start and end that must exist in the
   scope where the macro is used; obj supplies the ->length they are
   normalized against.  A negative start has the length added and is
   then clamped to 0.  end is first capped at the length, then a negative
   end has the length added and is clamped to 0.

   The expansion is a plain sequence of if statements (not wrapped in
   do { } while (0)), so it must be used as a statement of its own and
   not as the unbraced body of an if/else or loop. */
#define FIX_START_END(obj)                      \
    if (start < 0)                              \
        start += (obj)->length;                 \
    if (start < 0)                              \
        start = 0;                              \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    if (end < 0)                                \
        end += (obj)->length;                   \
    if (end < 0)                                \
        end = 0;
5194
Martin v. Löwis18e16552006-02-15 17:27:45 +00005195Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005196 PyObject *substr,
5197 Py_ssize_t start,
5198 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005199{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005200 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005201 PyUnicodeObject* str_obj;
5202 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005203
Thomas Wouters477c8d52006-05-27 19:21:47 +00005204 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5205 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005206 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005207 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5208 if (!sub_obj) {
5209 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005210 return -1;
5211 }
Tim Petersced69f82003-09-16 20:30:58 +00005212
Thomas Wouters477c8d52006-05-27 19:21:47 +00005213 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005214
Thomas Wouters477c8d52006-05-27 19:21:47 +00005215 result = stringlib_count(
5216 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5217 );
5218
5219 Py_DECREF(sub_obj);
5220 Py_DECREF(str_obj);
5221
Guido van Rossumd57fd912000-03-10 22:53:23 +00005222 return result;
5223}
5224
Martin v. Löwis18e16552006-02-15 17:27:45 +00005225Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005226 PyObject *sub,
5227 Py_ssize_t start,
5228 Py_ssize_t end,
5229 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005230{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005231 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005232
Guido van Rossumd57fd912000-03-10 22:53:23 +00005233 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005234 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005235 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005236 sub = PyUnicode_FromObject(sub);
5237 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00005238 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005239 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005240 }
Tim Petersced69f82003-09-16 20:30:58 +00005241
Thomas Wouters477c8d52006-05-27 19:21:47 +00005242 if (direction > 0)
5243 result = stringlib_find_slice(
5244 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5245 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5246 start, end
5247 );
5248 else
5249 result = stringlib_rfind_slice(
5250 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5251 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5252 start, end
5253 );
5254
Guido van Rossumd57fd912000-03-10 22:53:23 +00005255 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005256 Py_DECREF(sub);
5257
Guido van Rossumd57fd912000-03-10 22:53:23 +00005258 return result;
5259}
5260
Tim Petersced69f82003-09-16 20:30:58 +00005261static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005262int tailmatch(PyUnicodeObject *self,
5263 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005264 Py_ssize_t start,
5265 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005266 int direction)
5267{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005268 if (substring->length == 0)
5269 return 1;
5270
Thomas Wouters477c8d52006-05-27 19:21:47 +00005271 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005272
5273 end -= substring->length;
5274 if (end < start)
5275 return 0;
5276
5277 if (direction > 0) {
5278 if (Py_UNICODE_MATCH(self, end, substring))
5279 return 1;
5280 } else {
5281 if (Py_UNICODE_MATCH(self, start, substring))
5282 return 1;
5283 }
5284
5285 return 0;
5286}
5287
Martin v. Löwis18e16552006-02-15 17:27:45 +00005288Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005289 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005290 Py_ssize_t start,
5291 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005292 int direction)
5293{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005294 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005295
Guido van Rossumd57fd912000-03-10 22:53:23 +00005296 str = PyUnicode_FromObject(str);
5297 if (str == NULL)
5298 return -1;
5299 substr = PyUnicode_FromObject(substr);
5300 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005301 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005302 return -1;
5303 }
Tim Petersced69f82003-09-16 20:30:58 +00005304
Guido van Rossumd57fd912000-03-10 22:53:23 +00005305 result = tailmatch((PyUnicodeObject *)str,
5306 (PyUnicodeObject *)substr,
5307 start, end, direction);
5308 Py_DECREF(str);
5309 Py_DECREF(substr);
5310 return result;
5311}
5312
Guido van Rossumd57fd912000-03-10 22:53:23 +00005313/* Apply fixfct filter to the Unicode object self and return a
5314 reference to the modified object */
5315
Tim Petersced69f82003-09-16 20:30:58 +00005316static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005317PyObject *fixup(PyUnicodeObject *self,
5318 int (*fixfct)(PyUnicodeObject *s))
5319{
5320
5321 PyUnicodeObject *u;
5322
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005323 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005324 if (u == NULL)
5325 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005326
5327 Py_UNICODE_COPY(u->str, self->str, self->length);
5328
Tim Peters7a29bd52001-09-12 03:03:31 +00005329 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005330 /* fixfct should return TRUE if it modified the buffer. If
5331 FALSE, return a reference to the original buffer instead
5332 (to save space, not time) */
5333 Py_INCREF(self);
5334 Py_DECREF(u);
5335 return (PyObject*) self;
5336 }
5337 return (PyObject*) u;
5338}
5339
Tim Petersced69f82003-09-16 20:30:58 +00005340static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005341int fixupper(PyUnicodeObject *self)
5342{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005343 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005344 Py_UNICODE *s = self->str;
5345 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005346
Guido van Rossumd57fd912000-03-10 22:53:23 +00005347 while (len-- > 0) {
5348 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005349
Guido van Rossumd57fd912000-03-10 22:53:23 +00005350 ch = Py_UNICODE_TOUPPER(*s);
5351 if (ch != *s) {
5352 status = 1;
5353 *s = ch;
5354 }
5355 s++;
5356 }
5357
5358 return status;
5359}
5360
Tim Petersced69f82003-09-16 20:30:58 +00005361static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005362int fixlower(PyUnicodeObject *self)
5363{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005364 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005365 Py_UNICODE *s = self->str;
5366 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005367
Guido van Rossumd57fd912000-03-10 22:53:23 +00005368 while (len-- > 0) {
5369 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005370
Guido van Rossumd57fd912000-03-10 22:53:23 +00005371 ch = Py_UNICODE_TOLOWER(*s);
5372 if (ch != *s) {
5373 status = 1;
5374 *s = ch;
5375 }
5376 s++;
5377 }
5378
5379 return status;
5380}
5381
Tim Petersced69f82003-09-16 20:30:58 +00005382static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005383int fixswapcase(PyUnicodeObject *self)
5384{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005385 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005386 Py_UNICODE *s = self->str;
5387 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005388
Guido van Rossumd57fd912000-03-10 22:53:23 +00005389 while (len-- > 0) {
5390 if (Py_UNICODE_ISUPPER(*s)) {
5391 *s = Py_UNICODE_TOLOWER(*s);
5392 status = 1;
5393 } else if (Py_UNICODE_ISLOWER(*s)) {
5394 *s = Py_UNICODE_TOUPPER(*s);
5395 status = 1;
5396 }
5397 s++;
5398 }
5399
5400 return status;
5401}
5402
Tim Petersced69f82003-09-16 20:30:58 +00005403static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005404int fixcapitalize(PyUnicodeObject *self)
5405{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005406 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005407 Py_UNICODE *s = self->str;
5408 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005409
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005410 if (len == 0)
5411 return 0;
5412 if (Py_UNICODE_ISLOWER(*s)) {
5413 *s = Py_UNICODE_TOUPPER(*s);
5414 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005415 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005416 s++;
5417 while (--len > 0) {
5418 if (Py_UNICODE_ISUPPER(*s)) {
5419 *s = Py_UNICODE_TOLOWER(*s);
5420 status = 1;
5421 }
5422 s++;
5423 }
5424 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005425}
5426
5427static
5428int fixtitle(PyUnicodeObject *self)
5429{
5430 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5431 register Py_UNICODE *e;
5432 int previous_is_cased;
5433
5434 /* Shortcut for single character strings */
5435 if (PyUnicode_GET_SIZE(self) == 1) {
5436 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5437 if (*p != ch) {
5438 *p = ch;
5439 return 1;
5440 }
5441 else
5442 return 0;
5443 }
Tim Petersced69f82003-09-16 20:30:58 +00005444
Guido van Rossumd57fd912000-03-10 22:53:23 +00005445 e = p + PyUnicode_GET_SIZE(self);
5446 previous_is_cased = 0;
5447 for (; p < e; p++) {
5448 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005449
Guido van Rossumd57fd912000-03-10 22:53:23 +00005450 if (previous_is_cased)
5451 *p = Py_UNICODE_TOLOWER(ch);
5452 else
5453 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005454
5455 if (Py_UNICODE_ISLOWER(ch) ||
5456 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005457 Py_UNICODE_ISTITLE(ch))
5458 previous_is_cased = 1;
5459 else
5460 previous_is_cased = 0;
5461 }
5462 return 1;
5463}
5464
Tim Peters8ce9f162004-08-27 01:49:32 +00005465PyObject *
5466PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005467{
Tim Peters8ce9f162004-08-27 01:49:32 +00005468 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005469 const Py_UNICODE blank = ' ';
5470 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005471 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005472 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005473 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5474 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005475 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5476 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005477 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005478 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005479 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005480
Tim Peters05eba1f2004-08-27 21:32:02 +00005481 fseq = PySequence_Fast(seq, "");
5482 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005483 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005484 }
5485
Tim Peters91879ab2004-08-27 22:35:44 +00005486 /* Grrrr. A codec may be invoked to convert str objects to
5487 * Unicode, and so it's possible to call back into Python code
5488 * during PyUnicode_FromObject(), and so it's possible for a sick
5489 * codec to change the size of fseq (if seq is a list). Therefore
5490 * we have to keep refetching the size -- can't assume seqlen
5491 * is invariant.
5492 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005493 seqlen = PySequence_Fast_GET_SIZE(fseq);
5494 /* If empty sequence, return u"". */
5495 if (seqlen == 0) {
5496 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5497 goto Done;
5498 }
5499 /* If singleton sequence with an exact Unicode, return that. */
5500 if (seqlen == 1) {
5501 item = PySequence_Fast_GET_ITEM(fseq, 0);
5502 if (PyUnicode_CheckExact(item)) {
5503 Py_INCREF(item);
5504 res = (PyUnicodeObject *)item;
5505 goto Done;
5506 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005507 }
5508
Tim Peters05eba1f2004-08-27 21:32:02 +00005509 /* At least two items to join, or one that isn't exact Unicode. */
5510 if (seqlen > 1) {
5511 /* Set up sep and seplen -- they're needed. */
5512 if (separator == NULL) {
5513 sep = &blank;
5514 seplen = 1;
5515 }
5516 else {
5517 internal_separator = PyUnicode_FromObject(separator);
5518 if (internal_separator == NULL)
5519 goto onError;
5520 sep = PyUnicode_AS_UNICODE(internal_separator);
5521 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00005522 /* In case PyUnicode_FromObject() mutated seq. */
5523 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005524 }
5525 }
5526
5527 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005528 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005529 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005530 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005531 res_p = PyUnicode_AS_UNICODE(res);
5532 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005533
Tim Peters05eba1f2004-08-27 21:32:02 +00005534 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005535 Py_ssize_t itemlen;
5536 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005537
5538 item = PySequence_Fast_GET_ITEM(fseq, i);
5539 /* Convert item to Unicode. */
Guido van Rossum98297ee2007-11-06 21:34:58 +00005540 if (!PyUnicode_Check(item)) {
5541 PyErr_Format(PyExc_TypeError,
5542 "sequence item %zd: expected str instance,"
5543 " %.80s found",
Christian Heimes90aa7642007-12-19 02:45:37 +00005544 i, Py_TYPE(item)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005545 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005546 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005547 item = PyUnicode_FromObject(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005548 if (item == NULL)
5549 goto onError;
5550 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005551
Tim Peters91879ab2004-08-27 22:35:44 +00005552 /* In case PyUnicode_FromObject() mutated seq. */
5553 seqlen = PySequence_Fast_GET_SIZE(fseq);
5554
Tim Peters8ce9f162004-08-27 01:49:32 +00005555 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005556 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005557 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005558 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005559 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005560 if (i < seqlen - 1) {
5561 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005562 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00005563 goto Overflow;
5564 }
5565 if (new_res_used > res_alloc) {
5566 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00005567 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00005568 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005569 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005570 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005571 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005572 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005573 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005574 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005575 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005576 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005577 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005578
5579 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005580 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005581 res_p += itemlen;
5582 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005583 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005584 res_p += seplen;
5585 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005586 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005587 res_used = new_res_used;
5588 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005589
Tim Peters05eba1f2004-08-27 21:32:02 +00005590 /* Shrink res to match the used area; this probably can't fail,
5591 * but it's cheap to check.
5592 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005593 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005594 goto onError;
5595
5596 Done:
5597 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005598 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005599 return (PyObject *)res;
5600
Tim Peters8ce9f162004-08-27 01:49:32 +00005601 Overflow:
5602 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005603 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005604 Py_DECREF(item);
5605 /* fall through */
5606
Guido van Rossumd57fd912000-03-10 22:53:23 +00005607 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005608 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005609 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005610 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005611 return NULL;
5612}
5613
Tim Petersced69f82003-09-16 20:30:58 +00005614static
5615PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005616 Py_ssize_t left,
5617 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005618 Py_UNICODE fill)
5619{
5620 PyUnicodeObject *u;
5621
5622 if (left < 0)
5623 left = 0;
5624 if (right < 0)
5625 right = 0;
5626
Tim Peters7a29bd52001-09-12 03:03:31 +00005627 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005628 Py_INCREF(self);
5629 return self;
5630 }
5631
5632 u = _PyUnicode_New(left + self->length + right);
5633 if (u) {
5634 if (left)
5635 Py_UNICODE_FILL(u->str, fill, left);
5636 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5637 if (right)
5638 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5639 }
5640
5641 return u;
5642}
5643
/* Append the slice data[left:right] to `list` as a new unicode object.

   Expects the expanding function to provide a `PyObject *str` scratch
   variable, a `PyObject *list` and an `onError` label; jumps to
   onError if the slice cannot be created or appended.  Wrapped in
   do { } while (0) so that an invocation followed by ';' is exactly
   one statement and is safe inside an unbraced if/else. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
5654
5655static
5656PyObject *split_whitespace(PyUnicodeObject *self,
5657 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005658 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005659{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005660 register Py_ssize_t i;
5661 register Py_ssize_t j;
5662 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005663 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005664 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005665
5666 for (i = j = 0; i < len; ) {
5667 /* find a token */
Christian Heimes190d79e2008-01-30 11:58:22 +00005668 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005669 i++;
5670 j = i;
Christian Heimes190d79e2008-01-30 11:58:22 +00005671 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005672 i++;
5673 if (j < i) {
5674 if (maxcount-- <= 0)
5675 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005676 SPLIT_APPEND(buf, j, i);
5677 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005678 i++;
5679 j = i;
5680 }
5681 }
5682 if (j < len) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005683 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005684 }
5685 return list;
5686
5687 onError:
5688 Py_DECREF(list);
5689 return NULL;
5690}
5691
5692PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005693 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005694{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005695 register Py_ssize_t i;
5696 register Py_ssize_t j;
5697 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005698 PyObject *list;
5699 PyObject *str;
5700 Py_UNICODE *data;
5701
5702 string = PyUnicode_FromObject(string);
5703 if (string == NULL)
5704 return NULL;
5705 data = PyUnicode_AS_UNICODE(string);
5706 len = PyUnicode_GET_SIZE(string);
5707
Guido van Rossumd57fd912000-03-10 22:53:23 +00005708 list = PyList_New(0);
5709 if (!list)
5710 goto onError;
5711
5712 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005713 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005714
Guido van Rossumd57fd912000-03-10 22:53:23 +00005715 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005716 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005717 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005718
5719 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005720 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005721 if (i < len) {
5722 if (data[i] == '\r' && i + 1 < len &&
5723 data[i+1] == '\n')
5724 i += 2;
5725 else
5726 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005727 if (keepends)
5728 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005729 }
Guido van Rossum86662912000-04-11 15:38:46 +00005730 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005731 j = i;
5732 }
5733 if (j < len) {
5734 SPLIT_APPEND(data, j, len);
5735 }
5736
5737 Py_DECREF(string);
5738 return list;
5739
5740 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005741 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005742 Py_DECREF(string);
5743 return NULL;
5744}
5745
Tim Petersced69f82003-09-16 20:30:58 +00005746static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005747PyObject *split_char(PyUnicodeObject *self,
5748 PyObject *list,
5749 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005750 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005751{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005752 register Py_ssize_t i;
5753 register Py_ssize_t j;
5754 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005755 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005756 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005757
5758 for (i = j = 0; i < len; ) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005759 if (buf[i] == ch) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005760 if (maxcount-- <= 0)
5761 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005762 SPLIT_APPEND(buf, j, i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005763 i = j = i + 1;
5764 } else
5765 i++;
5766 }
5767 if (j <= len) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005768 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005769 }
5770 return list;
5771
5772 onError:
5773 Py_DECREF(list);
5774 return NULL;
5775}
5776
Tim Petersced69f82003-09-16 20:30:58 +00005777static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005778PyObject *split_substring(PyUnicodeObject *self,
5779 PyObject *list,
5780 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005781 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005782{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005783 register Py_ssize_t i;
5784 register Py_ssize_t j;
5785 Py_ssize_t len = self->length;
5786 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005787 PyObject *str;
5788
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005789 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005790 if (Py_UNICODE_MATCH(self, i, substring)) {
5791 if (maxcount-- <= 0)
5792 break;
5793 SPLIT_APPEND(self->str, j, i);
5794 i = j = i + sublen;
5795 } else
5796 i++;
5797 }
5798 if (j <= len) {
5799 SPLIT_APPEND(self->str, j, len);
5800 }
5801 return list;
5802
5803 onError:
5804 Py_DECREF(list);
5805 return NULL;
5806}
5807
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005808static
5809PyObject *rsplit_whitespace(PyUnicodeObject *self,
5810 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005811 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005812{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005813 register Py_ssize_t i;
5814 register Py_ssize_t j;
5815 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005816 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005817 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005818
5819 for (i = j = len - 1; i >= 0; ) {
5820 /* find a token */
Christian Heimes190d79e2008-01-30 11:58:22 +00005821 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005822 i--;
5823 j = i;
Christian Heimes190d79e2008-01-30 11:58:22 +00005824 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005825 i--;
5826 if (j > i) {
5827 if (maxcount-- <= 0)
5828 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005829 SPLIT_APPEND(buf, i + 1, j + 1);
5830 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005831 i--;
5832 j = i;
5833 }
5834 }
5835 if (j >= 0) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005836 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005837 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005838 if (PyList_Reverse(list) < 0)
5839 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005840 return list;
5841
5842 onError:
5843 Py_DECREF(list);
5844 return NULL;
5845}
5846
5847static
5848PyObject *rsplit_char(PyUnicodeObject *self,
5849 PyObject *list,
5850 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005851 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005852{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005853 register Py_ssize_t i;
5854 register Py_ssize_t j;
5855 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005856 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005857 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005858
5859 for (i = j = len - 1; i >= 0; ) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005860 if (buf[i] == ch) {
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005861 if (maxcount-- <= 0)
5862 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005863 SPLIT_APPEND(buf, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005864 j = i = i - 1;
5865 } else
5866 i--;
5867 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005868 if (j >= -1) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005869 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005870 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005871 if (PyList_Reverse(list) < 0)
5872 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005873 return list;
5874
5875 onError:
5876 Py_DECREF(list);
5877 return NULL;
5878}
5879
5880static
5881PyObject *rsplit_substring(PyUnicodeObject *self,
5882 PyObject *list,
5883 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005884 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005885{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005886 register Py_ssize_t i;
5887 register Py_ssize_t j;
5888 Py_ssize_t len = self->length;
5889 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005890 PyObject *str;
5891
5892 for (i = len - sublen, j = len; i >= 0; ) {
5893 if (Py_UNICODE_MATCH(self, i, substring)) {
5894 if (maxcount-- <= 0)
5895 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005896 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005897 j = i;
5898 i -= sublen;
5899 } else
5900 i--;
5901 }
5902 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005903 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005904 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005905 if (PyList_Reverse(list) < 0)
5906 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005907 return list;
5908
5909 onError:
5910 Py_DECREF(list);
5911 return NULL;
5912}
5913
Guido van Rossumd57fd912000-03-10 22:53:23 +00005914#undef SPLIT_APPEND
5915
5916static
5917PyObject *split(PyUnicodeObject *self,
5918 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005919 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005920{
5921 PyObject *list;
5922
5923 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005924 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005925
5926 list = PyList_New(0);
5927 if (!list)
5928 return NULL;
5929
5930 if (substring == NULL)
5931 return split_whitespace(self,list,maxcount);
5932
5933 else if (substring->length == 1)
5934 return split_char(self,list,substring->str[0],maxcount);
5935
5936 else if (substring->length == 0) {
5937 Py_DECREF(list);
5938 PyErr_SetString(PyExc_ValueError, "empty separator");
5939 return NULL;
5940 }
5941 else
5942 return split_substring(self,list,substring,maxcount);
5943}
5944
Tim Petersced69f82003-09-16 20:30:58 +00005945static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005946PyObject *rsplit(PyUnicodeObject *self,
5947 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005948 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005949{
5950 PyObject *list;
5951
5952 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005953 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005954
5955 list = PyList_New(0);
5956 if (!list)
5957 return NULL;
5958
5959 if (substring == NULL)
5960 return rsplit_whitespace(self,list,maxcount);
5961
5962 else if (substring->length == 1)
5963 return rsplit_char(self,list,substring->str[0],maxcount);
5964
5965 else if (substring->length == 0) {
5966 Py_DECREF(list);
5967 PyErr_SetString(PyExc_ValueError, "empty separator");
5968 return NULL;
5969 }
5970 else
5971 return rsplit_substring(self,list,substring,maxcount);
5972}
5973
5974static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005975PyObject *replace(PyUnicodeObject *self,
5976 PyUnicodeObject *str1,
5977 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005978 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005979{
5980 PyUnicodeObject *u;
5981
5982 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005983 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005984
Thomas Wouters477c8d52006-05-27 19:21:47 +00005985 if (str1->length == str2->length) {
5986 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005987 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005988 if (str1->length == 1) {
5989 /* replace characters */
5990 Py_UNICODE u1, u2;
5991 if (!findchar(self->str, self->length, str1->str[0]))
5992 goto nothing;
5993 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5994 if (!u)
5995 return NULL;
5996 Py_UNICODE_COPY(u->str, self->str, self->length);
5997 u1 = str1->str[0];
5998 u2 = str2->str[0];
5999 for (i = 0; i < u->length; i++)
6000 if (u->str[i] == u1) {
6001 if (--maxcount < 0)
6002 break;
6003 u->str[i] = u2;
6004 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006005 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006006 i = fastsearch(
6007 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00006008 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006009 if (i < 0)
6010 goto nothing;
6011 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6012 if (!u)
6013 return NULL;
6014 Py_UNICODE_COPY(u->str, self->str, self->length);
6015 while (i <= self->length - str1->length)
6016 if (Py_UNICODE_MATCH(self, i, str1)) {
6017 if (--maxcount < 0)
6018 break;
6019 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6020 i += str1->length;
6021 } else
6022 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006024 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006025
6026 Py_ssize_t n, i, j, e;
6027 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028 Py_UNICODE *p;
6029
6030 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006031 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006032 if (n > maxcount)
6033 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006034 if (n == 0)
6035 goto nothing;
6036 /* new_size = self->length + n * (str2->length - str1->length)); */
6037 delta = (str2->length - str1->length);
6038 if (delta == 0) {
6039 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006040 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006041 product = n * (str2->length - str1->length);
6042 if ((product / (str2->length - str1->length)) != n) {
6043 PyErr_SetString(PyExc_OverflowError,
6044 "replace string is too long");
6045 return NULL;
6046 }
6047 new_size = self->length + product;
6048 if (new_size < 0) {
6049 PyErr_SetString(PyExc_OverflowError,
6050 "replace string is too long");
6051 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006052 }
6053 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006054 u = _PyUnicode_New(new_size);
6055 if (!u)
6056 return NULL;
6057 i = 0;
6058 p = u->str;
6059 e = self->length - str1->length;
6060 if (str1->length > 0) {
6061 while (n-- > 0) {
6062 /* look for next match */
6063 j = i;
6064 while (j <= e) {
6065 if (Py_UNICODE_MATCH(self, j, str1))
6066 break;
6067 j++;
6068 }
6069 if (j > i) {
6070 if (j > e)
6071 break;
6072 /* copy unchanged part [i:j] */
6073 Py_UNICODE_COPY(p, self->str+i, j-i);
6074 p += j - i;
6075 }
6076 /* copy substitution string */
6077 if (str2->length > 0) {
6078 Py_UNICODE_COPY(p, str2->str, str2->length);
6079 p += str2->length;
6080 }
6081 i = j + str1->length;
6082 }
6083 if (i < self->length)
6084 /* copy tail [i:] */
6085 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6086 } else {
6087 /* interleave */
6088 while (n > 0) {
6089 Py_UNICODE_COPY(p, str2->str, str2->length);
6090 p += str2->length;
6091 if (--n <= 0)
6092 break;
6093 *p++ = self->str[i++];
6094 }
6095 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6096 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006097 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006098 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006099
6100nothing:
6101 /* nothing to replace; return original string (when possible) */
6102 if (PyUnicode_CheckExact(self)) {
6103 Py_INCREF(self);
6104 return (PyObject *) self;
6105 }
6106 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006107}
6108
6109/* --- Unicode Object Methods --------------------------------------------- */
6110
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006111PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006112"S.title() -> unicode\n\
6113\n\
6114Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006115characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006116
6117static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006118unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006119{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006120 return fixup(self, fixtitle);
6121}
6122
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006123PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006124"S.capitalize() -> unicode\n\
6125\n\
6126Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006127have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006128
6129static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006130unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006131{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006132 return fixup(self, fixcapitalize);
6133}
6134
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled: split self on whitespace, capitalize every word in place
   in the list, then join the words with single blanks.  Returns a new
   reference or NULL. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t idx = 0;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list slot */
    while (idx < PyList_GET_SIZE(words)) {
        PyObject *old = PyList_GET_ITEM(words, idx);

        result = fixup((PyUnicodeObject *)old, fixcapitalize);
        if (result == NULL)
            goto done;      /* result == NULL is what gets returned */
        Py_DECREF(old);
        PyList_SET_ITEM(words, idx, result);
        idx++;
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
6172
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006173/* Argument converter. Coerces to a single unicode character */
6174
6175static int
6176convert_uc(PyObject *obj, void *addr)
6177{
6178 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6179 PyObject *uniobj;
6180 Py_UNICODE *unistr;
6181
6182 uniobj = PyUnicode_FromObject(obj);
6183 if (uniobj == NULL) {
6184 PyErr_SetString(PyExc_TypeError,
6185 "The fill character cannot be converted to Unicode");
6186 return 0;
6187 }
6188 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6189 PyErr_SetString(PyExc_TypeError,
6190 "The fill character must be exactly one character long");
6191 Py_DECREF(uniobj);
6192 return 0;
6193 }
6194 unistr = PyUnicode_AS_UNICODE(uniobj);
6195 *fillcharloc = unistr[0];
6196 Py_DECREF(uniobj);
6197 return 1;
6198}
6199
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006200PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006201"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006202\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006203Return S centered in a Unicode string of length width. Padding is\n\
6204done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006205
6206static PyObject *
6207unicode_center(PyUnicodeObject *self, PyObject *args)
6208{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006209 Py_ssize_t marg, left;
6210 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006211 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006212
Thomas Woutersde017742006-02-16 19:34:37 +00006213 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006214 return NULL;
6215
Tim Peters7a29bd52001-09-12 03:03:31 +00006216 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006217 Py_INCREF(self);
6218 return (PyObject*) self;
6219 }
6220
6221 marg = width - self->length;
6222 left = marg / 2 + (marg & width & 1);
6223
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006224 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006225}
6226
Marc-André Lemburge5034372000-08-08 08:04:29 +00006227#if 0
6228
6229/* This code should go into some future Unicode collation support
6230 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006231 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006232
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006233/* speedy UTF-16 code point order comparison */
6234/* gleaned from: */
6235/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6236
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006237static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006238{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006239 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006240 0, 0, 0, 0, 0, 0, 0, 0,
6241 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006242 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006243};
6244
Guido van Rossumd57fd912000-03-10 22:53:23 +00006245static int
6246unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6247{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006248 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006249
Guido van Rossumd57fd912000-03-10 22:53:23 +00006250 Py_UNICODE *s1 = str1->str;
6251 Py_UNICODE *s2 = str2->str;
6252
6253 len1 = str1->length;
6254 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006255
Guido van Rossumd57fd912000-03-10 22:53:23 +00006256 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006257 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006258
6259 c1 = *s1++;
6260 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006261
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006262 if (c1 > (1<<11) * 26)
6263 c1 += utf16Fixup[c1>>11];
6264 if (c2 > (1<<11) * 26)
6265 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006266 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006267
6268 if (c1 != c2)
6269 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006270
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006271 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006272 }
6273
6274 return (len1 < len2) ? -1 : (len1 != len2);
6275}
6276
Marc-André Lemburge5034372000-08-08 08:04:29 +00006277#else
6278
6279static int
6280unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6281{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006282 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006283
6284 Py_UNICODE *s1 = str1->str;
6285 Py_UNICODE *s2 = str2->str;
6286
6287 len1 = str1->length;
6288 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006289
Marc-André Lemburge5034372000-08-08 08:04:29 +00006290 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006291 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006292
Fredrik Lundh45714e92001-06-26 16:39:36 +00006293 c1 = *s1++;
6294 c2 = *s2++;
6295
6296 if (c1 != c2)
6297 return (c1 < c2) ? -1 : 1;
6298
Marc-André Lemburge5034372000-08-08 08:04:29 +00006299 len1--; len2--;
6300 }
6301
6302 return (len1 < len2) ? -1 : (len1 != len2);
6303}
6304
6305#endif
6306
Guido van Rossumd57fd912000-03-10 22:53:23 +00006307int PyUnicode_Compare(PyObject *left,
6308 PyObject *right)
6309{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006310 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6311 return unicode_compare((PyUnicodeObject *)left,
6312 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006313 PyErr_Format(PyExc_TypeError,
6314 "Can't compare %.100s and %.100s",
6315 left->ob_type->tp_name,
6316 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006317 return -1;
6318}
6319
Martin v. Löwis5b222132007-06-10 09:51:05 +00006320int
6321PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6322{
6323 int i;
6324 Py_UNICODE *id;
6325 assert(PyUnicode_Check(uni));
6326 id = PyUnicode_AS_UNICODE(uni);
6327 /* Compare Unicode string and source character set string */
6328 for (i = 0; id[i] && str[i]; i++)
6329 if (id[i] != str[i])
6330 return ((int)id[i] < (int)str[i]) ? -1 : 1;
6331 if (id[i])
6332 return 1; /* uni is longer */
6333 if (str[i])
6334 return -1; /* str is longer */
6335 return 0;
6336}
6337
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006338PyObject *PyUnicode_RichCompare(PyObject *left,
6339 PyObject *right,
6340 int op)
6341{
6342 int result;
6343
6344 result = PyUnicode_Compare(left, right);
6345 if (result == -1 && PyErr_Occurred())
6346 goto onError;
6347
6348 /* Convert the return value to a Boolean */
6349 switch (op) {
6350 case Py_EQ:
6351 result = (result == 0);
6352 break;
6353 case Py_NE:
6354 result = (result != 0);
6355 break;
6356 case Py_LE:
6357 result = (result <= 0);
6358 break;
6359 case Py_GE:
6360 result = (result >= 0);
6361 break;
6362 case Py_LT:
6363 result = (result == -1);
6364 break;
6365 case Py_GT:
6366 result = (result == 1);
6367 break;
6368 }
6369 return PyBool_FromLong(result);
6370
6371 onError:
6372
6373 /* Standard case
6374
6375 Type errors mean that PyUnicode_FromObject() could not convert
6376 one of the arguments (usually the right hand side) to Unicode,
6377 ie. we can't handle the comparison request. However, it is
6378 possible that the other object knows a comparison method, which
6379 is why we return Py_NotImplemented to give the other object a
6380 chance.
6381
6382 */
6383 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6384 PyErr_Clear();
6385 Py_INCREF(Py_NotImplemented);
6386 return Py_NotImplemented;
6387 }
6388 if (op != Py_EQ && op != Py_NE)
6389 return NULL;
6390
6391 /* Equality comparison.
6392
6393 This is a special case: we silence any PyExc_UnicodeDecodeError
6394 and instead turn it into a PyErr_UnicodeWarning.
6395
6396 */
6397 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6398 return NULL;
6399 PyErr_Clear();
Skip Montanaro46fc3372007-08-12 11:44:53 +00006400 if (PyErr_WarnEx(PyExc_UnicodeWarning,
6401 (op == Py_EQ) ?
6402 "Unicode equal comparison "
6403 "failed to convert both arguments to Unicode - "
6404 "interpreting them as being unequal"
6405 :
6406 "Unicode unequal comparison "
6407 "failed to convert both arguments to Unicode - "
6408 "interpreting them as being unequal",
6409 1) < 0)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006410 return NULL;
6411 result = (op == Py_NE);
6412 return PyBool_FromLong(result);
6413}
6414
Guido van Rossum403d68b2000-03-13 15:55:09 +00006415int PyUnicode_Contains(PyObject *container,
6416 PyObject *element)
6417{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006418 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006419 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006420
6421 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006422 sub = PyUnicode_FromObject(element);
6423 if (!sub) {
Walter Dörwald26e0f512007-06-12 16:51:31 +00006424 PyErr_Format(PyExc_TypeError,
6425 "'in <string>' requires string as left operand, not %s",
6426 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006427 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006428 }
6429
Thomas Wouters477c8d52006-05-27 19:21:47 +00006430 str = PyUnicode_FromObject(container);
6431 if (!str) {
6432 Py_DECREF(sub);
6433 return -1;
6434 }
6435
6436 result = stringlib_contains_obj(str, sub);
6437
6438 Py_DECREF(str);
6439 Py_DECREF(sub);
6440
Guido van Rossum403d68b2000-03-13 15:55:09 +00006441 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006442}
6443
Guido van Rossumd57fd912000-03-10 22:53:23 +00006444/* Concat to string or Unicode object giving a new Unicode object. */
6445
6446PyObject *PyUnicode_Concat(PyObject *left,
6447 PyObject *right)
6448{
6449 PyUnicodeObject *u = NULL, *v = NULL, *w;
6450
6451 /* Coerce the two arguments */
6452 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6453 if (u == NULL)
6454 goto onError;
6455 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6456 if (v == NULL)
6457 goto onError;
6458
6459 /* Shortcuts */
6460 if (v == unicode_empty) {
6461 Py_DECREF(v);
6462 return (PyObject *)u;
6463 }
6464 if (u == unicode_empty) {
6465 Py_DECREF(u);
6466 return (PyObject *)v;
6467 }
6468
6469 /* Concat the two Unicode strings */
6470 w = _PyUnicode_New(u->length + v->length);
6471 if (w == NULL)
6472 goto onError;
6473 Py_UNICODE_COPY(w->str, u->str, u->length);
6474 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6475
6476 Py_DECREF(u);
6477 Py_DECREF(v);
6478 return (PyObject *)w;
6479
6480onError:
6481 Py_XDECREF(u);
6482 Py_XDECREF(v);
6483 return NULL;
6484}
6485
Walter Dörwald1ab83302007-05-18 17:15:44 +00006486void
6487PyUnicode_Append(PyObject **pleft, PyObject *right)
6488{
6489 PyObject *new;
6490 if (*pleft == NULL)
6491 return;
6492 if (right == NULL || !PyUnicode_Check(*pleft)) {
6493 Py_DECREF(*pleft);
6494 *pleft = NULL;
6495 return;
6496 }
6497 new = PyUnicode_Concat(*pleft, right);
6498 Py_DECREF(*pleft);
6499 *pleft = new;
6500}
6501
6502void
6503PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6504{
6505 PyUnicode_Append(pleft, right);
6506 Py_XDECREF(right);
6507}
6508
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006509PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006510"S.count(sub[, start[, end]]) -> int\n\
6511\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006512Return the number of non-overlapping occurrences of substring sub in\n\
6513Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006514interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006515
6516static PyObject *
6517unicode_count(PyUnicodeObject *self, PyObject *args)
6518{
6519 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006520 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006521 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006522 PyObject *result;
6523
Guido van Rossumb8872e62000-05-09 14:14:27 +00006524 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6525 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006526 return NULL;
6527
6528 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006529 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006530 if (substring == NULL)
6531 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006532
Thomas Wouters477c8d52006-05-27 19:21:47 +00006533 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006534
Christian Heimes217cfd12007-12-02 14:31:20 +00006535 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006536 stringlib_count(self->str + start, end - start,
6537 substring->str, substring->length)
6538 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006539
6540 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006541
Guido van Rossumd57fd912000-03-10 22:53:23 +00006542 return result;
6543}
6544
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006545PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006546"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006547\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006548Encodes S using the codec registered for encoding. encoding defaults\n\
6549to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006550handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006551a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6552'xmlcharrefreplace' as well as any other name registered with\n\
6553codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006554
6555static PyObject *
6556unicode_encode(PyUnicodeObject *self, PyObject *args)
6557{
6558 char *encoding = NULL;
6559 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006560 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00006561
Guido van Rossumd57fd912000-03-10 22:53:23 +00006562 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6563 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006564 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006565 if (v == NULL)
6566 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00006567 if (!PyString_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006568 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006569 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006570 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00006571 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006572 Py_DECREF(v);
6573 return NULL;
6574 }
6575 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006576
6577 onError:
6578 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006579}
6580
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006581PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006582"S.expandtabs([tabsize]) -> unicode\n\
6583\n\
6584Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006585If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006586
6587static PyObject*
6588unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6589{
6590 Py_UNICODE *e;
6591 Py_UNICODE *p;
6592 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006593 Py_UNICODE *qe;
6594 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006595 PyUnicodeObject *u;
6596 int tabsize = 8;
6597
6598 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6599 return NULL;
6600
Thomas Wouters7e474022000-07-16 12:04:32 +00006601 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006602 i = 0; /* chars up to and including most recent \n or \r */
6603 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6604 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006605 for (p = self->str; p < e; p++)
6606 if (*p == '\t') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006607 if (tabsize > 0) {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006608 incr = tabsize - (j % tabsize); /* cannot overflow */
6609 if (j > PY_SSIZE_T_MAX - incr)
6610 goto overflow1;
6611 j += incr;
6612 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006613 }
6614 else {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006615 if (j > PY_SSIZE_T_MAX - 1)
6616 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006617 j++;
6618 if (*p == '\n' || *p == '\r') {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006619 if (i > PY_SSIZE_T_MAX - j)
6620 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006621 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006622 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006623 }
6624 }
6625
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006626 if (i > PY_SSIZE_T_MAX - j)
6627 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006628
Guido van Rossumd57fd912000-03-10 22:53:23 +00006629 /* Second pass: create output string and fill it */
6630 u = _PyUnicode_New(i + j);
6631 if (!u)
6632 return NULL;
6633
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006634 j = 0; /* same as in first pass */
6635 q = u->str; /* next output char */
6636 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006637
6638 for (p = self->str; p < e; p++)
6639 if (*p == '\t') {
6640 if (tabsize > 0) {
6641 i = tabsize - (j % tabsize);
6642 j += i;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006643 while (i--) {
6644 if (q >= qe)
6645 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006646 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006647 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006648 }
6649 }
6650 else {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006651 if (q >= qe)
6652 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006653 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006654 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006655 if (*p == '\n' || *p == '\r')
6656 j = 0;
6657 }
6658
6659 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006660
6661 overflow2:
6662 Py_DECREF(u);
6663 overflow1:
6664 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6665 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006666}
6667
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006668PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006669"S.find(sub [,start [,end]]) -> int\n\
6670\n\
6671Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00006672such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006673arguments start and end are interpreted as in slice notation.\n\
6674\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006675Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006676
6677static PyObject *
6678unicode_find(PyUnicodeObject *self, PyObject *args)
6679{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006680 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00006681 Py_ssize_t start;
6682 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006683 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006684
Christian Heimes9cd17752007-11-18 19:35:23 +00006685 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006686 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006687
Thomas Wouters477c8d52006-05-27 19:21:47 +00006688 result = stringlib_find_slice(
6689 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6690 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6691 start, end
6692 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006693
6694 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006695
Christian Heimes217cfd12007-12-02 14:31:20 +00006696 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006697}
6698
6699static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006700unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006701{
6702 if (index < 0 || index >= self->length) {
6703 PyErr_SetString(PyExc_IndexError, "string index out of range");
6704 return NULL;
6705 }
6706
6707 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6708}
6709
Guido van Rossumc2504932007-09-18 19:42:40 +00006710/* Believe it or not, this produces the same value for ASCII strings
6711 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006712static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00006713unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006714{
Guido van Rossumc2504932007-09-18 19:42:40 +00006715 Py_ssize_t len;
6716 Py_UNICODE *p;
6717 long x;
6718
6719 if (self->hash != -1)
6720 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00006721 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00006722 p = self->str;
6723 x = *p << 7;
6724 while (--len >= 0)
6725 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00006726 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00006727 if (x == -1)
6728 x = -2;
6729 self->hash = x;
6730 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006731}
6732
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006733PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006734"S.index(sub [,start [,end]]) -> int\n\
6735\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006736Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006737
6738static PyObject *
6739unicode_index(PyUnicodeObject *self, PyObject *args)
6740{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006741 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006742 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00006743 Py_ssize_t start;
6744 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006745
Christian Heimes9cd17752007-11-18 19:35:23 +00006746 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006747 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006748
Thomas Wouters477c8d52006-05-27 19:21:47 +00006749 result = stringlib_find_slice(
6750 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6751 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6752 start, end
6753 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006754
6755 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006756
Guido van Rossumd57fd912000-03-10 22:53:23 +00006757 if (result < 0) {
6758 PyErr_SetString(PyExc_ValueError, "substring not found");
6759 return NULL;
6760 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006761
Christian Heimes217cfd12007-12-02 14:31:20 +00006762 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006763}
6764
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006765PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006766"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006767\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006768Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006769at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006770
6771static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006772unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006773{
6774 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6775 register const Py_UNICODE *e;
6776 int cased;
6777
Guido van Rossumd57fd912000-03-10 22:53:23 +00006778 /* Shortcut for single character strings */
6779 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006780 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006781
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006782 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006783 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006784 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006785
Guido van Rossumd57fd912000-03-10 22:53:23 +00006786 e = p + PyUnicode_GET_SIZE(self);
6787 cased = 0;
6788 for (; p < e; p++) {
6789 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006790
Guido van Rossumd57fd912000-03-10 22:53:23 +00006791 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006792 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006793 else if (!cased && Py_UNICODE_ISLOWER(ch))
6794 cased = 1;
6795 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006796 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006797}
6798
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006799PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006800"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006801\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006802Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006803at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006804
6805static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006806unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006807{
6808 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6809 register const Py_UNICODE *e;
6810 int cased;
6811
Guido van Rossumd57fd912000-03-10 22:53:23 +00006812 /* Shortcut for single character strings */
6813 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006814 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006815
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006816 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006817 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006818 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006819
Guido van Rossumd57fd912000-03-10 22:53:23 +00006820 e = p + PyUnicode_GET_SIZE(self);
6821 cased = 0;
6822 for (; p < e; p++) {
6823 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006824
Guido van Rossumd57fd912000-03-10 22:53:23 +00006825 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006826 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006827 else if (!cased && Py_UNICODE_ISUPPER(ch))
6828 cased = 1;
6829 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006830 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006831}
6832
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006833PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006834"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006835\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006836Return True if S is a titlecased string and there is at least one\n\
6837character in S, i.e. upper- and titlecase characters may only\n\
6838follow uncased characters and lowercase characters only cased ones.\n\
6839Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006840
6841static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006842unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006843{
6844 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6845 register const Py_UNICODE *e;
6846 int cased, previous_is_cased;
6847
Guido van Rossumd57fd912000-03-10 22:53:23 +00006848 /* Shortcut for single character strings */
6849 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006850 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6851 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006852
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006853 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006854 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006855 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006856
Guido van Rossumd57fd912000-03-10 22:53:23 +00006857 e = p + PyUnicode_GET_SIZE(self);
6858 cased = 0;
6859 previous_is_cased = 0;
6860 for (; p < e; p++) {
6861 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006862
Guido van Rossumd57fd912000-03-10 22:53:23 +00006863 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6864 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006865 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006866 previous_is_cased = 1;
6867 cased = 1;
6868 }
6869 else if (Py_UNICODE_ISLOWER(ch)) {
6870 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006871 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006872 previous_is_cased = 1;
6873 cased = 1;
6874 }
6875 else
6876 previous_is_cased = 0;
6877 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006878 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006879}
6880
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006881PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006882"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006883\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006884Return True if all characters in S are whitespace\n\
6885and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006886
6887static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006888unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006889{
6890 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6891 register const Py_UNICODE *e;
6892
Guido van Rossumd57fd912000-03-10 22:53:23 +00006893 /* Shortcut for single character strings */
6894 if (PyUnicode_GET_SIZE(self) == 1 &&
6895 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006896 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006897
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006898 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006899 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006900 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006901
Guido van Rossumd57fd912000-03-10 22:53:23 +00006902 e = p + PyUnicode_GET_SIZE(self);
6903 for (; p < e; p++) {
6904 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006905 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006906 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006907 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006908}
6909
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006910PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006911"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006912\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006913Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006914and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006915
6916static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006917unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006918{
6919 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6920 register const Py_UNICODE *e;
6921
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006922 /* Shortcut for single character strings */
6923 if (PyUnicode_GET_SIZE(self) == 1 &&
6924 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006925 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006926
6927 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006928 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006929 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006930
6931 e = p + PyUnicode_GET_SIZE(self);
6932 for (; p < e; p++) {
6933 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006934 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006935 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006936 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006937}
6938
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006939PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006940"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006941\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006942Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006943and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006944
6945static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006946unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006947{
6948 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6949 register const Py_UNICODE *e;
6950
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006951 /* Shortcut for single character strings */
6952 if (PyUnicode_GET_SIZE(self) == 1 &&
6953 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006954 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006955
6956 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006957 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006958 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006959
6960 e = p + PyUnicode_GET_SIZE(self);
6961 for (; p < e; p++) {
6962 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006963 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006964 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006965 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006966}
6967
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006968PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006969"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006970\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006971Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006972False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006973
6974static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006975unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006976{
6977 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6978 register const Py_UNICODE *e;
6979
Guido van Rossumd57fd912000-03-10 22:53:23 +00006980 /* Shortcut for single character strings */
6981 if (PyUnicode_GET_SIZE(self) == 1 &&
6982 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006983 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006984
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006985 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006986 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006987 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006988
Guido van Rossumd57fd912000-03-10 22:53:23 +00006989 e = p + PyUnicode_GET_SIZE(self);
6990 for (; p < e; p++) {
6991 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006992 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006993 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006994 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006995}
6996
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006997PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006998"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006999\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007000Return True if all characters in S are digits\n\
7001and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007002
7003static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007004unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007005{
7006 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7007 register const Py_UNICODE *e;
7008
Guido van Rossumd57fd912000-03-10 22:53:23 +00007009 /* Shortcut for single character strings */
7010 if (PyUnicode_GET_SIZE(self) == 1 &&
7011 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007012 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007013
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007014 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007015 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007016 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007017
Guido van Rossumd57fd912000-03-10 22:53:23 +00007018 e = p + PyUnicode_GET_SIZE(self);
7019 for (; p < e; p++) {
7020 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007021 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007022 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007023 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007024}
7025
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007026PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007027"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007028\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007029Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007030False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007031
7032static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007033unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007034{
7035 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7036 register const Py_UNICODE *e;
7037
Guido van Rossumd57fd912000-03-10 22:53:23 +00007038 /* Shortcut for single character strings */
7039 if (PyUnicode_GET_SIZE(self) == 1 &&
7040 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007041 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007042
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007043 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007044 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007045 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007046
Guido van Rossumd57fd912000-03-10 22:53:23 +00007047 e = p + PyUnicode_GET_SIZE(self);
7048 for (; p < e; p++) {
7049 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007050 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007051 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007052 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007053}
7054
Martin v. Löwis47383402007-08-15 07:32:56 +00007055int
7056PyUnicode_IsIdentifier(PyObject *self)
7057{
7058 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7059 register const Py_UNICODE *e;
7060
7061 /* Special case for empty strings */
7062 if (PyUnicode_GET_SIZE(self) == 0)
7063 return 0;
7064
7065 /* PEP 3131 says that the first character must be in
7066 XID_Start and subsequent characters in XID_Continue,
7067 and for the ASCII range, the 2.x rules apply (i.e
7068 start with letters and underscore, continue with
7069 letters, digits, underscore). However, given the current
7070 definition of XID_Start and XID_Continue, it is sufficient
7071 to check just for these, except that _ must be allowed
7072 as starting an identifier. */
7073 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7074 return 0;
7075
7076 e = p + PyUnicode_GET_SIZE(self);
7077 for (p++; p < e; p++) {
7078 if (!_PyUnicode_IsXidContinue(*p))
7079 return 0;
7080 }
7081 return 1;
7082}
7083
7084PyDoc_STRVAR(isidentifier__doc__,
7085"S.isidentifier() -> bool\n\
7086\n\
7087Return True if S is a valid identifier according\n\
7088to the language definition.");
7089
7090static PyObject*
7091unicode_isidentifier(PyObject *self)
7092{
7093 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7094}
7095
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007096PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007097"S.join(sequence) -> unicode\n\
7098\n\
7099Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007100sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007101
7102static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007103unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007104{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007105 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007106}
7107
Martin v. Löwis18e16552006-02-15 17:27:45 +00007108static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007109unicode_length(PyUnicodeObject *self)
7110{
7111 return self->length;
7112}
7113
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007114PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00007115"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007116\n\
7117Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007118done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007119
7120static PyObject *
7121unicode_ljust(PyUnicodeObject *self, PyObject *args)
7122{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007123 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007124 Py_UNICODE fillchar = ' ';
7125
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007126 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007127 return NULL;
7128
Tim Peters7a29bd52001-09-12 03:03:31 +00007129 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007130 Py_INCREF(self);
7131 return (PyObject*) self;
7132 }
7133
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007134 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007135}
7136
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007137PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007138"S.lower() -> unicode\n\
7139\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007140Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007141
7142static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007143unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007144{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007145 return fixup(self, fixlower);
7146}
7147
/* Selectors for the strip family; they double as indices into
   stripformat[] below. */
#define LEFTSTRIP   0
#define RIGHTSTRIP  1
#define BOTHSTRIP   2

/* PyArg_ParseTuple() format strings, indexed by the selectors above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for selector i: the format string minus its leading "|O:" */
#define STRIPNAME(i) (stripformat[i]+3)
7156
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007157/* externally visible for str.strip(unicode) */
7158PyObject *
7159_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7160{
7161 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007162 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007163 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007164 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7165 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007166
Thomas Wouters477c8d52006-05-27 19:21:47 +00007167 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7168
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007169 i = 0;
7170 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007171 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7172 i++;
7173 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007174 }
7175
7176 j = len;
7177 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007178 do {
7179 j--;
7180 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7181 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007182 }
7183
7184 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007185 Py_INCREF(self);
7186 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007187 }
7188 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00007189 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007190}
7191
Guido van Rossumd57fd912000-03-10 22:53:23 +00007192
7193static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007194do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007195{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007196 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007197 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007198
7199 i = 0;
7200 if (striptype != RIGHTSTRIP) {
7201 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7202 i++;
7203 }
7204 }
7205
7206 j = len;
7207 if (striptype != LEFTSTRIP) {
7208 do {
7209 j--;
7210 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7211 j++;
7212 }
7213
7214 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7215 Py_INCREF(self);
7216 return (PyObject*)self;
7217 }
7218 else
7219 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007220}
7221
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007222
7223static PyObject *
7224do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7225{
7226 PyObject *sep = NULL;
7227
7228 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7229 return NULL;
7230
7231 if (sep != NULL && sep != Py_None) {
7232 if (PyUnicode_Check(sep))
7233 return _PyUnicode_XStrip(self, striptype, sep);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007234 else {
7235 PyErr_Format(PyExc_TypeError,
7236 "%s arg must be None, unicode or str",
7237 STRIPNAME(striptype));
7238 return NULL;
7239 }
7240 }
7241
7242 return do_strip(self, striptype);
7243}
7244
7245
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007246PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007247"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007248\n\
7249Return a copy of the string S with leading and trailing\n\
7250whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007251If chars is given and not None, remove characters in chars instead.\n\
7252If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007253
7254static PyObject *
7255unicode_strip(PyUnicodeObject *self, PyObject *args)
7256{
7257 if (PyTuple_GET_SIZE(args) == 0)
7258 return do_strip(self, BOTHSTRIP); /* Common case */
7259 else
7260 return do_argstrip(self, BOTHSTRIP, args);
7261}
7262
7263
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007264PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007265"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007266\n\
7267Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007268If chars is given and not None, remove characters in chars instead.\n\
7269If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007270
7271static PyObject *
7272unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7273{
7274 if (PyTuple_GET_SIZE(args) == 0)
7275 return do_strip(self, LEFTSTRIP); /* Common case */
7276 else
7277 return do_argstrip(self, LEFTSTRIP, args);
7278}
7279
7280
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007281PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007282"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007283\n\
7284Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007285If chars is given and not None, remove characters in chars instead.\n\
7286If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007287
7288static PyObject *
7289unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7290{
7291 if (PyTuple_GET_SIZE(args) == 0)
7292 return do_strip(self, RIGHTSTRIP); /* Common case */
7293 else
7294 return do_argstrip(self, RIGHTSTRIP, args);
7295}
7296
7297
Guido van Rossumd57fd912000-03-10 22:53:23 +00007298static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007299unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007300{
7301 PyUnicodeObject *u;
7302 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007303 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007304 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007305
7306 if (len < 0)
7307 len = 0;
7308
Tim Peters7a29bd52001-09-12 03:03:31 +00007309 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007310 /* no repeat, return original string */
7311 Py_INCREF(str);
7312 return (PyObject*) str;
7313 }
Tim Peters8f422462000-09-09 06:13:41 +00007314
7315 /* ensure # of chars needed doesn't overflow int and # of bytes
7316 * needed doesn't overflow size_t
7317 */
7318 nchars = len * str->length;
7319 if (len && nchars / len != str->length) {
7320 PyErr_SetString(PyExc_OverflowError,
7321 "repeated string is too long");
7322 return NULL;
7323 }
7324 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7325 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7326 PyErr_SetString(PyExc_OverflowError,
7327 "repeated string is too long");
7328 return NULL;
7329 }
7330 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007331 if (!u)
7332 return NULL;
7333
7334 p = u->str;
7335
Thomas Wouters477c8d52006-05-27 19:21:47 +00007336 if (str->length == 1 && len > 0) {
7337 Py_UNICODE_FILL(p, str->str[0], len);
7338 } else {
7339 Py_ssize_t done = 0; /* number of characters copied this far */
7340 if (done < nchars) {
7341 Py_UNICODE_COPY(p, str->str, str->length);
7342 done = str->length;
7343 }
7344 while (done < nchars) {
7345 int n = (done <= nchars-done) ? done : nchars-done;
7346 Py_UNICODE_COPY(p+done, p, n);
7347 done += n;
7348 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007349 }
7350
7351 return (PyObject*) u;
7352}
7353
7354PyObject *PyUnicode_Replace(PyObject *obj,
7355 PyObject *subobj,
7356 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007357 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007358{
7359 PyObject *self;
7360 PyObject *str1;
7361 PyObject *str2;
7362 PyObject *result;
7363
7364 self = PyUnicode_FromObject(obj);
7365 if (self == NULL)
7366 return NULL;
7367 str1 = PyUnicode_FromObject(subobj);
7368 if (str1 == NULL) {
7369 Py_DECREF(self);
7370 return NULL;
7371 }
7372 str2 = PyUnicode_FromObject(replobj);
7373 if (str2 == NULL) {
7374 Py_DECREF(self);
7375 Py_DECREF(str1);
7376 return NULL;
7377 }
Tim Petersced69f82003-09-16 20:30:58 +00007378 result = replace((PyUnicodeObject *)self,
7379 (PyUnicodeObject *)str1,
7380 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007381 maxcount);
7382 Py_DECREF(self);
7383 Py_DECREF(str1);
7384 Py_DECREF(str2);
7385 return result;
7386}
7387
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007388PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007389"S.replace (old, new[, maxsplit]) -> unicode\n\
7390\n\
7391Return a copy of S with all occurrences of substring\n\
7392old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007393given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007394
7395static PyObject*
7396unicode_replace(PyUnicodeObject *self, PyObject *args)
7397{
7398 PyUnicodeObject *str1;
7399 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007400 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007401 PyObject *result;
7402
Martin v. Löwis18e16552006-02-15 17:27:45 +00007403 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007404 return NULL;
7405 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7406 if (str1 == NULL)
7407 return NULL;
7408 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007409 if (str2 == NULL) {
7410 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007411 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007412 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007413
7414 result = replace(self, str1, str2, maxcount);
7415
7416 Py_DECREF(str1);
7417 Py_DECREF(str2);
7418 return result;
7419}
7420
7421static
7422PyObject *unicode_repr(PyObject *unicode)
7423{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007424 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007425 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007426 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7427 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7428
7429 /* XXX(nnorwitz): rather than over-allocating, it would be
7430 better to choose a different scheme. Perhaps scan the
7431 first N-chars of the string and allocate based on that size.
7432 */
7433 /* Initial allocation is based on the longest-possible unichr
7434 escape.
7435
7436 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7437 unichr, so in this case it's the longest unichr escape. In
7438 narrow (UTF-16) builds this is five chars per source unichr
7439 since there are two unichrs in the surrogate pair, so in narrow
7440 (UTF-16) builds it's not the longest unichr escape.
7441
7442 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7443 so in the narrow (UTF-16) build case it's the longest unichr
7444 escape.
7445 */
7446
Walter Dörwald1ab83302007-05-18 17:15:44 +00007447 repr = PyUnicode_FromUnicode(NULL,
Walter Dörwald79e913e2007-05-12 11:08:06 +00007448 2 /* quotes */
7449#ifdef Py_UNICODE_WIDE
7450 + 10*size
7451#else
7452 + 6*size
7453#endif
7454 + 1);
7455 if (repr == NULL)
7456 return NULL;
7457
Walter Dörwald1ab83302007-05-18 17:15:44 +00007458 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007459
7460 /* Add quote */
7461 *p++ = (findchar(s, size, '\'') &&
7462 !findchar(s, size, '"')) ? '"' : '\'';
7463 while (size-- > 0) {
7464 Py_UNICODE ch = *s++;
7465
7466 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007467 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007468 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007469 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007470 continue;
7471 }
7472
7473#ifdef Py_UNICODE_WIDE
7474 /* Map 21-bit characters to '\U00xxxxxx' */
7475 else if (ch >= 0x10000) {
7476 *p++ = '\\';
7477 *p++ = 'U';
7478 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
7479 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
7480 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
7481 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
7482 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
7483 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
7484 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
7485 *p++ = hexdigits[ch & 0x0000000F];
7486 continue;
7487 }
7488#else
7489 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
7490 else if (ch >= 0xD800 && ch < 0xDC00) {
7491 Py_UNICODE ch2;
7492 Py_UCS4 ucs;
7493
7494 ch2 = *s++;
7495 size--;
7496 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
7497 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
7498 *p++ = '\\';
7499 *p++ = 'U';
7500 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7501 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7502 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7503 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7504 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7505 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7506 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7507 *p++ = hexdigits[ucs & 0x0000000F];
7508 continue;
7509 }
7510 /* Fall through: isolated surrogates are copied as-is */
7511 s--;
7512 size++;
7513 }
7514#endif
7515
7516 /* Map 16-bit characters to '\uxxxx' */
7517 if (ch >= 256) {
7518 *p++ = '\\';
7519 *p++ = 'u';
7520 *p++ = hexdigits[(ch >> 12) & 0x000F];
7521 *p++ = hexdigits[(ch >> 8) & 0x000F];
7522 *p++ = hexdigits[(ch >> 4) & 0x000F];
7523 *p++ = hexdigits[ch & 0x000F];
7524 }
7525
7526 /* Map special whitespace to '\t', \n', '\r' */
7527 else if (ch == '\t') {
7528 *p++ = '\\';
7529 *p++ = 't';
7530 }
7531 else if (ch == '\n') {
7532 *p++ = '\\';
7533 *p++ = 'n';
7534 }
7535 else if (ch == '\r') {
7536 *p++ = '\\';
7537 *p++ = 'r';
7538 }
7539
7540 /* Map non-printable US ASCII to '\xhh' */
7541 else if (ch < ' ' || ch >= 0x7F) {
7542 *p++ = '\\';
7543 *p++ = 'x';
7544 *p++ = hexdigits[(ch >> 4) & 0x000F];
7545 *p++ = hexdigits[ch & 0x000F];
7546 }
7547
7548 /* Copy everything else as-is */
7549 else
7550 *p++ = (char) ch;
7551 }
7552 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007553 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00007554
7555 *p = '\0';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007556 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00007557 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007558}
7559
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007560PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007561"S.rfind(sub [,start [,end]]) -> int\n\
7562\n\
7563Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007564such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007565arguments start and end are interpreted as in slice notation.\n\
7566\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007567Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007568
7569static PyObject *
7570unicode_rfind(PyUnicodeObject *self, PyObject *args)
7571{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007572 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007573 Py_ssize_t start;
7574 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007575 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007576
Christian Heimes9cd17752007-11-18 19:35:23 +00007577 if (!_ParseTupleFinds(args, &substring, &start, &end))
7578 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007579
Thomas Wouters477c8d52006-05-27 19:21:47 +00007580 result = stringlib_rfind_slice(
7581 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7582 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7583 start, end
7584 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007585
7586 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007587
Christian Heimes217cfd12007-12-02 14:31:20 +00007588 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007589}
7590
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007591PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007592"S.rindex(sub [,start [,end]]) -> int\n\
7593\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007594Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007595
7596static PyObject *
7597unicode_rindex(PyUnicodeObject *self, PyObject *args)
7598{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007599 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007600 Py_ssize_t start;
7601 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007602 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007603
Christian Heimes9cd17752007-11-18 19:35:23 +00007604 if (!_ParseTupleFinds(args, &substring, &start, &end))
7605 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007606
Thomas Wouters477c8d52006-05-27 19:21:47 +00007607 result = stringlib_rfind_slice(
7608 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7609 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7610 start, end
7611 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007612
7613 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007614
Guido van Rossumd57fd912000-03-10 22:53:23 +00007615 if (result < 0) {
7616 PyErr_SetString(PyExc_ValueError, "substring not found");
7617 return NULL;
7618 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007619 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007620}
7621
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007622PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007623"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007624\n\
7625Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007626done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007627
7628static PyObject *
7629unicode_rjust(PyUnicodeObject *self, PyObject *args)
7630{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007631 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007632 Py_UNICODE fillchar = ' ';
7633
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007634 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007635 return NULL;
7636
Tim Peters7a29bd52001-09-12 03:03:31 +00007637 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007638 Py_INCREF(self);
7639 return (PyObject*) self;
7640 }
7641
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007642 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007643}
7644
Guido van Rossumd57fd912000-03-10 22:53:23 +00007645PyObject *PyUnicode_Split(PyObject *s,
7646 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007647 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007648{
7649 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007650
Guido van Rossumd57fd912000-03-10 22:53:23 +00007651 s = PyUnicode_FromObject(s);
7652 if (s == NULL)
7653 return NULL;
7654 if (sep != NULL) {
7655 sep = PyUnicode_FromObject(sep);
7656 if (sep == NULL) {
7657 Py_DECREF(s);
7658 return NULL;
7659 }
7660 }
7661
7662 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7663
7664 Py_DECREF(s);
7665 Py_XDECREF(sep);
7666 return result;
7667}
7668
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007669PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007670"S.split([sep [,maxsplit]]) -> list of strings\n\
7671\n\
7672Return a list of the words in S, using sep as the\n\
7673delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00007674splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00007675any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007676
7677static PyObject*
7678unicode_split(PyUnicodeObject *self, PyObject *args)
7679{
7680 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007681 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007682
Martin v. Löwis18e16552006-02-15 17:27:45 +00007683 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007684 return NULL;
7685
7686 if (substring == Py_None)
7687 return split(self, NULL, maxcount);
7688 else if (PyUnicode_Check(substring))
7689 return split(self, (PyUnicodeObject *)substring, maxcount);
7690 else
7691 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7692}
7693
Thomas Wouters477c8d52006-05-27 19:21:47 +00007694PyObject *
7695PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7696{
7697 PyObject* str_obj;
7698 PyObject* sep_obj;
7699 PyObject* out;
7700
7701 str_obj = PyUnicode_FromObject(str_in);
7702 if (!str_obj)
7703 return NULL;
7704 sep_obj = PyUnicode_FromObject(sep_in);
7705 if (!sep_obj) {
7706 Py_DECREF(str_obj);
7707 return NULL;
7708 }
7709
7710 out = stringlib_partition(
7711 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7712 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7713 );
7714
7715 Py_DECREF(sep_obj);
7716 Py_DECREF(str_obj);
7717
7718 return out;
7719}
7720
7721
7722PyObject *
7723PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7724{
7725 PyObject* str_obj;
7726 PyObject* sep_obj;
7727 PyObject* out;
7728
7729 str_obj = PyUnicode_FromObject(str_in);
7730 if (!str_obj)
7731 return NULL;
7732 sep_obj = PyUnicode_FromObject(sep_in);
7733 if (!sep_obj) {
7734 Py_DECREF(str_obj);
7735 return NULL;
7736 }
7737
7738 out = stringlib_rpartition(
7739 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7740 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7741 );
7742
7743 Py_DECREF(sep_obj);
7744 Py_DECREF(str_obj);
7745
7746 return out;
7747}
7748
7749PyDoc_STRVAR(partition__doc__,
7750"S.partition(sep) -> (head, sep, tail)\n\
7751\n\
7752Searches for the separator sep in S, and returns the part before it,\n\
7753the separator itself, and the part after it. If the separator is not\n\
7754found, returns S and two empty strings.");
7755
7756static PyObject*
7757unicode_partition(PyUnicodeObject *self, PyObject *separator)
7758{
7759 return PyUnicode_Partition((PyObject *)self, separator);
7760}
7761
7762PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007763"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007764\n\
7765Searches for the separator sep in S, starting at the end of S, and returns\n\
7766the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00007767separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007768
7769static PyObject*
7770unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7771{
7772 return PyUnicode_RPartition((PyObject *)self, separator);
7773}
7774
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007775PyObject *PyUnicode_RSplit(PyObject *s,
7776 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007777 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007778{
7779 PyObject *result;
7780
7781 s = PyUnicode_FromObject(s);
7782 if (s == NULL)
7783 return NULL;
7784 if (sep != NULL) {
7785 sep = PyUnicode_FromObject(sep);
7786 if (sep == NULL) {
7787 Py_DECREF(s);
7788 return NULL;
7789 }
7790 }
7791
7792 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7793
7794 Py_DECREF(s);
7795 Py_XDECREF(sep);
7796 return result;
7797}
7798
7799PyDoc_STRVAR(rsplit__doc__,
7800"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7801\n\
7802Return a list of the words in S, using sep as the\n\
7803delimiter string, starting at the end of the string and\n\
7804working to the front. If maxsplit is given, at most maxsplit\n\
7805splits are done. If sep is not specified, any whitespace string\n\
7806is a separator.");
7807
7808static PyObject*
7809unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7810{
7811 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007812 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007813
Martin v. Löwis18e16552006-02-15 17:27:45 +00007814 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007815 return NULL;
7816
7817 if (substring == Py_None)
7818 return rsplit(self, NULL, maxcount);
7819 else if (PyUnicode_Check(substring))
7820 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7821 else
7822 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7823}
7824
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007825PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007826"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007827\n\
7828Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007829Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007830is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007831
7832static PyObject*
7833unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7834{
Guido van Rossum86662912000-04-11 15:38:46 +00007835 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007836
Guido van Rossum86662912000-04-11 15:38:46 +00007837 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007838 return NULL;
7839
Guido van Rossum86662912000-04-11 15:38:46 +00007840 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007841}
7842
7843static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007844PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007845{
Walter Dörwald346737f2007-05-31 10:44:43 +00007846 if (PyUnicode_CheckExact(self)) {
7847 Py_INCREF(self);
7848 return self;
7849 } else
7850 /* Subtype -- return genuine unicode string with the same value. */
7851 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
7852 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007853}
7854
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007855PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007856"S.swapcase() -> unicode\n\
7857\n\
7858Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007859and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007860
7861static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007862unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007863{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007864 return fixup(self, fixswapcase);
7865}
7866
Georg Brandlceee0772007-11-27 23:48:05 +00007867PyDoc_STRVAR(maketrans__doc__,
7868"str.maketrans(x[, y[, z]]) -> dict (static method)\n\
7869\n\
7870Return a translation table usable for str.translate().\n\
7871If there is only one argument, it must be a dictionary mapping Unicode\n\
7872ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
7873Character keys will then be converted to ordinals.\n\
7874If there are two arguments, they must be strings of equal length, and\n\
7875in the resulting dictionary, each character in x will be mapped to the\n\
7876character at the same position in y. If there is a third argument, it\n\
7877must be a string, whose characters will be mapped to None in the result.");
7878
7879static PyObject*
7880unicode_maketrans(PyUnicodeObject *null, PyObject *args)
7881{
7882 PyObject *x, *y = NULL, *z = NULL;
7883 PyObject *new = NULL, *key, *value;
7884 Py_ssize_t i = 0;
7885 int res;
7886
7887 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
7888 return NULL;
7889 new = PyDict_New();
7890 if (!new)
7891 return NULL;
7892 if (y != NULL) {
7893 /* x must be a string too, of equal length */
7894 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
7895 if (!PyUnicode_Check(x)) {
7896 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
7897 "be a string if there is a second argument");
7898 goto err;
7899 }
7900 if (PyUnicode_GET_SIZE(x) != ylen) {
7901 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
7902 "arguments must have equal length");
7903 goto err;
7904 }
7905 /* create entries for translating chars in x to those in y */
7906 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00007907 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
7908 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00007909 if (!key || !value)
7910 goto err;
7911 res = PyDict_SetItem(new, key, value);
7912 Py_DECREF(key);
7913 Py_DECREF(value);
7914 if (res < 0)
7915 goto err;
7916 }
7917 /* create entries for deleting chars in z */
7918 if (z != NULL) {
7919 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00007920 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00007921 if (!key)
7922 goto err;
7923 res = PyDict_SetItem(new, key, Py_None);
7924 Py_DECREF(key);
7925 if (res < 0)
7926 goto err;
7927 }
7928 }
7929 } else {
7930 /* x must be a dict */
7931 if (!PyDict_Check(x)) {
7932 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
7933 "to maketrans it must be a dict");
7934 goto err;
7935 }
7936 /* copy entries into the new dict, converting string keys to int keys */
7937 while (PyDict_Next(x, &i, &key, &value)) {
7938 if (PyUnicode_Check(key)) {
7939 /* convert string keys to integer keys */
7940 PyObject *newkey;
7941 if (PyUnicode_GET_SIZE(key) != 1) {
7942 PyErr_SetString(PyExc_ValueError, "string keys in translate "
7943 "table must be of length 1");
7944 goto err;
7945 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007946 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00007947 if (!newkey)
7948 goto err;
7949 res = PyDict_SetItem(new, newkey, value);
7950 Py_DECREF(newkey);
7951 if (res < 0)
7952 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00007953 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00007954 /* just keep integer keys */
7955 if (PyDict_SetItem(new, key, value) < 0)
7956 goto err;
7957 } else {
7958 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
7959 "be strings or integers");
7960 goto err;
7961 }
7962 }
7963 }
7964 return new;
7965 err:
7966 Py_DECREF(new);
7967 return NULL;
7968}
7969
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007970PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007971"S.translate(table) -> unicode\n\
7972\n\
7973Return a copy of the string S, where all characters have been mapped\n\
7974through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007975Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7976Unmapped characters are left untouched. Characters mapped to None\n\
7977are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007978
7979static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007980unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007981{
Georg Brandlceee0772007-11-27 23:48:05 +00007982 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007983}
7984
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007985PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007986"S.upper() -> unicode\n\
7987\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007988Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007989
7990static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007991unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007992{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007993 return fixup(self, fixupper);
7994}
7995
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007996PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007997"S.zfill(width) -> unicode\n\
7998\n\
7999Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008000of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008001
8002static PyObject *
8003unicode_zfill(PyUnicodeObject *self, PyObject *args)
8004{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008005 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008006 PyUnicodeObject *u;
8007
Martin v. Löwis18e16552006-02-15 17:27:45 +00008008 Py_ssize_t width;
8009 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008010 return NULL;
8011
8012 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008013 if (PyUnicode_CheckExact(self)) {
8014 Py_INCREF(self);
8015 return (PyObject*) self;
8016 }
8017 else
8018 return PyUnicode_FromUnicode(
8019 PyUnicode_AS_UNICODE(self),
8020 PyUnicode_GET_SIZE(self)
8021 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008022 }
8023
8024 fill = width - self->length;
8025
8026 u = pad(self, fill, 0, '0');
8027
Walter Dörwald068325e2002-04-15 13:36:47 +00008028 if (u == NULL)
8029 return NULL;
8030
Guido van Rossumd57fd912000-03-10 22:53:23 +00008031 if (u->str[fill] == '+' || u->str[fill] == '-') {
8032 /* move sign to beginning of string */
8033 u->str[0] = u->str[fill];
8034 u->str[fill] = '0';
8035 }
8036
8037 return (PyObject*) u;
8038}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008039
#if 0
/* Compiled out.  Reports the current value of `numfree` as a Python int;
   the method table marks this entry as a debugging aid. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyLong_FromLong(count);
}
#endif
8047
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008048PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00008049"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008050\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008051Return True if S starts with the specified prefix, False otherwise.\n\
8052With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008053With optional end, stop comparing S at that position.\n\
8054prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008055
8056static PyObject *
8057unicode_startswith(PyUnicodeObject *self,
8058 PyObject *args)
8059{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008060 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008061 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008062 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008063 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008064 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008065
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008066 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00008067 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008068 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008069 if (PyTuple_Check(subobj)) {
8070 Py_ssize_t i;
8071 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8072 substring = (PyUnicodeObject *)PyUnicode_FromObject(
8073 PyTuple_GET_ITEM(subobj, i));
8074 if (substring == NULL)
8075 return NULL;
8076 result = tailmatch(self, substring, start, end, -1);
8077 Py_DECREF(substring);
8078 if (result) {
8079 Py_RETURN_TRUE;
8080 }
8081 }
8082 /* nothing matched */
8083 Py_RETURN_FALSE;
8084 }
8085 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008086 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008087 return NULL;
8088 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008089 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008090 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008091}
8092
8093
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008094PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00008095"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008096\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008097Return True if S ends with the specified suffix, False otherwise.\n\
8098With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008099With optional end, stop comparing S at that position.\n\
8100suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008101
8102static PyObject *
8103unicode_endswith(PyUnicodeObject *self,
8104 PyObject *args)
8105{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008106 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008107 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008108 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008109 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008110 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008111
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008112 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
8113 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008114 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008115 if (PyTuple_Check(subobj)) {
8116 Py_ssize_t i;
8117 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8118 substring = (PyUnicodeObject *)PyUnicode_FromObject(
8119 PyTuple_GET_ITEM(subobj, i));
8120 if (substring == NULL)
8121 return NULL;
8122 result = tailmatch(self, substring, start, end, +1);
8123 Py_DECREF(substring);
8124 if (result) {
8125 Py_RETURN_TRUE;
8126 }
8127 }
8128 Py_RETURN_FALSE;
8129 }
8130 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008131 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008132 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008133
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008134 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008135 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008136 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008137}
8138
Eric Smith8c663262007-08-25 02:26:07 +00008139#include "stringlib/string_format.h"
8140
8141PyDoc_STRVAR(format__doc__,
8142"S.format(*args, **kwargs) -> unicode\n\
8143\n\
8144");
8145
Eric Smith8c663262007-08-25 02:26:07 +00008146PyDoc_STRVAR(p_format__doc__,
8147"S.__format__(format_spec) -> unicode\n\
8148\n\
8149");
8150
8151static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008152unicode_getnewargs(PyUnicodeObject *v)
8153{
8154 return Py_BuildValue("(u#)", v->str, v->length);
8155}
8156
8157
Guido van Rossumd57fd912000-03-10 22:53:23 +00008158static PyMethodDef unicode_methods[] = {
8159
8160 /* Order is according to common usage: often used methods should
8161 appear first, since lookup is done sequentially. */
8162
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008163 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
8164 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8165 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008166 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008167 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8168 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8169 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8170 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8171 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8172 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8173 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008174 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008175 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8176 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8177 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008178 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008179 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8180 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8181 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008182 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008183 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008184 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008185 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008186 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8187 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8188 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8189 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8190 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8191 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8192 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8193 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8194 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8195 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8196 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8197 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8198 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8199 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008200 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008201 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008202 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
8203 {"__format__", (PyCFunction) unicode_unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008204 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8205 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Georg Brandlceee0772007-11-27 23:48:05 +00008206 {"maketrans", (PyCFunction) unicode_maketrans,
8207 METH_VARARGS | METH_STATIC, maketrans__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008208#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008209 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008210#endif
8211
8212#if 0
8213 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008214 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008215#endif
8216
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008217 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008218 {NULL, NULL}
8219};
8220
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008221static PyObject *
8222unicode_mod(PyObject *v, PyObject *w)
8223{
8224 if (!PyUnicode_Check(v)) {
8225 Py_INCREF(Py_NotImplemented);
8226 return Py_NotImplemented;
8227 }
8228 return PyUnicode_Format(v, w);
8229}
8230
8231static PyNumberMethods unicode_as_number = {
8232 0, /*nb_add*/
8233 0, /*nb_subtract*/
8234 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008235 unicode_mod, /*nb_remainder*/
8236};
8237
Guido van Rossumd57fd912000-03-10 22:53:23 +00008238static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008239 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008240 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008241 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8242 (ssizeargfunc) unicode_getitem, /* sq_item */
Thomas Woutersd2cf20e2007-08-30 22:57:53 +00008243 0, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008244 0, /* sq_ass_item */
8245 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008246 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008247};
8248
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008249static PyObject*
8250unicode_subscript(PyUnicodeObject* self, PyObject* item)
8251{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008252 if (PyIndex_Check(item)) {
8253 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008254 if (i == -1 && PyErr_Occurred())
8255 return NULL;
8256 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008257 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008258 return unicode_getitem(self, i);
8259 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008260 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008261 Py_UNICODE* source_buf;
8262 Py_UNICODE* result_buf;
8263 PyObject* result;
8264
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008265 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008266 &start, &stop, &step, &slicelength) < 0) {
8267 return NULL;
8268 }
8269
8270 if (slicelength <= 0) {
8271 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008272 } else if (start == 0 && step == 1 && slicelength == self->length &&
8273 PyUnicode_CheckExact(self)) {
8274 Py_INCREF(self);
8275 return (PyObject *)self;
8276 } else if (step == 1) {
8277 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008278 } else {
8279 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008280 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
8281 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008282
8283 if (result_buf == NULL)
8284 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008285
8286 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8287 result_buf[i] = source_buf[cur];
8288 }
Tim Petersced69f82003-09-16 20:30:58 +00008289
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008290 result = PyUnicode_FromUnicode(result_buf, slicelength);
8291 PyMem_FREE(result_buf);
8292 return result;
8293 }
8294 } else {
8295 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8296 return NULL;
8297 }
8298}
8299
8300static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008301 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008302 (binaryfunc)unicode_subscript, /* mp_subscript */
8303 (objobjargproc)0, /* mp_ass_subscript */
8304};
8305
Guido van Rossumd57fd912000-03-10 22:53:23 +00008306
Guido van Rossumd57fd912000-03-10 22:53:23 +00008307/* Helpers for PyUnicode_Format() */
8308
8309static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008310getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008311{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008312 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008313 if (argidx < arglen) {
8314 (*p_argidx)++;
8315 if (arglen < 0)
8316 return args;
8317 else
8318 return PyTuple_GetItem(args, argidx);
8319 }
8320 PyErr_SetString(PyExc_TypeError,
8321 "not enough arguments for format string");
8322 return NULL;
8323}
8324
Martin v. Löwis18e16552006-02-15 17:27:45 +00008325static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008326strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008327{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008328 register Py_ssize_t i;
8329 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008330 for (i = len - 1; i >= 0; i--)
8331 buffer[i] = (Py_UNICODE) charbuffer[i];
8332
Guido van Rossumd57fd912000-03-10 22:53:23 +00008333 return len;
8334}
8335
Neal Norwitzfc76d632006-01-10 06:03:13 +00008336static int
8337doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8338{
Tim Peters15231542006-02-16 01:08:01 +00008339 Py_ssize_t result;
8340
Neal Norwitzfc76d632006-01-10 06:03:13 +00008341 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008342 result = strtounicode(buffer, (char *)buffer);
8343 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008344}
8345
8346static int
8347longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8348{
Tim Peters15231542006-02-16 01:08:01 +00008349 Py_ssize_t result;
8350
Neal Norwitzfc76d632006-01-10 06:03:13 +00008351 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008352 result = strtounicode(buffer, (char *)buffer);
8353 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008354}
8355
Guido van Rossum078151d2002-08-11 04:24:12 +00008356/* XXX To save some code duplication, formatfloat/long/int could have been
8357 shared with stringobject.c, converting from 8-bit to Unicode after the
8358 formatting is done. */
8359
Guido van Rossumd57fd912000-03-10 22:53:23 +00008360static int
8361formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008362 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008363 int flags,
8364 int prec,
8365 int type,
8366 PyObject *v)
8367{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008368 /* fmt = '%#.' + `prec` + `type`
8369 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00008370 char fmt[20];
8371 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008372
Guido van Rossumd57fd912000-03-10 22:53:23 +00008373 x = PyFloat_AsDouble(v);
8374 if (x == -1.0 && PyErr_Occurred())
8375 return -1;
8376 if (prec < 0)
8377 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008378 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8379 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008380 /* Worst case length calc to ensure no buffer overrun:
8381
8382 'g' formats:
8383 fmt = %#.<prec>g
8384 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8385 for any double rep.)
8386 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8387
8388 'f' formats:
8389 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8390 len = 1 + 50 + 1 + prec = 52 + prec
8391
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008392 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008393 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008394
8395 */
Guido van Rossumb5a755e2007-07-18 18:15:48 +00008396 if (((type == 'g' || type == 'G') &&
8397 buflen <= (size_t)10 + (size_t)prec) ||
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008398 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008399 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008400 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008401 return -1;
8402 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008403 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8404 (flags&F_ALT) ? "#" : "",
8405 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008406 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008407}
8408
Tim Peters38fd5b62000-09-21 05:43:11 +00008409static PyObject*
8410formatlong(PyObject *val, int flags, int prec, int type)
8411{
8412 char *buf;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008413 int len;
Tim Peters38fd5b62000-09-21 05:43:11 +00008414 PyObject *str; /* temporary string object. */
Walter Dörwald63a28be2007-06-20 15:11:12 +00008415 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008416
8417 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8418 if (!str)
8419 return NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008420 result = PyUnicode_FromStringAndSize(buf, len);
Tim Peters38fd5b62000-09-21 05:43:11 +00008421 Py_DECREF(str);
Walter Dörwald63a28be2007-06-20 15:11:12 +00008422 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008423}
8424
Guido van Rossumd57fd912000-03-10 22:53:23 +00008425static int
8426formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008427 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008428 int flags,
8429 int prec,
8430 int type,
8431 PyObject *v)
8432{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008433 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008434 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8435 * + 1 + 1
8436 * = 24
8437 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008438 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008439 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008440 long x;
8441
Christian Heimes217cfd12007-12-02 14:31:20 +00008442 x = PyLong_AsLong(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008443 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008444 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008445 if (x < 0 && type == 'u') {
8446 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008447 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008448 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8449 sign = "-";
8450 else
8451 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008452 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008453 prec = 1;
8454
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008455 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8456 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008457 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008458 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008459 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008460 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008461 return -1;
8462 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008463
8464 if ((flags & F_ALT) &&
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008465 (type == 'x' || type == 'X' || type == 'o')) {
8466 /* When converting under %#o, %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008467 * of issues that cause pain:
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008468 * - for %#o, we want a different base marker than C
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008469 * - when 0 is being converted, the C standard leaves off
8470 * the '0x' or '0X', which is inconsistent with other
8471 * %#x/%#X conversions and inconsistent with Python's
8472 * hex() function
8473 * - there are platforms that violate the standard and
8474 * convert 0 with the '0x' or '0X'
8475 * (Metrowerks, Compaq Tru64)
8476 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008477 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008478 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008479 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008480 * We can achieve the desired consistency by inserting our
8481 * own '0x' or '0X' prefix, and substituting %x/%X in place
8482 * of %#x/%#X.
8483 *
8484 * Note that this is the same approach as used in
8485 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008486 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008487 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8488 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008489 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008490 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008491 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8492 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008493 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008494 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008495 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008496 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008497 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008498 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008499}
8500
8501static int
8502formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008503 size_t buflen,
8504 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008505{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008506 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008507 if (PyUnicode_Check(v)) {
8508 if (PyUnicode_GET_SIZE(v) != 1)
8509 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008510 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008511 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008512 else {
8513 /* Integer input truncated to a character */
8514 long x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008515 x = PyLong_AsLong(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008516 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008517 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008518#ifdef Py_UNICODE_WIDE
8519 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008520 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008521 "%c arg not in range(0x110000) "
8522 "(wide Python build)");
8523 return -1;
8524 }
8525#else
8526 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008527 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008528 "%c arg not in range(0x10000) "
8529 "(narrow Python build)");
8530 return -1;
8531 }
8532#endif
8533 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008534 }
8535 buf[1] = '\0';
8536 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008537
8538 onError:
8539 PyErr_SetString(PyExc_TypeError,
8540 "%c requires int or char");
8541 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008542}
8543
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so the macro behaves as a single
   operand wherever it is used (e.g. as the operand of sizeof).
*/
#define FORMATBUFLEN ((size_t)120)
8553
Guido van Rossumd57fd912000-03-10 22:53:23 +00008554PyObject *PyUnicode_Format(PyObject *format,
8555 PyObject *args)
8556{
8557 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008558 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008559 int args_owned = 0;
8560 PyUnicodeObject *result = NULL;
8561 PyObject *dict = NULL;
8562 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008563
Guido van Rossumd57fd912000-03-10 22:53:23 +00008564 if (format == NULL || args == NULL) {
8565 PyErr_BadInternalCall();
8566 return NULL;
8567 }
8568 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008569 if (uformat == NULL)
8570 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008571 fmt = PyUnicode_AS_UNICODE(uformat);
8572 fmtcnt = PyUnicode_GET_SIZE(uformat);
8573
8574 reslen = rescnt = fmtcnt + 100;
8575 result = _PyUnicode_New(reslen);
8576 if (result == NULL)
8577 goto onError;
8578 res = PyUnicode_AS_UNICODE(result);
8579
8580 if (PyTuple_Check(args)) {
8581 arglen = PyTuple_Size(args);
8582 argidx = 0;
8583 }
8584 else {
8585 arglen = -1;
8586 argidx = -2;
8587 }
Christian Heimes90aa7642007-12-19 02:45:37 +00008588 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00008589 !PyUnicode_Check(args))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008590 dict = args;
8591
8592 while (--fmtcnt >= 0) {
8593 if (*fmt != '%') {
8594 if (--rescnt < 0) {
8595 rescnt = fmtcnt + 100;
8596 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008597 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008598 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008599 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8600 --rescnt;
8601 }
8602 *res++ = *fmt++;
8603 }
8604 else {
8605 /* Got a format specifier */
8606 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008607 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008608 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008609 Py_UNICODE c = '\0';
8610 Py_UNICODE fill;
Christian Heimesa612dc02008-02-24 13:08:18 +00008611 int isnumok;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008612 PyObject *v = NULL;
8613 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008614 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008615 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008616 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008617 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008618
8619 fmt++;
8620 if (*fmt == '(') {
8621 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008622 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008623 PyObject *key;
8624 int pcount = 1;
8625
8626 if (dict == NULL) {
8627 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008628 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008629 goto onError;
8630 }
8631 ++fmt;
8632 --fmtcnt;
8633 keystart = fmt;
8634 /* Skip over balanced parentheses */
8635 while (pcount > 0 && --fmtcnt >= 0) {
8636 if (*fmt == ')')
8637 --pcount;
8638 else if (*fmt == '(')
8639 ++pcount;
8640 fmt++;
8641 }
8642 keylen = fmt - keystart - 1;
8643 if (fmtcnt < 0 || pcount > 0) {
8644 PyErr_SetString(PyExc_ValueError,
8645 "incomplete format key");
8646 goto onError;
8647 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008648#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008649 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008650 then looked up since Python uses strings to hold
8651 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008652 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008653 key = PyUnicode_EncodeUTF8(keystart,
8654 keylen,
8655 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008656#else
8657 key = PyUnicode_FromUnicode(keystart, keylen);
8658#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008659 if (key == NULL)
8660 goto onError;
8661 if (args_owned) {
8662 Py_DECREF(args);
8663 args_owned = 0;
8664 }
8665 args = PyObject_GetItem(dict, key);
8666 Py_DECREF(key);
8667 if (args == NULL) {
8668 goto onError;
8669 }
8670 args_owned = 1;
8671 arglen = -1;
8672 argidx = -2;
8673 }
8674 while (--fmtcnt >= 0) {
8675 switch (c = *fmt++) {
8676 case '-': flags |= F_LJUST; continue;
8677 case '+': flags |= F_SIGN; continue;
8678 case ' ': flags |= F_BLANK; continue;
8679 case '#': flags |= F_ALT; continue;
8680 case '0': flags |= F_ZERO; continue;
8681 }
8682 break;
8683 }
8684 if (c == '*') {
8685 v = getnextarg(args, arglen, &argidx);
8686 if (v == NULL)
8687 goto onError;
Christian Heimes217cfd12007-12-02 14:31:20 +00008688 if (!PyLong_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008689 PyErr_SetString(PyExc_TypeError,
8690 "* wants int");
8691 goto onError;
8692 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008693 width = PyLong_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008694 if (width == -1 && PyErr_Occurred())
8695 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008696 if (width < 0) {
8697 flags |= F_LJUST;
8698 width = -width;
8699 }
8700 if (--fmtcnt >= 0)
8701 c = *fmt++;
8702 }
8703 else if (c >= '0' && c <= '9') {
8704 width = c - '0';
8705 while (--fmtcnt >= 0) {
8706 c = *fmt++;
8707 if (c < '0' || c > '9')
8708 break;
8709 if ((width*10) / 10 != width) {
8710 PyErr_SetString(PyExc_ValueError,
8711 "width too big");
8712 goto onError;
8713 }
8714 width = width*10 + (c - '0');
8715 }
8716 }
8717 if (c == '.') {
8718 prec = 0;
8719 if (--fmtcnt >= 0)
8720 c = *fmt++;
8721 if (c == '*') {
8722 v = getnextarg(args, arglen, &argidx);
8723 if (v == NULL)
8724 goto onError;
Christian Heimes217cfd12007-12-02 14:31:20 +00008725 if (!PyLong_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008726 PyErr_SetString(PyExc_TypeError,
8727 "* wants int");
8728 goto onError;
8729 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008730 prec = PyLong_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008731 if (prec == -1 && PyErr_Occurred())
8732 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008733 if (prec < 0)
8734 prec = 0;
8735 if (--fmtcnt >= 0)
8736 c = *fmt++;
8737 }
8738 else if (c >= '0' && c <= '9') {
8739 prec = c - '0';
8740 while (--fmtcnt >= 0) {
8741 c = Py_CHARMASK(*fmt++);
8742 if (c < '0' || c > '9')
8743 break;
8744 if ((prec*10) / 10 != prec) {
8745 PyErr_SetString(PyExc_ValueError,
8746 "prec too big");
8747 goto onError;
8748 }
8749 prec = prec*10 + (c - '0');
8750 }
8751 }
8752 } /* prec */
8753 if (fmtcnt >= 0) {
8754 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008755 if (--fmtcnt >= 0)
8756 c = *fmt++;
8757 }
8758 }
8759 if (fmtcnt < 0) {
8760 PyErr_SetString(PyExc_ValueError,
8761 "incomplete format");
8762 goto onError;
8763 }
8764 if (c != '%') {
8765 v = getnextarg(args, arglen, &argidx);
8766 if (v == NULL)
8767 goto onError;
8768 }
8769 sign = 0;
8770 fill = ' ';
8771 switch (c) {
8772
8773 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008774 pbuf = formatbuf;
8775 /* presume that buffer length is at least 1 */
8776 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008777 len = 1;
8778 break;
8779
8780 case 's':
8781 case 'r':
8782 if (PyUnicode_Check(v) && c == 's') {
8783 temp = v;
8784 Py_INCREF(temp);
8785 }
8786 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008787 if (c == 's')
Thomas Heller519a0422007-11-15 20:48:54 +00008788 temp = PyObject_Str(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008789 else
8790 temp = PyObject_Repr(v);
8791 if (temp == NULL)
8792 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008793 if (PyUnicode_Check(temp))
8794 /* nothing to do */;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008795 else {
8796 Py_DECREF(temp);
8797 PyErr_SetString(PyExc_TypeError,
8798 "%s argument has non-string str()");
8799 goto onError;
8800 }
8801 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008802 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008803 len = PyUnicode_GET_SIZE(temp);
8804 if (prec >= 0 && len > prec)
8805 len = prec;
8806 break;
8807
8808 case 'i':
8809 case 'd':
8810 case 'u':
8811 case 'o':
8812 case 'x':
8813 case 'X':
8814 if (c == 'i')
8815 c = 'd';
Christian Heimesa612dc02008-02-24 13:08:18 +00008816 isnumok = 0;
8817 if (PyNumber_Check(v)) {
8818 PyObject *iobj=NULL;
8819
8820 if (PyLong_Check(v)) {
8821 iobj = v;
8822 Py_INCREF(iobj);
8823 }
8824 else {
8825 iobj = PyNumber_Long(v);
8826 }
8827 if (iobj!=NULL) {
8828 if (PyLong_Check(iobj)) {
8829 isnumok = 1;
8830 temp = formatlong(iobj, flags, prec, c);
8831 Py_DECREF(iobj);
8832 if (!temp)
8833 goto onError;
8834 pbuf = PyUnicode_AS_UNICODE(temp);
8835 len = PyUnicode_GET_SIZE(temp);
8836 sign = 1;
8837 }
8838 else {
8839 Py_DECREF(iobj);
8840 }
8841 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008842 }
Christian Heimesa612dc02008-02-24 13:08:18 +00008843 if (!isnumok) {
8844 PyErr_Format(PyExc_TypeError,
8845 "%%%c format: a number is required, "
8846 "not %.200s", c, Py_TYPE(v)->tp_name);
Tim Peters38fd5b62000-09-21 05:43:11 +00008847 goto onError;
Tim Peters38fd5b62000-09-21 05:43:11 +00008848 }
8849 if (flags & F_ZERO)
8850 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008851 break;
8852
8853 case 'e':
8854 case 'E':
8855 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008856 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008857 case 'g':
8858 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008859 if (c == 'F')
8860 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008861 pbuf = formatbuf;
8862 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8863 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008864 if (len < 0)
8865 goto onError;
8866 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008867 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008868 fill = '0';
8869 break;
8870
8871 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008872 pbuf = formatbuf;
8873 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008874 if (len < 0)
8875 goto onError;
8876 break;
8877
8878 default:
8879 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008880 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00008881 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008882 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008883 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00008884 (Py_ssize_t)(fmt - 1 -
8885 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008886 goto onError;
8887 }
8888 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008889 if (*pbuf == '-' || *pbuf == '+') {
8890 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008891 len--;
8892 }
8893 else if (flags & F_SIGN)
8894 sign = '+';
8895 else if (flags & F_BLANK)
8896 sign = ' ';
8897 else
8898 sign = 0;
8899 }
8900 if (width < len)
8901 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008902 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008903 reslen -= rescnt;
8904 rescnt = width + fmtcnt + 100;
8905 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008906 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008907 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008908 PyErr_NoMemory();
8909 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008910 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008911 if (_PyUnicode_Resize(&result, reslen) < 0) {
8912 Py_XDECREF(temp);
8913 goto onError;
8914 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008915 res = PyUnicode_AS_UNICODE(result)
8916 + reslen - rescnt;
8917 }
8918 if (sign) {
8919 if (fill != ' ')
8920 *res++ = sign;
8921 rescnt--;
8922 if (width > len)
8923 width--;
8924 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008925 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008926 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008927 assert(pbuf[1] == c);
8928 if (fill != ' ') {
8929 *res++ = *pbuf++;
8930 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008931 }
Tim Petersfff53252001-04-12 18:38:48 +00008932 rescnt -= 2;
8933 width -= 2;
8934 if (width < 0)
8935 width = 0;
8936 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008937 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008938 if (width > len && !(flags & F_LJUST)) {
8939 do {
8940 --rescnt;
8941 *res++ = fill;
8942 } while (--width > len);
8943 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008944 if (fill == ' ') {
8945 if (sign)
8946 *res++ = sign;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008947 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008948 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008949 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008950 *res++ = *pbuf++;
8951 *res++ = *pbuf++;
8952 }
8953 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008954 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008955 res += len;
8956 rescnt -= len;
8957 while (--width >= len) {
8958 --rescnt;
8959 *res++ = ' ';
8960 }
8961 if (dict && (argidx < arglen) && c != '%') {
8962 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008963 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008964 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008965 goto onError;
8966 }
8967 Py_XDECREF(temp);
8968 } /* '%' */
8969 } /* until end */
8970 if (argidx < arglen && !dict) {
8971 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008972 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008973 goto onError;
8974 }
8975
Thomas Woutersa96affe2006-03-12 00:29:36 +00008976 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8977 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008978 if (args_owned) {
8979 Py_DECREF(args);
8980 }
8981 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008982 return (PyObject *)result;
8983
8984 onError:
8985 Py_XDECREF(result);
8986 Py_DECREF(uformat);
8987 if (args_owned) {
8988 Py_DECREF(args);
8989 }
8990 return NULL;
8991}
8992
Jeremy Hylton938ace62002-07-17 16:30:39 +00008993static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008994unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8995
Tim Peters6d6c1a32001-08-02 04:15:00 +00008996static PyObject *
8997unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8998{
8999 PyObject *x = NULL;
Guido van Rossum55b4a7b2007-07-11 09:28:11 +00009000 static char *kwlist[] = {"object", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00009001 char *encoding = NULL;
9002 char *errors = NULL;
9003
Guido van Rossume023fe02001-08-30 03:12:59 +00009004 if (type != &PyUnicode_Type)
9005 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009006 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
9007 kwlist, &x, &encoding, &errors))
9008 return NULL;
9009 if (x == NULL)
9010 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00009011 if (encoding == NULL && errors == NULL)
Thomas Heller519a0422007-11-15 20:48:54 +00009012 return PyObject_Str(x);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00009013 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00009014 return PyUnicode_FromEncodedObject(x, encoding, errors);
9015}
9016
Guido van Rossume023fe02001-08-30 03:12:59 +00009017static PyObject *
9018unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9019{
Tim Petersaf90b3e2001-09-12 05:18:58 +00009020 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009021 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009022
9023 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9024 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9025 if (tmp == NULL)
9026 return NULL;
9027 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00009028 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00009029 if (pnew == NULL) {
9030 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00009031 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00009032 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00009033 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
9034 if (pnew->str == NULL) {
9035 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00009036 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00009037 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00009038 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00009039 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00009040 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9041 pnew->length = n;
9042 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00009043 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00009044 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009045}
9046
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009047PyDoc_STRVAR(unicode_doc,
Collin Winterd474ce82007-08-07 19:42:11 +00009048"str(string [, encoding[, errors]]) -> object\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009049\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009050Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009051encoding defaults to the current default string encoding.\n\
9052errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009053
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009054static PyObject *unicode_iter(PyObject *seq);
9055
/* Type object for the built-in "str" type.

   The initializer is positional: each trailing comment names the
   PyTypeObject slot that the value on that line fills.  Slots set to 0
   are left unset here. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                              /* tp_name */
    sizeof(PyUnicodeObject),            /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,        /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_compare */
    unicode_repr,                       /* tp_repr */
    &unicode_as_number,                 /* tp_as_number */
    &unicode_as_sequence,               /* tp_as_sequence */
    &unicode_as_mapping,                /* tp_as_mapping */
    (hashfunc) unicode_hash,            /* tp_hash*/
    0,                                  /* tp_call*/
    (reprfunc) unicode_str,             /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    0,                                  /* tp_as_buffer */
    /* Subclassable, and flagged as a unicode subclass for fast type checks. */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,        /* tp_flags */
    unicode_doc,                        /* tp_doc */
    0,                                  /* tp_traverse */
    0,                                  /* tp_clear */
    PyUnicode_RichCompare,              /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    unicode_iter,                       /* tp_iter */
    0,                                  /* tp_iternext */
    unicode_methods,                    /* tp_methods */
    0,                                  /* tp_members */
    0,                                  /* tp_getset */
    &PyBaseObject_Type,                 /* tp_base */
    0,                                  /* tp_dict */
    0,                                  /* tp_descr_get */
    0,                                  /* tp_descr_set */
    0,                                  /* tp_dictoffset */
    0,                                  /* tp_init */
    0,                                  /* tp_alloc */
    unicode_new,                        /* tp_new */
    PyObject_Del,                       /* tp_free */
};
9099
9100/* Initialize the Unicode implementation */
9101
Thomas Wouters78890102000-07-22 19:25:51 +00009102void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009103{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009104 int i;
9105
Thomas Wouters477c8d52006-05-27 19:21:47 +00009106 /* XXX - move this array to unicodectype.c ? */
9107 Py_UNICODE linebreak[] = {
9108 0x000A, /* LINE FEED */
9109 0x000D, /* CARRIAGE RETURN */
9110 0x001C, /* FILE SEPARATOR */
9111 0x001D, /* GROUP SEPARATOR */
9112 0x001E, /* RECORD SEPARATOR */
9113 0x0085, /* NEXT LINE */
9114 0x2028, /* LINE SEPARATOR */
9115 0x2029, /* PARAGRAPH SEPARATOR */
9116 };
9117
Fred Drakee4315f52000-05-09 19:53:39 +00009118 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009119 free_list = NULL;
9120 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009121 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009122 if (!unicode_empty)
9123 return;
9124
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009125 for (i = 0; i < 256; i++)
9126 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009127 if (PyType_Ready(&PyUnicode_Type) < 0)
9128 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009129
9130 /* initialize the linebreak bloom filter */
9131 bloom_linebreak = make_bloom_mask(
9132 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9133 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009134
9135 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009136}
9137
9138/* Finalize the Unicode implementation */
9139
Christian Heimesa156e092008-02-16 07:38:31 +00009140int
9141PyUnicode_ClearFreeList(void)
9142{
9143 int freelist_size = numfree;
9144 PyUnicodeObject *u;
9145
9146 for (u = free_list; u != NULL;) {
9147 PyUnicodeObject *v = u;
9148 u = *(PyUnicodeObject **)u;
9149 if (v->str)
9150 PyMem_DEL(v->str);
9151 Py_XDECREF(v->defenc);
9152 PyObject_Del(v);
9153 numfree--;
9154 }
9155 free_list = NULL;
9156 assert(numfree == 0);
9157 return freelist_size;
9158}
9159
Guido van Rossumd57fd912000-03-10 22:53:23 +00009160void
Thomas Wouters78890102000-07-22 19:25:51 +00009161_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009162{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009163 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009164
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009165 Py_XDECREF(unicode_empty);
9166 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009167
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009168 for (i = 0; i < 256; i++) {
9169 if (unicode_latin1[i]) {
9170 Py_DECREF(unicode_latin1[i]);
9171 unicode_latin1[i] = NULL;
9172 }
9173 }
Christian Heimesa156e092008-02-16 07:38:31 +00009174 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009175}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009176
Walter Dörwald16807132007-05-25 13:52:07 +00009177void
9178PyUnicode_InternInPlace(PyObject **p)
9179{
9180 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9181 PyObject *t;
9182 if (s == NULL || !PyUnicode_Check(s))
9183 Py_FatalError(
9184 "PyUnicode_InternInPlace: unicode strings only please!");
9185 /* If it's a subclass, we don't really know what putting
9186 it in the interned dict might do. */
9187 if (!PyUnicode_CheckExact(s))
9188 return;
9189 if (PyUnicode_CHECK_INTERNED(s))
9190 return;
9191 if (interned == NULL) {
9192 interned = PyDict_New();
9193 if (interned == NULL) {
9194 PyErr_Clear(); /* Don't leave an exception */
9195 return;
9196 }
9197 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009198 /* It might be that the GetItem call fails even
9199 though the key is present in the dictionary,
9200 namely when this happens during a stack overflow. */
9201 Py_ALLOW_RECURSION
Walter Dörwald16807132007-05-25 13:52:07 +00009202 t = PyDict_GetItem(interned, (PyObject *)s);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009203 Py_END_ALLOW_RECURSION
9204
Walter Dörwald16807132007-05-25 13:52:07 +00009205 if (t) {
9206 Py_INCREF(t);
9207 Py_DECREF(*p);
9208 *p = t;
9209 return;
9210 }
9211
Martin v. Löwis5b222132007-06-10 09:51:05 +00009212 PyThreadState_GET()->recursion_critical = 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009213 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9214 PyErr_Clear();
Martin v. Löwis5b222132007-06-10 09:51:05 +00009215 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009216 return;
9217 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009218 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009219 /* The two references in interned are not counted by refcnt.
9220 The deallocator will take care of this */
Christian Heimes90aa7642007-12-19 02:45:37 +00009221 Py_REFCNT(s) -= 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009222 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
9223}
9224
9225void
9226PyUnicode_InternImmortal(PyObject **p)
9227{
9228 PyUnicode_InternInPlace(p);
9229 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9230 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9231 Py_INCREF(*p);
9232 }
9233}
9234
9235PyObject *
9236PyUnicode_InternFromString(const char *cp)
9237{
9238 PyObject *s = PyUnicode_FromString(cp);
9239 if (s == NULL)
9240 return NULL;
9241 PyUnicode_InternInPlace(&s);
9242 return s;
9243}
9244
9245void _Py_ReleaseInternedUnicodeStrings(void)
9246{
9247 PyObject *keys;
9248 PyUnicodeObject *s;
9249 Py_ssize_t i, n;
9250 Py_ssize_t immortal_size = 0, mortal_size = 0;
9251
9252 if (interned == NULL || !PyDict_Check(interned))
9253 return;
9254 keys = PyDict_Keys(interned);
9255 if (keys == NULL || !PyList_Check(keys)) {
9256 PyErr_Clear();
9257 return;
9258 }
9259
9260 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9261 detector, interned unicode strings are not forcibly deallocated;
9262 rather, we give them their stolen references back, and then clear
9263 and DECREF the interned dict. */
9264
9265 n = PyList_GET_SIZE(keys);
9266 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
9267 n);
9268 for (i = 0; i < n; i++) {
9269 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9270 switch (s->state) {
9271 case SSTATE_NOT_INTERNED:
9272 /* XXX Shouldn't happen */
9273 break;
9274 case SSTATE_INTERNED_IMMORTAL:
Christian Heimes90aa7642007-12-19 02:45:37 +00009275 Py_REFCNT(s) += 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009276 immortal_size += s->length;
9277 break;
9278 case SSTATE_INTERNED_MORTAL:
Christian Heimes90aa7642007-12-19 02:45:37 +00009279 Py_REFCNT(s) += 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009280 mortal_size += s->length;
9281 break;
9282 default:
9283 Py_FatalError("Inconsistent interned string state.");
9284 }
9285 s->state = SSTATE_NOT_INTERNED;
9286 }
9287 fprintf(stderr, "total size of all interned strings: "
9288 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9289 "mortal/immortal\n", mortal_size, immortal_size);
9290 Py_DECREF(keys);
9291 PyDict_Clear(interned);
9292 Py_DECREF(interned);
9293 interned = NULL;
9294}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009295
9296
9297/********************* Unicode Iterator **************************/
9298
/* State of an iterator over a unicode object. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;     /* Index of the next character to yield */
    PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
9304
9305static void
9306unicodeiter_dealloc(unicodeiterobject *it)
9307{
9308 _PyObject_GC_UNTRACK(it);
9309 Py_XDECREF(it->it_seq);
9310 PyObject_GC_Del(it);
9311}
9312
/* GC traversal: the only object reference the iterator holds is it_seq.
   Returns 0 unless Py_VISIT returns early. */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    Py_VISIT(it->it_seq);
    return 0;
}
9319
9320static PyObject *
9321unicodeiter_next(unicodeiterobject *it)
9322{
9323 PyUnicodeObject *seq;
9324 PyObject *item;
9325
9326 assert(it != NULL);
9327 seq = it->it_seq;
9328 if (seq == NULL)
9329 return NULL;
9330 assert(PyUnicode_Check(seq));
9331
9332 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009333 item = PyUnicode_FromUnicode(
9334 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009335 if (item != NULL)
9336 ++it->it_index;
9337 return item;
9338 }
9339
9340 Py_DECREF(seq);
9341 it->it_seq = NULL;
9342 return NULL;
9343}
9344
9345static PyObject *
9346unicodeiter_len(unicodeiterobject *it)
9347{
9348 Py_ssize_t len = 0;
9349 if (it->it_seq)
9350 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
Christian Heimes217cfd12007-12-02 14:31:20 +00009351 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009352}
9353
/* Docstring for the __length_hint__ entry in the method table below. */
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");

/* Method table of the str iterator: a single no-argument method,
   __length_hint__, implemented by unicodeiter_len. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {NULL,      NULL}       /* sentinel */
};
9361
/* Type object for the iterator returned by unicode_iter().

   The initializer is positional: each trailing comment names the
   PyTypeObject slot that the value on that line fills. */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",                     /* tp_name */
    sizeof(unicodeiterobject),          /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_compare */
    0,                                  /* tp_repr */
    0,                                  /* tp_as_number */
    0,                                  /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    0,                                  /* tp_hash */
    0,                                  /* tp_call */
    0,                                  /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    0,                                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    PyObject_SelfIter,                  /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,                /* tp_methods */
    0,                                  /* tp_members */
};
9394
9395static PyObject *
9396unicode_iter(PyObject *seq)
9397{
9398 unicodeiterobject *it;
9399
9400 if (!PyUnicode_Check(seq)) {
9401 PyErr_BadInternalCall();
9402 return NULL;
9403 }
9404 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9405 if (it == NULL)
9406 return NULL;
9407 it->it_index = 0;
9408 Py_INCREF(seq);
9409 it->it_seq = (PyUnicodeObject *)seq;
9410 _PyObject_GC_TRACK(it);
9411 return (PyObject *)it;
9412}
9413
/* Return the number of Py_UNICODE elements in u before the terminating
   zero element.  u must be zero-terminated.

   The length is derived from a pointer difference and returned as
   size_t, so it is not limited to the range of int. */
size_t
Py_UNICODE_strlen(const Py_UNICODE *u)
{
    const Py_UNICODE *end = u;

    while (*end)
        end++;
    return (size_t)(end - u);
}
9422
/* Copy the zero-terminated sequence s2, including its terminator, to
   s1 and return s1.  No bound is applied to the copy. */
Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *dst = s1;
    Py_UNICODE ch;

    do {
        ch = *s2++;
        *dst++ = ch;
    } while (ch);
    return s1;
}
9430
/* Copy at most n elements from the zero-terminated sequence s2 to s1
   and return s1.

   Copying stops after the terminator has been copied or after n
   elements have been written, whichever comes first; nothing is written
   when n is 0.  As with C strncpy, s1 is not terminated if s2 holds n or
   more elements before its terminator.  Unlike C strncpy, the remainder
   of s1 is not zero-padded. */
Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    Py_UNICODE *u = s1;

    /* Test the budget before each store so that no more than n elements
       are ever written. */
    for (; n > 0; n--) {
        if ((*u++ = *s2++) == 0)
            break;
    }
    return s1;
}
9440
/* Compare two zero-terminated Py_UNICODE sequences element by element.

   Returns -1, 0 or +1: the sign of the comparison at the first
   differing element, with a sequence that ends first ordering before
   the longer one. */
int
Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
{
    for (; *s1 && *s2; s1++, s2++) {
        if (*s1 != *s2)
            return (*s1 < *s2) ? -1 : +1;
    }
    /* At least one sequence has ended. */
    if (*s1)
        return 1;
    if (*s2)
        return -1;
    return 0;
}
9454
/* Return a pointer to the first element of the zero-terminated sequence
   s that equals c, or NULL if there is none.

   The terminator itself is never examined, so searching for c == 0
   returns NULL (C strchr would return a pointer to the terminator). */
Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
{
    while (*s) {
        if (*s == c)
            return (Py_UNICODE*)s;
        s++;
    }
    return NULL;
}
9464
9465
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009466#ifdef __cplusplus
9467}
9468#endif
9469
9470
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009471/*
9472Local variables:
9473c-basic-offset: 4
9474indent-tabs-mode: nil
9475End:
9476*/