blob: 86d8b547bfd225c8448c55f3f4ca71a967e369f9 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Eric Smith8c663262007-08-25 02:26:07 +000049#include "formatter_unicode.h"
50
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000051#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000052#include <windows.h>
53#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000054
Guido van Rossumd57fd912000-03-10 22:53:23 +000055/* Limit for the Unicode object free list */
56
Christian Heimes2202f872008-02-06 14:31:34 +000057#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000058
59/* Limit for the Unicode object free list stay alive optimization.
60
61 The implementation will keep allocated Unicode memory intact for
62 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000063 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000064
Christian Heimes2202f872008-02-06 14:31:34 +000065 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000066 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000067 malloc()-overhead) bytes of unused garbage.
68
69 Setting the limit to 0 effectively turns the feature off.
70
Guido van Rossumfd4b9572000-04-10 13:51:10 +000071 Note: This is an experimental feature ! If you get core dumps when
72 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000073
74*/
75
Guido van Rossumfd4b9572000-04-10 13:51:10 +000076#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000077
78/* Endianness switches; defaults to little endian */
79
80#ifdef WORDS_BIGENDIAN
81# define BYTEORDER_IS_BIG_ENDIAN
82#else
83# define BYTEORDER_IS_LITTLE_ENDIAN
84#endif
85
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000086/* --- Globals ------------------------------------------------------------
87
88 The globals are initialized by the _PyUnicode_Init() API and should
89 not be used before calling that API.
90
91*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000092
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000093
94#ifdef __cplusplus
95extern "C" {
96#endif
97
Walter Dörwald16807132007-05-25 13:52:07 +000098/* This dictionary holds all interned unicode strings. Note that references
99 to strings in this dictionary are *not* counted in the string's ob_refcnt.
100 When the interned string reaches a refcnt of 0 the string deallocation
101 function will delete the reference from this dictionary.
102
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000105*/
106static PyObject *interned;
107
Guido van Rossumd57fd912000-03-10 22:53:23 +0000108/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000109static PyUnicodeObject *free_list;
110static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000111
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000112/* The empty Unicode object is shared to improve performance. */
113static PyUnicodeObject *unicode_empty;
114
115/* Single character Unicode strings in the Latin-1 range are being
116 shared as well. */
117static PyUnicodeObject *unicode_latin1[256];
118
Fred Drakee4315f52000-05-09 19:53:39 +0000119/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000120 parameter; it is fixed to "utf-8". Always use the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000121 PyUnicode_GetDefaultEncoding() API to access this global.
122
Alexandre Vassalotti3d2fd7f2007-10-16 00:26:33 +0000123 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000124 hard coded default!
125*/
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000126static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000127
Christian Heimes190d79e2008-01-30 11:58:22 +0000128/* Fast detection of the most frequent whitespace characters */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
     *   0x09 HORIZONTAL TABULATION
     *   0x0A LINE FEED
     *   0x0B VERTICAL TABULATION
     *   0x0C FORM FEED
     *   0x0D CARRIAGE RETURN
     */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
     *   0x1C FILE SEPARATOR
     *   0x1D GROUP SEPARATOR
     *   0x1E RECORD SEPARATOR
     *   0x1F UNIT SEPARATOR
     */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27:
     *   0x20 SPACE
     */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
158
159/* Same for linebreaks */
/* Lookup table for the ASCII range: entry i is 1 when code point i is
   flagged as a line break, 0 otherwise.  The table is only ever read, so
   it is declared const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
     *   0x0A LINE FEED
     *   0x0D CARRIAGE RETURN
     */
    0, 0, 1, 0, 0, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
     *   0x1C FILE SEPARATOR
     *   0x1D GROUP SEPARATOR
     *   0x1E RECORD SEPARATOR
     */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
184
185
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000186Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000187PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000188{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000189#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000190 return 0x10FFFF;
191#else
192 /* This is actually an illegal character, so it should
193 not be passed to unichr. */
194 return 0xFFFF;
195#endif
196}
197
Thomas Wouters477c8d52006-05-27 19:21:47 +0000198/* --- Bloom Filters ----------------------------------------------------- */
199
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, with the 5 least
   significant bits of each Unicode character serving as the bit index. */
203
204/* the linebreak mask is set up by Unicode_Init below */
205
/* Integer type holding a bloom bitmask.  Bit indices are taken modulo 32
   (see BLOOM below), so only bits 0..31 are ever addressed. */
#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero when the bit selected by the five low bits of `ch` is set in
   `mask`.  The shifted constant is an unsigned long: shifting a plain int
   1 left by 31 overflows a 32-bit signed int, which is undefined
   behaviour and, where it "works", sign-extends into the upper half of a
   64-bit mask. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Line-break test: direct table lookup for code points below 128; for
   everything else the bloom mask is consulted first and
   Py_UNICODE_ISLINEBREAK() is only called when the mask bit is set. */
#define BLOOM_LINEBREAK(ch) \
    ((ch) < 128U ? ascii_linebreak[(ch)] : \
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000215
216Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
217{
218 /* calculate simple bloom-style bitmask for a given unicode string */
219
220 long mask;
221 Py_ssize_t i;
222
223 mask = 0;
224 for (i = 0; i < len; i++)
225 mask |= (1 << (ptr[i] & 0x1F));
226
227 return mask;
228}
229
230Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
231{
232 Py_ssize_t i;
233
234 for (i = 0; i < setlen; i++)
235 if (set[i] == chr)
236 return 1;
237
238 return 0;
239}
240
/* True when `chr` is a member of `set` (of length `setlen`).  The cheap
   bloom-mask test runs first; the linear scan in unicode_member() is only
   performed when the mask bit is set.  The expansion is fully
   parenthesized so the macro can be negated or combined with other
   operators safely. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
243
Guido van Rossumd57fd912000-03-10 22:53:23 +0000244/* --- Unicode Object ----------------------------------------------------- */
245
246static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000247int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000248 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000249{
250 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000251
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000252 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000254 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000255
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000256 /* Resizing shared object (unicode_empty or single character
257 objects) in-place is not allowed. Use PyUnicode_Resize()
258 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000259
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000260 if (unicode == unicode_empty ||
261 (unicode->length == 1 &&
262 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000263 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000265 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 return -1;
267 }
268
Thomas Wouters477c8d52006-05-27 19:21:47 +0000269 /* We allocate one more byte to make sure the string is Ux0000 terminated.
270 The overallocation is also used by fastsearch, which assumes that it's
271 safe to look at str[length] (without making any assumptions about what
272 it contains). */
273
Guido van Rossumd57fd912000-03-10 22:53:23 +0000274 oldstr = unicode->str;
275 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
276 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000277 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000278 PyErr_NoMemory();
279 return -1;
280 }
281 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000282 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000283
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000284 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000285 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000286 if (unicode->defenc) {
287 Py_DECREF(unicode->defenc);
288 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000289 }
290 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000291
Guido van Rossumd57fd912000-03-10 22:53:23 +0000292 return 0;
293}
294
/* We allocate one extra Py_UNICODE element so that the string is
   always U+0000 terminated; some code (e.g. new_identifier)
   relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000298
299 XXX This allocator could further be enhanced by assuring that the
300 free list never reduces its size below 1.
301
302*/
303
304static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000305PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000306{
307 register PyUnicodeObject *unicode;
308
Thomas Wouters477c8d52006-05-27 19:21:47 +0000309 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000310 if (length == 0 && unicode_empty != NULL) {
311 Py_INCREF(unicode_empty);
312 return unicode_empty;
313 }
314
315 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000316 if (free_list) {
317 unicode = free_list;
318 free_list = *(PyUnicodeObject **)unicode;
319 numfree--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000320 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000321 /* Keep-Alive optimization: we only upsize the buffer,
322 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000323 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000324 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000325 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000326 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000327 }
328 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000329 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000330 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000331 }
332 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000333 }
334 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000335 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000336 if (unicode == NULL)
337 return NULL;
338 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
339 }
340
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000341 if (!unicode->str) {
342 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000343 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000344 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000345 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000346 * the caller fails before initializing str -- unicode_resize()
347 * reads str[0], and the Keep-Alive optimization can keep memory
348 * allocated for str alive across a call to unicode_dealloc(unicode).
349 * We don't want unicode_resize to read uninitialized memory in
350 * that case.
351 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000352 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000353 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000354 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000355 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000356 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000357 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000358 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000359
360 onError:
361 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000362 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000363 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000364}
365
366static
Guido van Rossum9475a232001-10-05 20:51:39 +0000367void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000368{
Walter Dörwald16807132007-05-25 13:52:07 +0000369 switch (PyUnicode_CHECK_INTERNED(unicode)) {
370 case SSTATE_NOT_INTERNED:
371 break;
372
373 case SSTATE_INTERNED_MORTAL:
374 /* revive dead object temporarily for DelItem */
Christian Heimes90aa7642007-12-19 02:45:37 +0000375 Py_REFCNT(unicode) = 3;
Walter Dörwald16807132007-05-25 13:52:07 +0000376 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
377 Py_FatalError(
378 "deletion of interned unicode string failed");
379 break;
380
381 case SSTATE_INTERNED_IMMORTAL:
382 Py_FatalError("Immortal interned unicode string died.");
383
384 default:
385 Py_FatalError("Inconsistent interned unicode string state.");
386 }
387
Guido van Rossum604ddf82001-12-06 20:03:56 +0000388 if (PyUnicode_CheckExact(unicode) &&
Christian Heimes2202f872008-02-06 14:31:34 +0000389 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000390 /* Keep-Alive optimization */
391 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000392 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000393 unicode->str = NULL;
394 unicode->length = 0;
395 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000396 if (unicode->defenc) {
397 Py_DECREF(unicode->defenc);
398 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000399 }
400 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000401 *(PyUnicodeObject **)unicode = free_list;
402 free_list = unicode;
403 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000404 }
405 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000406 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000407 Py_XDECREF(unicode->defenc);
Christian Heimes90aa7642007-12-19 02:45:37 +0000408 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000409 }
410}
411
Martin v. Löwis18e16552006-02-15 17:27:45 +0000412int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000413{
414 register PyUnicodeObject *v;
415
416 /* Argument checks */
417 if (unicode == NULL) {
418 PyErr_BadInternalCall();
419 return -1;
420 }
421 v = (PyUnicodeObject *)*unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000422 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000423 PyErr_BadInternalCall();
424 return -1;
425 }
426
427 /* Resizing unicode_empty and single character objects is not
428 possible since these are being shared. We simply return a fresh
429 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000430 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000431 (v == unicode_empty || v->length == 1)) {
432 PyUnicodeObject *w = _PyUnicode_New(length);
433 if (w == NULL)
434 return -1;
435 Py_UNICODE_COPY(w->str, v->str,
436 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000437 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000438 *unicode = (PyObject *)w;
439 return 0;
440 }
441
442 /* Note that we don't have to modify *unicode for unshared Unicode
443 objects, since we can modify them in-place. */
444 return unicode_resize(v, length);
445}
446
447/* Internal API for use in unicodeobject.c only ! */
448#define _PyUnicode_Resize(unicodevar, length) \
449 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
450
Guido van Rossumd57fd912000-03-10 22:53:23 +0000451PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000452 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000453{
454 PyUnicodeObject *unicode;
455
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000456 /* If the Unicode data is known at construction time, we can apply
457 some optimizations which share commonly used objects. */
458 if (u != NULL) {
459
460 /* Optimization for empty strings */
461 if (size == 0 && unicode_empty != NULL) {
462 Py_INCREF(unicode_empty);
463 return (PyObject *)unicode_empty;
464 }
465
466 /* Single character Unicode objects in the Latin-1 range are
467 shared when using this constructor */
468 if (size == 1 && *u < 256) {
469 unicode = unicode_latin1[*u];
470 if (!unicode) {
471 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000472 if (!unicode)
473 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000474 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000475 unicode_latin1[*u] = unicode;
476 }
477 Py_INCREF(unicode);
478 return (PyObject *)unicode;
479 }
480 }
Tim Petersced69f82003-09-16 20:30:58 +0000481
Guido van Rossumd57fd912000-03-10 22:53:23 +0000482 unicode = _PyUnicode_New(size);
483 if (!unicode)
484 return NULL;
485
486 /* Copy the Unicode data into the new object */
487 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000488 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000489
490 return (PyObject *)unicode;
491}
492
Walter Dörwaldd2034312007-05-18 16:29:38 +0000493PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000494{
495 PyUnicodeObject *unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000496 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000497 some optimizations which share commonly used objects.
498 Also, this means the input must be UTF-8, so fall back to the
499 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000500 if (u != NULL) {
501
502 /* Optimization for empty strings */
503 if (size == 0 && unicode_empty != NULL) {
504 Py_INCREF(unicode_empty);
505 return (PyObject *)unicode_empty;
506 }
507
Martin v. Löwis9c121062007-08-05 20:26:11 +0000508 /* Single characters are shared when using this constructor.
509 Restrict to ASCII, since the input must be UTF-8. */
510 if (size == 1 && Py_CHARMASK(*u) < 128) {
Guido van Rossum00058aa2007-07-19 18:21:28 +0000511 unicode = unicode_latin1[Py_CHARMASK(*u)];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000512 if (!unicode) {
513 unicode = _PyUnicode_New(1);
514 if (!unicode)
515 return NULL;
Guido van Rossum00058aa2007-07-19 18:21:28 +0000516 unicode->str[0] = Py_CHARMASK(*u);
517 unicode_latin1[Py_CHARMASK(*u)] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000518 }
519 Py_INCREF(unicode);
520 return (PyObject *)unicode;
521 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000522
523 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000524 }
525
Walter Dörwald55507312007-05-18 13:12:10 +0000526 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000527 if (!unicode)
528 return NULL;
529
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000530 return (PyObject *)unicode;
531}
532
Walter Dörwaldd2034312007-05-18 16:29:38 +0000533PyObject *PyUnicode_FromString(const char *u)
534{
535 size_t size = strlen(u);
536 if (size > PY_SSIZE_T_MAX) {
537 PyErr_SetString(PyExc_OverflowError, "input too long");
538 return NULL;
539 }
540
541 return PyUnicode_FromStringAndSize(u, size);
542}
543
Guido van Rossumd57fd912000-03-10 22:53:23 +0000544#ifdef HAVE_WCHAR_H
545
546PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000547 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000548{
549 PyUnicodeObject *unicode;
550
551 if (w == NULL) {
552 PyErr_BadInternalCall();
553 return NULL;
554 }
555
556 unicode = _PyUnicode_New(size);
557 if (!unicode)
558 return NULL;
559
560 /* Copy the wchar_t data into the new object */
561#ifdef HAVE_USABLE_WCHAR_T
562 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000563#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000564 {
565 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000566 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000567 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000568 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000569 *u++ = *w++;
570 }
571#endif
572
573 return (PyObject *)unicode;
574}
575
Walter Dörwald346737f2007-05-31 10:44:43 +0000576static void
577makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
578{
579 *fmt++ = '%';
580 if (width) {
581 if (zeropad)
582 *fmt++ = '0';
583 fmt += sprintf(fmt, "%d", width);
584 }
585 if (precision)
586 fmt += sprintf(fmt, ".%d", precision);
587 if (longflag)
588 *fmt++ = 'l';
589 else if (size_tflag) {
590 char *f = PY_FORMAT_SIZE_T;
591 while (*f)
592 *fmt++ = *f++;
593 }
594 *fmt++ = c;
595 *fmt = '\0';
596}
597
/* Append the NUL-terminated string `string` at the output cursor `s`, one
   element at a time.  Relies on two locals of the expanding function:
   `copy` (scratch read pointer) and `s` (write pointer, left just past
   the copied data).  No terminator is written.  The argument is
   parenthesized so that any expression can be passed safely. */
#define appendstring(string) {for (copy = (string); *copy;) *s++ = *copy++;}
599
600PyObject *
601PyUnicode_FromFormatV(const char *format, va_list vargs)
602{
603 va_list count;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000604 Py_ssize_t callcount = 0;
605 PyObject **callresults = NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +0000606 PyObject **callresult = NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000607 Py_ssize_t n = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000608 int width = 0;
609 int precision = 0;
610 int zeropad;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000611 const char* f;
612 Py_UNICODE *s;
613 PyObject *string;
614 /* used by sprintf */
615 char buffer[21];
Walter Dörwald346737f2007-05-31 10:44:43 +0000616 /* use abuffer instead of buffer, if we need more space
617 * (which can happen if there's a format specifier with width). */
618 char *abuffer = NULL;
619 char *realbuffer;
620 Py_ssize_t abuffersize = 0;
621 char fmt[60]; /* should be enough for %0width.precisionld */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000622 const char *copy;
623
624#ifdef VA_LIST_IS_ARRAY
625 Py_MEMCPY(count, vargs, sizeof(va_list));
626#else
627#ifdef __va_copy
628 __va_copy(count, vargs);
629#else
630 count = vargs;
631#endif
632#endif
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000633 /* step 1: count the number of %S/%R format specifications
Thomas Heller519a0422007-11-15 20:48:54 +0000634 * (we call PyObject_Str()/PyObject_Repr() for these objects
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000635 * once during step 3 and put the result in an array) */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000636 for (f = format; *f; f++) {
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000637 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000638 ++callcount;
639 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000640 /* step 2: allocate memory for the results of
Thomas Heller519a0422007-11-15 20:48:54 +0000641 * PyObject_Str()/PyObject_Repr() calls */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000642 if (callcount) {
643 callresults = PyMem_Malloc(sizeof(PyObject *)*callcount);
644 if (!callresults) {
645 PyErr_NoMemory();
646 return NULL;
647 }
648 callresult = callresults;
649 }
650 /* step 3: figure out how large a buffer we need */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000651 for (f = format; *f; f++) {
652 if (*f == '%') {
653 const char* p = f;
Walter Dörwald346737f2007-05-31 10:44:43 +0000654 width = 0;
Guido van Rossumdaa251c2007-10-25 23:47:33 +0000655 while (ISDIGIT(*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000656 width = (width*10) + *f++ - '0';
Guido van Rossumdaa251c2007-10-25 23:47:33 +0000657 while (*++f && *f != '%' && !ISALPHA(*f))
Walter Dörwaldd2034312007-05-18 16:29:38 +0000658 ;
659
660 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
661 * they don't affect the amount of space we reserve.
662 */
663 if ((*f == 'l' || *f == 'z') &&
664 (f[1] == 'd' || f[1] == 'u'))
Eric Smithddd25822007-08-27 11:33:42 +0000665 ++f;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000666
667 switch (*f) {
668 case 'c':
669 (void)va_arg(count, int);
670 /* fall through... */
671 case '%':
672 n++;
673 break;
674 case 'd': case 'u': case 'i': case 'x':
675 (void) va_arg(count, int);
676 /* 20 bytes is enough to hold a 64-bit
677 integer. Decimal takes the most space.
Walter Dörwald346737f2007-05-31 10:44:43 +0000678 This isn't enough for octal.
679 If a width is specified we need more
680 (which we allocate later). */
681 if (width < 20)
682 width = 20;
683 n += width;
684 if (abuffersize < width)
685 abuffersize = width;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000686 break;
687 case 's':
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000688 {
689 /* UTF-8 */
690 unsigned char*s;
691 s = va_arg(count, unsigned char*);
692 while (*s) {
693 if (*s < 128) {
694 n++; s++;
695 } else if (*s < 0xc0) {
696 /* invalid UTF-8 */
697 n++; s++;
698 } else if (*s < 0xc0) {
699 n++;
700 s++; if(!*s)break;
701 s++;
702 } else if (*s < 0xe0) {
703 n++;
704 s++; if(!*s)break;
705 s++; if(!*s)break;
706 s++;
707 } else {
708 #ifdef Py_UNICODE_WIDE
709 n++;
710 #else
711 n+=2;
712 #endif
713 s++; if(!*s)break;
714 s++; if(!*s)break;
715 s++; if(!*s)break;
716 s++;
717 }
718 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000719 break;
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000720 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000721 case 'U':
722 {
723 PyObject *obj = va_arg(count, PyObject *);
724 assert(obj && PyUnicode_Check(obj));
725 n += PyUnicode_GET_SIZE(obj);
726 break;
727 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000728 case 'V':
729 {
730 PyObject *obj = va_arg(count, PyObject *);
731 const char *str = va_arg(count, const char *);
732 assert(obj || str);
733 assert(!obj || PyUnicode_Check(obj));
734 if (obj)
735 n += PyUnicode_GET_SIZE(obj);
736 else
737 n += strlen(str);
738 break;
739 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000740 case 'S':
741 {
742 PyObject *obj = va_arg(count, PyObject *);
743 PyObject *str;
744 assert(obj);
Thomas Heller519a0422007-11-15 20:48:54 +0000745 str = PyObject_Str(obj);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000746 if (!str)
747 goto fail;
748 n += PyUnicode_GET_SIZE(str);
749 /* Remember the str and switch to the next slot */
750 *callresult++ = str;
751 break;
752 }
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000753 case 'R':
754 {
755 PyObject *obj = va_arg(count, PyObject *);
756 PyObject *repr;
757 assert(obj);
758 repr = PyObject_Repr(obj);
759 if (!repr)
760 goto fail;
761 n += PyUnicode_GET_SIZE(repr);
762 /* Remember the repr and switch to the next slot */
763 *callresult++ = repr;
764 break;
765 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000766 case 'p':
767 (void) va_arg(count, int);
768 /* maximum 64-bit pointer representation:
769 * 0xffffffffffffffff
770 * so 19 characters is enough.
771 * XXX I count 18 -- what's the extra for?
772 */
773 n += 19;
774 break;
775 default:
776 /* if we stumble upon an unknown
777 formatting code, copy the rest of
778 the format string to the output
779 string. (we cannot just skip the
780 code, since there's no way to know
781 what's in the argument list) */
782 n += strlen(p);
783 goto expand;
784 }
785 } else
786 n++;
787 }
788 expand:
Walter Dörwald346737f2007-05-31 10:44:43 +0000789 if (abuffersize > 20) {
790 abuffer = PyMem_Malloc(abuffersize);
791 if (!abuffer) {
792 PyErr_NoMemory();
793 goto fail;
794 }
795 realbuffer = abuffer;
796 }
797 else
798 realbuffer = buffer;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000799 /* step 4: fill the buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000800 /* Since we've analyzed how much space we need for the worst case,
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000801 we don't have to resize the string.
802 There can be no errors beyond this point. */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000803 string = PyUnicode_FromUnicode(NULL, n);
804 if (!string)
Walter Dörwald346737f2007-05-31 10:44:43 +0000805 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000806
807 s = PyUnicode_AS_UNICODE(string);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000808 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000809
810 for (f = format; *f; f++) {
811 if (*f == '%') {
812 const char* p = f++;
813 int longflag = 0;
814 int size_tflag = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000815 zeropad = (*f == '0');
816 /* parse the width.precision part */
817 width = 0;
Guido van Rossumdaa251c2007-10-25 23:47:33 +0000818 while (ISDIGIT(*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000819 width = (width*10) + *f++ - '0';
820 precision = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000821 if (*f == '.') {
822 f++;
Guido van Rossumdaa251c2007-10-25 23:47:33 +0000823 while (ISDIGIT(*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000824 precision = (precision*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000825 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000826 /* handle the long flag, but only for %ld and %lu.
827 others can be added when necessary. */
828 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
829 longflag = 1;
830 ++f;
831 }
832 /* handle the size_t flag. */
833 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
834 size_tflag = 1;
835 ++f;
836 }
837
838 switch (*f) {
839 case 'c':
840 *s++ = va_arg(vargs, int);
841 break;
842 case 'd':
Walter Dörwald346737f2007-05-31 10:44:43 +0000843 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000844 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000845 sprintf(realbuffer, fmt, va_arg(vargs, long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000846 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000847 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000848 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000849 sprintf(realbuffer, fmt, va_arg(vargs, int));
850 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000851 break;
852 case 'u':
Walter Dörwald346737f2007-05-31 10:44:43 +0000853 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000854 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000855 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000856 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000857 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000858 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000859 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
860 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000861 break;
862 case 'i':
Walter Dörwald346737f2007-05-31 10:44:43 +0000863 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
864 sprintf(realbuffer, fmt, va_arg(vargs, int));
865 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000866 break;
867 case 'x':
Walter Dörwald346737f2007-05-31 10:44:43 +0000868 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
869 sprintf(realbuffer, fmt, va_arg(vargs, int));
870 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000871 break;
872 case 's':
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000873 {
874 /* Parameter must be UTF-8 encoded.
875 In case of encoding errors, use
876 the replacement character. */
877 PyObject *u;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000878 p = va_arg(vargs, char*);
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000879 u = PyUnicode_DecodeUTF8(p, strlen(p),
880 "replace");
881 if (!u)
882 goto fail;
883 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
884 PyUnicode_GET_SIZE(u));
885 s += PyUnicode_GET_SIZE(u);
886 Py_DECREF(u);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000887 break;
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000888 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000889 case 'U':
890 {
891 PyObject *obj = va_arg(vargs, PyObject *);
Walter Dörwald5c2fab62007-05-24 19:51:02 +0000892 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
893 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
894 s += size;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000895 break;
896 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000897 case 'V':
898 {
899 PyObject *obj = va_arg(vargs, PyObject *);
900 const char *str = va_arg(vargs, const char *);
901 if (obj) {
902 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
903 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
904 s += size;
905 } else {
906 appendstring(str);
907 }
908 break;
909 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000910 case 'S':
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000911 case 'R':
912 {
Guido van Rossum755114a2007-06-13 01:04:27 +0000913 Py_UNICODE *ucopy;
914 Py_ssize_t usize;
915 Py_ssize_t upos;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000916 /* unused, since we already have the result */
917 (void) va_arg(vargs, PyObject *);
Guido van Rossum755114a2007-06-13 01:04:27 +0000918 ucopy = PyUnicode_AS_UNICODE(*callresult);
919 usize = PyUnicode_GET_SIZE(*callresult);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000920 for (upos = 0; upos<usize;)
921 *s++ = ucopy[upos++];
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000922 /* We're done with the unicode()/repr() => forget it */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000923 Py_DECREF(*callresult);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000924 /* switch to next unicode()/repr() result */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000925 ++callresult;
926 break;
927 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000928 case 'p':
929 sprintf(buffer, "%p", va_arg(vargs, void*));
930 /* %p is ill-defined: ensure leading 0x. */
931 if (buffer[1] == 'X')
932 buffer[1] = 'x';
933 else if (buffer[1] != 'x') {
934 memmove(buffer+2, buffer, strlen(buffer)+1);
935 buffer[0] = '0';
936 buffer[1] = 'x';
937 }
938 appendstring(buffer);
939 break;
940 case '%':
941 *s++ = '%';
942 break;
943 default:
944 appendstring(p);
945 goto end;
946 }
947 } else
948 *s++ = *f;
949 }
950
951 end:
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000952 if (callresults)
953 PyMem_Free(callresults);
Walter Dörwald346737f2007-05-31 10:44:43 +0000954 if (abuffer)
955 PyMem_Free(abuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000956 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
957 return string;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000958 fail:
959 if (callresults) {
960 PyObject **callresult2 = callresults;
Guido van Rossum307fa8c2007-07-16 20:46:27 +0000961 while (callresult2 < callresult) {
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000962 Py_DECREF(*callresult2);
963 ++callresult2;
964 }
965 PyMem_Free(callresults);
966 }
Walter Dörwald346737f2007-05-31 10:44:43 +0000967 if (abuffer)
968 PyMem_Free(abuffer);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000969 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000970}
971
972#undef appendstring
973
974PyObject *
975PyUnicode_FromFormat(const char *format, ...)
976{
977 PyObject* ret;
978 va_list vargs;
979
980#ifdef HAVE_STDARG_PROTOTYPES
981 va_start(vargs, format);
982#else
983 va_start(vargs);
984#endif
985 ret = PyUnicode_FromFormatV(format, vargs);
986 va_end(vargs);
987 return ret;
988}
989
Martin v. Löwis18e16552006-02-15 17:27:45 +0000990Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
991 wchar_t *w,
992 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000993{
994 if (unicode == NULL) {
995 PyErr_BadInternalCall();
996 return -1;
997 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000998
999 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001000 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001001 size = PyUnicode_GET_SIZE(unicode) + 1;
1002
Guido van Rossumd57fd912000-03-10 22:53:23 +00001003#ifdef HAVE_USABLE_WCHAR_T
1004 memcpy(w, unicode->str, size * sizeof(wchar_t));
1005#else
1006 {
1007 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001008 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001009 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +00001010 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001011 *w++ = *u++;
1012 }
1013#endif
1014
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001015 if (size > PyUnicode_GET_SIZE(unicode))
1016 return PyUnicode_GET_SIZE(unicode);
1017 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00001018 return size;
1019}
1020
1021#endif
1022
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001023PyObject *PyUnicode_FromOrdinal(int ordinal)
1024{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001025 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001026
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001027 if (ordinal < 0 || ordinal > 0x10ffff) {
1028 PyErr_SetString(PyExc_ValueError,
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001029 "chr() arg not in range(0x110000)");
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001030 return NULL;
1031 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001032
1033#ifndef Py_UNICODE_WIDE
1034 if (ordinal > 0xffff) {
1035 ordinal -= 0x10000;
1036 s[0] = 0xD800 | (ordinal >> 10);
1037 s[1] = 0xDC00 | (ordinal & 0x3FF);
1038 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001039 }
1040#endif
1041
Hye-Shik Chang40574832004-04-06 07:24:51 +00001042 s[0] = (Py_UNICODE)ordinal;
1043 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001044}
1045
Guido van Rossumd57fd912000-03-10 22:53:23 +00001046PyObject *PyUnicode_FromObject(register PyObject *obj)
1047{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001048 /* XXX Perhaps we should make this API an alias of
Thomas Heller519a0422007-11-15 20:48:54 +00001049 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001050 if (PyUnicode_CheckExact(obj)) {
1051 Py_INCREF(obj);
1052 return obj;
1053 }
1054 if (PyUnicode_Check(obj)) {
1055 /* For a Unicode subtype that's not a Unicode object,
1056 return a true Unicode object with the same data. */
1057 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1058 PyUnicode_GET_SIZE(obj));
1059 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001060 PyErr_Format(PyExc_TypeError,
1061 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001062 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001063 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001064}
1065
1066PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1067 const char *encoding,
1068 const char *errors)
1069{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001070 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001071 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001072 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001073
Guido van Rossumd57fd912000-03-10 22:53:23 +00001074 if (obj == NULL) {
1075 PyErr_BadInternalCall();
1076 return NULL;
1077 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001078
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001079 if (PyUnicode_Check(obj)) {
1080 PyErr_SetString(PyExc_TypeError,
1081 "decoding Unicode is not supported");
1082 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001083 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001084
1085 /* Coerce object */
1086 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001087 s = PyString_AS_STRING(obj);
1088 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001089 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001090 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1091 /* Overwrite the error message with something more useful in
1092 case of a TypeError. */
1093 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001094 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001095 "coercing to Unicode: need string or buffer, "
1096 "%.80s found",
Christian Heimes90aa7642007-12-19 02:45:37 +00001097 Py_TYPE(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001098 goto onError;
1099 }
Tim Petersced69f82003-09-16 20:30:58 +00001100
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001101 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001102 if (len == 0) {
1103 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001104 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001105 }
Tim Petersced69f82003-09-16 20:30:58 +00001106 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001107 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001108
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001109 return v;
1110
1111 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001112 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001113}
1114
1115PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001116 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001117 const char *encoding,
1118 const char *errors)
1119{
1120 PyObject *buffer = NULL, *unicode;
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001121 Py_buffer info;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001122 char lower[20]; /* Enough for any encoding name we recognize */
1123 char *l;
1124 const char *e;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001125
1126 if (encoding == NULL)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001127 encoding = PyUnicode_GetDefaultEncoding();
1128
1129 /* Convert encoding to lower case and replace '_' with '-' in order to
1130 catch e.g. UTF_8 */
1131 e = encoding;
1132 l = lower;
1133 while (*e && l < &lower[(sizeof lower) - 2]) {
1134 if (ISUPPER(*e)) {
1135 *l++ = TOLOWER(*e++);
1136 }
1137 else if (*e == '_') {
1138 *l++ = '-';
1139 e++;
1140 }
1141 else {
1142 *l++ = *e++;
1143 }
1144 }
1145 *l = '\0';
Fred Drakee4315f52000-05-09 19:53:39 +00001146
1147 /* Shortcuts for common default encodings */
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001148 if (strcmp(lower, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001149 return PyUnicode_DecodeUTF8(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001150 else if ((strcmp(lower, "latin-1") == 0) ||
1151 (strcmp(lower, "iso-8859-1") == 0))
Fred Drakee4315f52000-05-09 19:53:39 +00001152 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001153#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001154 else if (strcmp(lower, "mbcs") == 0)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001155 return PyUnicode_DecodeMBCS(s, size, errors);
1156#endif
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001157 else if (strcmp(lower, "ascii") == 0)
Fred Drakee4315f52000-05-09 19:53:39 +00001158 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001159 else if (strcmp(lower, "utf-16") == 0)
1160 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1161 else if (strcmp(lower, "utf-32") == 0)
1162 return PyUnicode_DecodeUTF32(s, size, errors, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001163
1164 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001165 buffer = NULL;
1166 if (PyBuffer_FillInfo(&info, (void *)s, size, 1, PyBUF_SIMPLE) < 0)
1167 goto onError;
1168 buffer = PyMemoryView_FromMemory(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001169 if (buffer == NULL)
1170 goto onError;
1171 unicode = PyCodec_Decode(buffer, encoding, errors);
1172 if (unicode == NULL)
1173 goto onError;
1174 if (!PyUnicode_Check(unicode)) {
1175 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001176 "decoder did not return an unicode object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001177 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001178 Py_DECREF(unicode);
1179 goto onError;
1180 }
1181 Py_DECREF(buffer);
1182 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001183
Guido van Rossumd57fd912000-03-10 22:53:23 +00001184 onError:
1185 Py_XDECREF(buffer);
1186 return NULL;
1187}
1188
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001189PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1190 const char *encoding,
1191 const char *errors)
1192{
1193 PyObject *v;
1194
1195 if (!PyUnicode_Check(unicode)) {
1196 PyErr_BadArgument();
1197 goto onError;
1198 }
1199
1200 if (encoding == NULL)
1201 encoding = PyUnicode_GetDefaultEncoding();
1202
1203 /* Decode via the codec registry */
1204 v = PyCodec_Decode(unicode, encoding, errors);
1205 if (v == NULL)
1206 goto onError;
1207 return v;
1208
1209 onError:
1210 return NULL;
1211}
1212
Guido van Rossumd57fd912000-03-10 22:53:23 +00001213PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001214 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001215 const char *encoding,
1216 const char *errors)
1217{
1218 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001219
Guido van Rossumd57fd912000-03-10 22:53:23 +00001220 unicode = PyUnicode_FromUnicode(s, size);
1221 if (unicode == NULL)
1222 return NULL;
1223 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1224 Py_DECREF(unicode);
1225 return v;
1226}
1227
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001228PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1229 const char *encoding,
1230 const char *errors)
1231{
1232 PyObject *v;
1233
1234 if (!PyUnicode_Check(unicode)) {
1235 PyErr_BadArgument();
1236 goto onError;
1237 }
1238
1239 if (encoding == NULL)
1240 encoding = PyUnicode_GetDefaultEncoding();
1241
1242 /* Encode via the codec registry */
1243 v = PyCodec_Encode(unicode, encoding, errors);
1244 if (v == NULL)
1245 goto onError;
1246 return v;
1247
1248 onError:
1249 return NULL;
1250}
1251
Guido van Rossumd57fd912000-03-10 22:53:23 +00001252PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1253 const char *encoding,
1254 const char *errors)
1255{
1256 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001257
Guido van Rossumd57fd912000-03-10 22:53:23 +00001258 if (!PyUnicode_Check(unicode)) {
1259 PyErr_BadArgument();
1260 goto onError;
1261 }
Fred Drakee4315f52000-05-09 19:53:39 +00001262
Tim Petersced69f82003-09-16 20:30:58 +00001263 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001264 encoding = PyUnicode_GetDefaultEncoding();
1265
1266 /* Shortcuts for common default encodings */
1267 if (errors == NULL) {
1268 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001269 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001270 else if (strcmp(encoding, "latin-1") == 0)
1271 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001272#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1273 else if (strcmp(encoding, "mbcs") == 0)
1274 return PyUnicode_AsMBCSString(unicode);
1275#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001276 else if (strcmp(encoding, "ascii") == 0)
1277 return PyUnicode_AsASCIIString(unicode);
1278 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001279
1280 /* Encode via the codec registry */
1281 v = PyCodec_Encode(unicode, encoding, errors);
1282 if (v == NULL)
1283 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00001284 assert(PyString_Check(v));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001285 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001286
Guido van Rossumd57fd912000-03-10 22:53:23 +00001287 onError:
1288 return NULL;
1289}
1290
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001291PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1292 const char *errors)
1293{
1294 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001295 if (v)
1296 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001297 if (errors != NULL)
1298 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001299 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001300 PyUnicode_GET_SIZE(unicode),
1301 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001302 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001303 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001304 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001305 return v;
1306}
1307
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001308PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001309PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001310 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001311 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1312}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001313
Christian Heimes5894ba72007-11-04 11:43:14 +00001314PyObject*
1315PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1316{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001317 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1318 can be undefined. If it is case, decode using UTF-8. The following assumes
1319 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1320 bootstrapping process where the codecs aren't ready yet.
1321 */
1322 if (Py_FileSystemDefaultEncoding) {
1323#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001324 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001325 return PyUnicode_DecodeMBCS(s, size, "replace");
1326 }
1327#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001328 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001329 return PyUnicode_DecodeUTF8(s, size, "replace");
1330 }
1331#endif
1332 return PyUnicode_Decode(s, size,
1333 Py_FileSystemDefaultEncoding,
1334 "replace");
1335 }
1336 else {
1337 return PyUnicode_DecodeUTF8(s, size, "replace");
1338 }
1339}
1340
Martin v. Löwis5b222132007-06-10 09:51:05 +00001341char*
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001342PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001343{
Christian Heimesf3863112007-11-22 07:46:41 +00001344 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001345 if (!PyUnicode_Check(unicode)) {
1346 PyErr_BadArgument();
1347 return NULL;
1348 }
Christian Heimesf3863112007-11-22 07:46:41 +00001349 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1350 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001351 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001352 if (psize != NULL)
Christian Heimesf3863112007-11-22 07:46:41 +00001353 *psize = PyString_GET_SIZE(bytes);
1354 return PyString_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001355}
1356
1357char*
1358PyUnicode_AsString(PyObject *unicode)
1359{
1360 return PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001361}
1362
Guido van Rossumd57fd912000-03-10 22:53:23 +00001363Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1364{
1365 if (!PyUnicode_Check(unicode)) {
1366 PyErr_BadArgument();
1367 goto onError;
1368 }
1369 return PyUnicode_AS_UNICODE(unicode);
1370
1371 onError:
1372 return NULL;
1373}
1374
Martin v. Löwis18e16552006-02-15 17:27:45 +00001375Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001376{
1377 if (!PyUnicode_Check(unicode)) {
1378 PyErr_BadArgument();
1379 goto onError;
1380 }
1381 return PyUnicode_GET_SIZE(unicode);
1382
1383 onError:
1384 return -1;
1385}
1386
Thomas Wouters78890102000-07-22 19:25:51 +00001387const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001388{
1389 return unicode_default_encoding;
1390}
1391
1392int PyUnicode_SetDefaultEncoding(const char *encoding)
1393{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001394 if (strcmp(encoding, unicode_default_encoding) != 0) {
1395 PyErr_Format(PyExc_ValueError,
1396 "Can only set default encoding to %s",
1397 unicode_default_encoding);
1398 return -1;
1399 }
Fred Drakee4315f52000-05-09 19:53:39 +00001400 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001401}
1402
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001403/* error handling callback helper:
1404 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001405 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001406 and adjust various state variables.
1407 return 0 on success, -1 on error
1408*/
1409
1410static
1411int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1412 const char *encoding, const char *reason,
Walter Dörwalda651d3d2007-08-30 15:29:21 +00001413 const char **input, const char **inend, Py_ssize_t *startinpos,
1414 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001415 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001416{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001417 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001418
1419 PyObject *restuple = NULL;
1420 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001421 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001422 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001423 Py_ssize_t requiredsize;
1424 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001425 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001426 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001427 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001428 int res = -1;
1429
1430 if (*errorHandler == NULL) {
1431 *errorHandler = PyCodec_LookupError(errors);
1432 if (*errorHandler == NULL)
1433 goto onError;
1434 }
1435
1436 if (*exceptionObject == NULL) {
1437 *exceptionObject = PyUnicodeDecodeError_Create(
Walter Dörwalde78178e2007-07-30 13:31:40 +00001438 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001439 if (*exceptionObject == NULL)
1440 goto onError;
1441 }
1442 else {
1443 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1444 goto onError;
1445 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1446 goto onError;
1447 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1448 goto onError;
1449 }
1450
1451 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1452 if (restuple == NULL)
1453 goto onError;
1454 if (!PyTuple_Check(restuple)) {
1455 PyErr_Format(PyExc_TypeError, &argparse[4]);
1456 goto onError;
1457 }
1458 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1459 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001460
1461 /* Copy back the bytes variables, which might have been modified by the
1462 callback */
1463 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1464 if (!inputobj)
1465 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00001466 if (!PyString_Check(inputobj)) {
Walter Dörwalde78178e2007-07-30 13:31:40 +00001467 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
1468 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001469 *input = PyString_AS_STRING(inputobj);
1470 insize = PyString_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001471 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001472 /* we can DECREF safely, as the exception has another reference,
1473 so the object won't go away. */
1474 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001475
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001476 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001477 newpos = insize+newpos;
1478 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001479 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001480 goto onError;
1481 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001482
1483 /* need more space? (at least enough for what we
1484 have+the replacement+the rest of the string (starting
1485 at the new input position), so we won't have to check space
1486 when there are no errors in the rest of the string) */
1487 repptr = PyUnicode_AS_UNICODE(repunicode);
1488 repsize = PyUnicode_GET_SIZE(repunicode);
1489 requiredsize = *outpos + repsize + insize-newpos;
1490 if (requiredsize > outsize) {
1491 if (requiredsize<2*outsize)
1492 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001493 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001494 goto onError;
1495 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1496 }
1497 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001498 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001499 Py_UNICODE_COPY(*outptr, repptr, repsize);
1500 *outptr += repsize;
1501 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001502
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001503 /* we made it! */
1504 res = 0;
1505
1506 onError:
1507 Py_XDECREF(restuple);
1508 return res;
1509}
1510
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001511/* --- UTF-7 Codec -------------------------------------------------------- */
1512
1513/* see RFC2152 for details */
1514
/* Classification table for the 128 character codes 0x00-0x7f, indexed
   by the character code and consulted by the SPECIAL() macro.  Each row
   below covers 16 consecutive codes. */
static
char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
    /* 0x00 - 0x0f: control codes; 0x09, 0x0a and 0x0d count as whitespace */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f: space and punctuation; '+' and '/' are special */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f: digits, then : ; < = > ? */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f: '@', then 'A' - 'O' */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f: 'P' - 'Z', then [ \ ] ^ _ ; the backslash is special */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f: '`', then 'a' - 'o' */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f: 'p' - 'z', then { | } ~ and 0x7f; '~' and 0x7f are special */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};
1533
/* Helper macros for the UTF-7 codec.  They are local to this section of
   the file. */

/* SPECIAL(c, encodeO, encodeWS): true when c must not be written
   directly.  That is the case for anything outside 1..127, for table
   class 1, for whitespace (class 2) when encodeWS is set, and for Set O
   (class 3) when encodeO is set.

   Note: The comparison (c) <= 0 is a trick to work around gcc warnings
   about the comparison always being false; since utf7_special[0] is 1,
   we can safely make that one comparison true. */
#define SPECIAL(c, encodeO, encodeWS)                           \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 ||         \
     (encodeWS && (utf7_special[(c)] == 2)) ||                  \
     (encodeO && (utf7_special[(c)] == 3)))

/* B64(n): base64 digit for the low six bits of n. */
#define B64(n)                                                  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true when c is one of the base64 digits. */
#define B64CHAR(c)                                              \
    (ISALNUM(c) || (c) == '+' || (c) == '/')

/* UB64(c): numeric value (0..63) of base64 digit c; only meaningful
   when B64CHAR(c) holds. */
#define UB64(c)                                                 \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?           \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE(out, ch, bits): while at least six bits are pending in ch,
   write the topmost six as one base64 digit through out and reduce
   bits by six.  out and bits are modified in place. */
#define ENCODE(out, ch, bits)                                   \
    while ((bits) >= 6) {                                       \
        *(out)++ = B64((ch) >> ((bits) - 6));                   \
        (bits) -= 6;                                            \
    }

/* DECODE(out, ch, bits, surrogate): while at least sixteen bits are
   pending in ch, take the topmost sixteen as one code unit.

   - If surrogate is set, an error was already generated for the first
     half of a pair, so the unit is dropped without checking it and the
     flag is cleared.
   - A unit in 0xDC00..0xDFFF is treated as part of a surrogate pair,
     which cannot be represented in a 16-bit character: the flag is set,
     errmsg is assigned and control jumps to utf7Error.
   - Any other unit is stored through out.

   The expansion relies on a variable `errmsg` and a label `utf7Error`
   being available at the point of use. */
#define DECODE(out, ch, bits, surrogate)                                    \
    while ((bits) >= 16) {                                                  \
        Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits) - 16)) & 0xffff); \
        (bits) -= 16;                                                       \
        if (surrogate) {                                                    \
            (surrogate) = 0;                                                \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                    \
            (surrogate) = 1;                                                \
            errmsg = "code pairs are not supported";                        \
            goto utf7Error;                                                 \
        } else {                                                            \
            *(out)++ = outCh;                                               \
        }                                                                   \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001576
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001577PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001578 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001579 const char *errors)
1580{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001581 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1582}
1583
1584PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
1585 Py_ssize_t size,
1586 const char *errors,
1587 Py_ssize_t *consumed)
1588{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001589 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001590 Py_ssize_t startinpos;
1591 Py_ssize_t endinpos;
1592 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001593 const char *e;
1594 PyUnicodeObject *unicode;
1595 Py_UNICODE *p;
1596 const char *errmsg = "";
1597 int inShift = 0;
1598 unsigned int bitsleft = 0;
1599 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001600 int surrogate = 0;
1601 PyObject *errorHandler = NULL;
1602 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001603
1604 unicode = _PyUnicode_New(size);
1605 if (!unicode)
1606 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001607 if (size == 0) {
1608 if (consumed)
1609 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001610 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001611 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001612
1613 p = unicode->str;
1614 e = s + size;
1615
1616 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001617 Py_UNICODE ch;
1618 restart:
1619 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001620
1621 if (inShift) {
1622 if ((ch == '-') || !B64CHAR(ch)) {
1623 inShift = 0;
1624 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001625
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001626 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1627 if (bitsleft >= 6) {
1628 /* The shift sequence has a partial character in it. If
1629 bitsleft < 6 then we could just classify it as padding
1630 but that is not the case here */
1631
1632 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001633 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001634 }
1635 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001636 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001637 here so indicate the potential of a misencoded character. */
1638
1639 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1640 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1641 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001642 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001643 }
1644
1645 if (ch == '-') {
1646 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001647 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001648 inShift = 1;
1649 }
1650 } else if (SPECIAL(ch,0,0)) {
1651 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001652 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001653 } else {
1654 *p++ = ch;
1655 }
1656 } else {
1657 charsleft = (charsleft << 6) | UB64(ch);
1658 bitsleft += 6;
1659 s++;
1660 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1661 }
1662 }
1663 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001664 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001665 s++;
1666 if (s < e && *s == '-') {
1667 s++;
1668 *p++ = '+';
1669 } else
1670 {
1671 inShift = 1;
1672 bitsleft = 0;
1673 }
1674 }
1675 else if (SPECIAL(ch,0,0)) {
Walter Dörwald2b65c752007-08-30 15:35:26 +00001676 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001677 errmsg = "unexpected special character";
1678 s++;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001679 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001680 }
1681 else {
1682 *p++ = ch;
1683 s++;
1684 }
1685 continue;
1686 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001687 outpos = p-PyUnicode_AS_UNICODE(unicode);
1688 endinpos = s-starts;
1689 if (unicode_decode_call_errorhandler(
1690 errors, &errorHandler,
1691 "utf7", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001692 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001693 (PyObject **)&unicode, &outpos, &p))
1694 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001695 }
1696
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001697 if (inShift && !consumed) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001698 outpos = p-PyUnicode_AS_UNICODE(unicode);
1699 endinpos = size;
1700 if (unicode_decode_call_errorhandler(
1701 errors, &errorHandler,
1702 "utf7", "unterminated shift sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00001703 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001704 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001705 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001706 if (s < e)
1707 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001708 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001709 if (consumed) {
1710 if(inShift)
1711 *consumed = startinpos;
1712 else
1713 *consumed = s-starts;
1714 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001715
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001716 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001717 goto onError;
1718
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001719 Py_XDECREF(errorHandler);
1720 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001721 return (PyObject *)unicode;
1722
1723onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001724 Py_XDECREF(errorHandler);
1725 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001726 Py_DECREF(unicode);
1727 return NULL;
1728}
1729
1730
1731PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001732 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001733 int encodeSetO,
1734 int encodeWhiteSpace,
1735 const char *errors)
1736{
Guido van Rossum98297ee2007-11-06 21:34:58 +00001737 PyObject *v, *result;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001738 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001739 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001740 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001741 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001742 unsigned int bitsleft = 0;
1743 unsigned long charsleft = 0;
1744 char * out;
1745 char * start;
1746
1747 if (size == 0)
Christian Heimesf3863112007-11-22 07:46:41 +00001748 return PyString_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001749
Walter Dörwald51ab4142007-05-05 14:43:36 +00001750 v = PyBytes_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001751 if (v == NULL)
1752 return NULL;
1753
Walter Dörwald51ab4142007-05-05 14:43:36 +00001754 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001755 for (;i < size; ++i) {
1756 Py_UNICODE ch = s[i];
1757
1758 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001759 if (ch == '+') {
1760 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001761 *out++ = '-';
1762 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1763 charsleft = ch;
1764 bitsleft = 16;
1765 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001766 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001767 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001768 } else {
1769 *out++ = (char) ch;
1770 }
1771 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001772 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1773 *out++ = B64(charsleft << (6-bitsleft));
1774 charsleft = 0;
1775 bitsleft = 0;
1776 /* Characters not in the BASE64 set implicitly unshift the sequence
1777 so no '-' is required, except if the character is itself a '-' */
1778 if (B64CHAR(ch) || ch == '-') {
1779 *out++ = '-';
1780 }
1781 inShift = 0;
1782 *out++ = (char) ch;
1783 } else {
1784 bitsleft += 16;
1785 charsleft = (charsleft << 16) | ch;
1786 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1787
1788 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001789 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001790 or '-' then the shift sequence will be terminated implicitly and we
1791 don't have to insert a '-'. */
1792
1793 if (bitsleft == 0) {
1794 if (i + 1 < size) {
1795 Py_UNICODE ch2 = s[i+1];
1796
1797 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001798
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001799 } else if (B64CHAR(ch2) || ch2 == '-') {
1800 *out++ = '-';
1801 inShift = 0;
1802 } else {
1803 inShift = 0;
1804 }
1805
1806 }
1807 else {
1808 *out++ = '-';
1809 inShift = 0;
1810 }
1811 }
Tim Petersced69f82003-09-16 20:30:58 +00001812 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001813 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001814 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001815 if (bitsleft) {
1816 *out++= B64(charsleft << (6-bitsleft) );
1817 *out++ = '-';
1818 }
1819
Guido van Rossum98297ee2007-11-06 21:34:58 +00001820 result = PyString_FromStringAndSize(PyBytes_AS_STRING(v), out - start);
1821 Py_DECREF(v);
1822 return result;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001823}
1824
1825#undef SPECIAL
1826#undef B64
1827#undef B64CHAR
1828#undef UB64
1829#undef ENCODE
1830#undef DECODE
1831
Guido van Rossumd57fd912000-03-10 22:53:23 +00001832/* --- UTF-8 Codec -------------------------------------------------------- */
1833
/* Map a UTF-8 encoded prefix byte to the length of the sequence it
   starts.  Zero means illegal prefix.  See RFC 2279 for details. */
static char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single byte */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: not valid as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two bytes */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three bytes */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFF: four, five and six bytes; 0xFE and 0xFF are illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1855
Guido van Rossumd57fd912000-03-10 22:53:23 +00001856PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001857 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001858 const char *errors)
1859{
Walter Dörwald69652032004-09-07 20:24:22 +00001860 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1861}
1862
1863PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001864 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001865 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001866 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001867{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001868 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001869 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001870 Py_ssize_t startinpos;
1871 Py_ssize_t endinpos;
1872 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001873 const char *e;
1874 PyUnicodeObject *unicode;
1875 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001876 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001877 PyObject *errorHandler = NULL;
1878 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001879
1880 /* Note: size will always be longer than the resulting Unicode
1881 character count */
1882 unicode = _PyUnicode_New(size);
1883 if (!unicode)
1884 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001885 if (size == 0) {
1886 if (consumed)
1887 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001888 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001889 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001890
1891 /* Unpack UTF-8 encoded data */
1892 p = unicode->str;
1893 e = s + size;
1894
1895 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001896 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001897
1898 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001899 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001900 s++;
1901 continue;
1902 }
1903
1904 n = utf8_code_length[ch];
1905
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001906 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001907 if (consumed)
1908 break;
1909 else {
1910 errmsg = "unexpected end of data";
1911 startinpos = s-starts;
1912 endinpos = size;
1913 goto utf8Error;
1914 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001915 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001916
1917 switch (n) {
1918
1919 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001920 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001921 startinpos = s-starts;
1922 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001923 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001924
1925 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001926 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001927 startinpos = s-starts;
1928 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001929 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001930
1931 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001932 if ((s[1] & 0xc0) != 0x80) {
1933 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001934 startinpos = s-starts;
1935 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001936 goto utf8Error;
1937 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001938 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001939 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001940 startinpos = s-starts;
1941 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001942 errmsg = "illegal encoding";
1943 goto utf8Error;
1944 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001945 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001946 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001947 break;
1948
1949 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001950 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001951 (s[2] & 0xc0) != 0x80) {
1952 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001953 startinpos = s-starts;
1954 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001955 goto utf8Error;
1956 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001957 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001958 if (ch < 0x0800) {
1959 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001960 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001961
1962 XXX For wide builds (UCS-4) we should probably try
1963 to recombine the surrogates into a single code
1964 unit.
1965 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001966 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001967 startinpos = s-starts;
1968 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001969 goto utf8Error;
1970 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001971 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001972 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001973 break;
1974
1975 case 4:
1976 if ((s[1] & 0xc0) != 0x80 ||
1977 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001978 (s[3] & 0xc0) != 0x80) {
1979 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001980 startinpos = s-starts;
1981 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001982 goto utf8Error;
1983 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001984 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1985 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1986 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001987 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001988 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001989 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001990 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001991 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001992 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001993 startinpos = s-starts;
1994 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001995 goto utf8Error;
1996 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001997#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001998 *p++ = (Py_UNICODE)ch;
1999#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002000 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002001
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002002 /* translate from 10000..10FFFF to 0..FFFF */
2003 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002004
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002005 /* high surrogate = top 10 bits added to D800 */
2006 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002007
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002008 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002009 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002010#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002011 break;
2012
2013 default:
2014 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002015 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002016 startinpos = s-starts;
2017 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002018 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002019 }
2020 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002021 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002022
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002023 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002024 outpos = p-PyUnicode_AS_UNICODE(unicode);
2025 if (unicode_decode_call_errorhandler(
2026 errors, &errorHandler,
2027 "utf8", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002028 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002029 (PyObject **)&unicode, &outpos, &p))
2030 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002031 }
Walter Dörwald69652032004-09-07 20:24:22 +00002032 if (consumed)
2033 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002034
2035 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002036 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002037 goto onError;
2038
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002039 Py_XDECREF(errorHandler);
2040 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002041 return (PyObject *)unicode;
2042
2043onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002044 Py_XDECREF(errorHandler);
2045 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002046 Py_DECREF(unicode);
2047 return NULL;
2048}
2049
Tim Peters602f7402002-04-27 18:03:26 +00002050/* Allocation strategy: if the string is short, convert into a stack buffer
2051 and allocate exactly as much space needed at the end. Else allocate the
2052 maximum possible needed (4 result bytes per Unicode character), and return
2053 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002054*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002055PyObject *
2056PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002057 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00002058 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002059{
Tim Peters602f7402002-04-27 18:03:26 +00002060#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002061
Guido van Rossum98297ee2007-11-06 21:34:58 +00002062 Py_ssize_t i; /* index into s of next input byte */
2063 PyObject *result; /* result string object */
2064 char *p; /* next free byte in output buffer */
2065 Py_ssize_t nallocated; /* number of result bytes allocated */
2066 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002067 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002068
Tim Peters602f7402002-04-27 18:03:26 +00002069 assert(s != NULL);
2070 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002071
Tim Peters602f7402002-04-27 18:03:26 +00002072 if (size <= MAX_SHORT_UNICHARS) {
2073 /* Write into the stack buffer; nallocated can't overflow.
2074 * At the end, we'll allocate exactly as much heap space as it
2075 * turns out we need.
2076 */
2077 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002078 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002079 p = stackbuf;
2080 }
2081 else {
2082 /* Overallocate on the heap, and give the excess back at the end. */
2083 nallocated = size * 4;
2084 if (nallocated / 4 != size) /* overflow! */
2085 return PyErr_NoMemory();
Guido van Rossum98297ee2007-11-06 21:34:58 +00002086 result = PyString_FromStringAndSize(NULL, nallocated);
2087 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002088 return NULL;
Guido van Rossum98297ee2007-11-06 21:34:58 +00002089 p = PyString_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002090 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002091
Tim Peters602f7402002-04-27 18:03:26 +00002092 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002093 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002094
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002095 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002096 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002097 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002098
Guido van Rossumd57fd912000-03-10 22:53:23 +00002099 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002100 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002101 *p++ = (char)(0xc0 | (ch >> 6));
2102 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002103 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002104 else {
Tim Peters602f7402002-04-27 18:03:26 +00002105 /* Encode UCS2 Unicode ordinals */
2106 if (ch < 0x10000) {
2107 /* Special case: check for high surrogate */
2108 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2109 Py_UCS4 ch2 = s[i];
2110 /* Check for low surrogate and combine the two to
2111 form a UCS4 value */
2112 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002113 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002114 i++;
2115 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002116 }
Tim Peters602f7402002-04-27 18:03:26 +00002117 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002118 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002119 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002120 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2121 *p++ = (char)(0x80 | (ch & 0x3f));
2122 continue;
2123 }
2124encodeUCS4:
2125 /* Encode UCS4 Unicode ordinals */
2126 *p++ = (char)(0xf0 | (ch >> 18));
2127 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2128 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2129 *p++ = (char)(0x80 | (ch & 0x3f));
2130 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002131 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002132
Guido van Rossum98297ee2007-11-06 21:34:58 +00002133 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002134 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002135 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002136 assert(nneeded <= nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002137 result = PyString_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002138 }
2139 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002140 /* Cut back to size actually needed. */
Guido van Rossum98297ee2007-11-06 21:34:58 +00002141 nneeded = p - PyString_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002142 assert(nneeded <= nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002143 _PyString_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002144 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002145 return result;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002146
Tim Peters602f7402002-04-27 18:03:26 +00002147#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002148}
2149
Guido van Rossumd57fd912000-03-10 22:53:23 +00002150PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2151{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002152 if (!PyUnicode_Check(unicode)) {
2153 PyErr_BadArgument();
2154 return NULL;
2155 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002156 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2157 PyUnicode_GET_SIZE(unicode),
2158 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002159}
2160
Walter Dörwald41980ca2007-08-16 21:55:45 +00002161/* --- UTF-32 Codec ------------------------------------------------------- */
2162
2163PyObject *
2164PyUnicode_DecodeUTF32(const char *s,
2165 Py_ssize_t size,
2166 const char *errors,
2167 int *byteorder)
2168{
2169 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2170}
2171
2172PyObject *
2173PyUnicode_DecodeUTF32Stateful(const char *s,
2174 Py_ssize_t size,
2175 const char *errors,
2176 int *byteorder,
2177 Py_ssize_t *consumed)
2178{
2179 const char *starts = s;
2180 Py_ssize_t startinpos;
2181 Py_ssize_t endinpos;
2182 Py_ssize_t outpos;
2183 PyUnicodeObject *unicode;
2184 Py_UNICODE *p;
2185#ifndef Py_UNICODE_WIDE
2186 int i, pairs;
2187#else
2188 const int pairs = 0;
2189#endif
2190 const unsigned char *q, *e;
2191 int bo = 0; /* assume native ordering by default */
2192 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002193 /* Offsets from q for retrieving bytes in the right order. */
2194#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2195 int iorder[] = {0, 1, 2, 3};
2196#else
2197 int iorder[] = {3, 2, 1, 0};
2198#endif
2199 PyObject *errorHandler = NULL;
2200 PyObject *exc = NULL;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002201 /* On narrow builds we split characters outside the BMP into two
2202 codepoints => count how much extra space we need. */
2203#ifndef Py_UNICODE_WIDE
2204 for (i = pairs = 0; i < size/4; i++)
2205 if (((Py_UCS4 *)s)[i] >= 0x10000)
2206 pairs++;
2207#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00002208
2209 /* This might be one to much, because of a BOM */
2210 unicode = _PyUnicode_New((size+3)/4+pairs);
2211 if (!unicode)
2212 return NULL;
2213 if (size == 0)
2214 return (PyObject *)unicode;
2215
2216 /* Unpack UTF-32 encoded data */
2217 p = unicode->str;
2218 q = (unsigned char *)s;
2219 e = q + size;
2220
2221 if (byteorder)
2222 bo = *byteorder;
2223
2224 /* Check for BOM marks (U+FEFF) in the input and adjust current
2225 byte order setting accordingly. In native mode, the leading BOM
2226 mark is skipped, in all other modes, it is copied to the output
2227 stream as-is (giving a ZWNBSP character). */
2228 if (bo == 0) {
2229 if (size >= 4) {
2230 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2231 (q[iorder[1]] << 8) | q[iorder[0]];
2232#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2233 if (bom == 0x0000FEFF) {
2234 q += 4;
2235 bo = -1;
2236 }
2237 else if (bom == 0xFFFE0000) {
2238 q += 4;
2239 bo = 1;
2240 }
2241#else
2242 if (bom == 0x0000FEFF) {
2243 q += 4;
2244 bo = 1;
2245 }
2246 else if (bom == 0xFFFE0000) {
2247 q += 4;
2248 bo = -1;
2249 }
2250#endif
2251 }
2252 }
2253
2254 if (bo == -1) {
2255 /* force LE */
2256 iorder[0] = 0;
2257 iorder[1] = 1;
2258 iorder[2] = 2;
2259 iorder[3] = 3;
2260 }
2261 else if (bo == 1) {
2262 /* force BE */
2263 iorder[0] = 3;
2264 iorder[1] = 2;
2265 iorder[2] = 1;
2266 iorder[3] = 0;
2267 }
2268
2269 while (q < e) {
2270 Py_UCS4 ch;
2271 /* remaining bytes at the end? (size should be divisible by 4) */
2272 if (e-q<4) {
2273 if (consumed)
2274 break;
2275 errmsg = "truncated data";
2276 startinpos = ((const char *)q)-starts;
2277 endinpos = ((const char *)e)-starts;
2278 goto utf32Error;
2279 /* The remaining input chars are ignored if the callback
2280 chooses to skip the input */
2281 }
2282 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2283 (q[iorder[1]] << 8) | q[iorder[0]];
2284
2285 if (ch >= 0x110000)
2286 {
2287 errmsg = "codepoint not in range(0x110000)";
2288 startinpos = ((const char *)q)-starts;
2289 endinpos = startinpos+4;
2290 goto utf32Error;
2291 }
2292#ifndef Py_UNICODE_WIDE
2293 if (ch >= 0x10000)
2294 {
2295 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2296 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2297 }
2298 else
2299#endif
2300 *p++ = ch;
2301 q += 4;
2302 continue;
2303 utf32Error:
2304 outpos = p-PyUnicode_AS_UNICODE(unicode);
2305 if (unicode_decode_call_errorhandler(
2306 errors, &errorHandler,
2307 "utf32", errmsg,
2308 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2309 (PyObject **)&unicode, &outpos, &p))
2310 goto onError;
2311 }
2312
2313 if (byteorder)
2314 *byteorder = bo;
2315
2316 if (consumed)
2317 *consumed = (const char *)q-starts;
2318
2319 /* Adjust length */
2320 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2321 goto onError;
2322
2323 Py_XDECREF(errorHandler);
2324 Py_XDECREF(exc);
2325 return (PyObject *)unicode;
2326
2327onError:
2328 Py_DECREF(unicode);
2329 Py_XDECREF(errorHandler);
2330 Py_XDECREF(exc);
2331 return NULL;
2332}
2333
2334PyObject *
2335PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2336 Py_ssize_t size,
2337 const char *errors,
2338 int byteorder)
2339{
Guido van Rossum98297ee2007-11-06 21:34:58 +00002340 PyObject *v, *result;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002341 unsigned char *p;
2342#ifndef Py_UNICODE_WIDE
2343 int i, pairs;
2344#else
2345 const int pairs = 0;
2346#endif
2347 /* Offsets from p for storing byte pairs in the right order. */
2348#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2349 int iorder[] = {0, 1, 2, 3};
2350#else
2351 int iorder[] = {3, 2, 1, 0};
2352#endif
2353
2354#define STORECHAR(CH) \
2355 do { \
2356 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2357 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2358 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2359 p[iorder[0]] = (CH) & 0xff; \
2360 p += 4; \
2361 } while(0)
2362
2363 /* In narrow builds we can output surrogate pairs as one codepoint,
2364 so we need less space. */
2365#ifndef Py_UNICODE_WIDE
2366 for (i = pairs = 0; i < size-1; i++)
2367 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2368 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2369 pairs++;
2370#endif
2371 v = PyBytes_FromStringAndSize(NULL,
2372 4 * (size - pairs + (byteorder == 0)));
2373 if (v == NULL)
2374 return NULL;
2375
2376 p = (unsigned char *)PyBytes_AS_STRING(v);
2377 if (byteorder == 0)
2378 STORECHAR(0xFEFF);
2379 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002380 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002381
2382 if (byteorder == -1) {
2383 /* force LE */
2384 iorder[0] = 0;
2385 iorder[1] = 1;
2386 iorder[2] = 2;
2387 iorder[3] = 3;
2388 }
2389 else if (byteorder == 1) {
2390 /* force BE */
2391 iorder[0] = 3;
2392 iorder[1] = 2;
2393 iorder[2] = 1;
2394 iorder[3] = 0;
2395 }
2396
2397 while (size-- > 0) {
2398 Py_UCS4 ch = *s++;
2399#ifndef Py_UNICODE_WIDE
2400 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2401 Py_UCS4 ch2 = *s;
2402 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2403 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2404 s++;
2405 size--;
2406 }
2407 }
2408#endif
2409 STORECHAR(ch);
2410 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002411
2412 done:
Christian Heimes90aa7642007-12-19 02:45:37 +00002413 result = PyString_FromStringAndSize(PyBytes_AS_STRING(v), Py_SIZE(v));
Guido van Rossum98297ee2007-11-06 21:34:58 +00002414 Py_DECREF(v);
2415 return result;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002416#undef STORECHAR
2417}
2418
2419PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2420{
2421 if (!PyUnicode_Check(unicode)) {
2422 PyErr_BadArgument();
2423 return NULL;
2424 }
2425 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2426 PyUnicode_GET_SIZE(unicode),
2427 NULL,
2428 0);
2429}
2430
Guido van Rossumd57fd912000-03-10 22:53:23 +00002431/* --- UTF-16 Codec ------------------------------------------------------- */
2432
Tim Peters772747b2001-08-09 22:21:55 +00002433PyObject *
2434PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002435 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002436 const char *errors,
2437 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002438{
Walter Dörwald69652032004-09-07 20:24:22 +00002439 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2440}
2441
2442PyObject *
2443PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002444 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002445 const char *errors,
2446 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002447 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002448{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002449 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002450 Py_ssize_t startinpos;
2451 Py_ssize_t endinpos;
2452 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002453 PyUnicodeObject *unicode;
2454 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002455 const unsigned char *q, *e;
2456 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002457 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002458 /* Offsets from q for retrieving byte pairs in the right order. */
2459#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2460 int ihi = 1, ilo = 0;
2461#else
2462 int ihi = 0, ilo = 1;
2463#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002464 PyObject *errorHandler = NULL;
2465 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002466
2467 /* Note: size will always be longer than the resulting Unicode
2468 character count */
2469 unicode = _PyUnicode_New(size);
2470 if (!unicode)
2471 return NULL;
2472 if (size == 0)
2473 return (PyObject *)unicode;
2474
2475 /* Unpack UTF-16 encoded data */
2476 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002477 q = (unsigned char *)s;
2478 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002479
2480 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002481 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002482
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002483 /* Check for BOM marks (U+FEFF) in the input and adjust current
2484 byte order setting accordingly. In native mode, the leading BOM
2485 mark is skipped, in all other modes, it is copied to the output
2486 stream as-is (giving a ZWNBSP character). */
2487 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002488 if (size >= 2) {
2489 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002490#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002491 if (bom == 0xFEFF) {
2492 q += 2;
2493 bo = -1;
2494 }
2495 else if (bom == 0xFFFE) {
2496 q += 2;
2497 bo = 1;
2498 }
Tim Petersced69f82003-09-16 20:30:58 +00002499#else
Walter Dörwald69652032004-09-07 20:24:22 +00002500 if (bom == 0xFEFF) {
2501 q += 2;
2502 bo = 1;
2503 }
2504 else if (bom == 0xFFFE) {
2505 q += 2;
2506 bo = -1;
2507 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002508#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002509 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002510 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002511
Tim Peters772747b2001-08-09 22:21:55 +00002512 if (bo == -1) {
2513 /* force LE */
2514 ihi = 1;
2515 ilo = 0;
2516 }
2517 else if (bo == 1) {
2518 /* force BE */
2519 ihi = 0;
2520 ilo = 1;
2521 }
2522
2523 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002524 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002525 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002526 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002527 if (consumed)
2528 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002529 errmsg = "truncated data";
2530 startinpos = ((const char *)q)-starts;
2531 endinpos = ((const char *)e)-starts;
2532 goto utf16Error;
2533 /* The remaining input chars are ignored if the callback
2534 chooses to skip the input */
2535 }
2536 ch = (q[ihi] << 8) | q[ilo];
2537
Tim Peters772747b2001-08-09 22:21:55 +00002538 q += 2;
2539
Guido van Rossumd57fd912000-03-10 22:53:23 +00002540 if (ch < 0xD800 || ch > 0xDFFF) {
2541 *p++ = ch;
2542 continue;
2543 }
2544
2545 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002546 if (q >= e) {
2547 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002548 startinpos = (((const char *)q)-2)-starts;
2549 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002550 goto utf16Error;
2551 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002552 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002553 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2554 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002555 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002556#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002557 *p++ = ch;
2558 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002559#else
2560 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002561#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002562 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002563 }
2564 else {
2565 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002566 startinpos = (((const char *)q)-4)-starts;
2567 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002568 goto utf16Error;
2569 }
2570
Guido van Rossumd57fd912000-03-10 22:53:23 +00002571 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002572 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002573 startinpos = (((const char *)q)-2)-starts;
2574 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002575 /* Fall through to report the error */
2576
2577 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002578 outpos = p-PyUnicode_AS_UNICODE(unicode);
2579 if (unicode_decode_call_errorhandler(
2580 errors, &errorHandler,
2581 "utf16", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002582 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002583 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002584 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002585 }
2586
2587 if (byteorder)
2588 *byteorder = bo;
2589
Walter Dörwald69652032004-09-07 20:24:22 +00002590 if (consumed)
2591 *consumed = (const char *)q-starts;
2592
Guido van Rossumd57fd912000-03-10 22:53:23 +00002593 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002594 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002595 goto onError;
2596
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002597 Py_XDECREF(errorHandler);
2598 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002599 return (PyObject *)unicode;
2600
2601onError:
2602 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002603 Py_XDECREF(errorHandler);
2604 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002605 return NULL;
2606}
2607
Tim Peters772747b2001-08-09 22:21:55 +00002608PyObject *
2609PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002610 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002611 const char *errors,
2612 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002613{
Guido van Rossum98297ee2007-11-06 21:34:58 +00002614 PyObject *v, *result;
Tim Peters772747b2001-08-09 22:21:55 +00002615 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002616#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00002617 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002618#else
2619 const int pairs = 0;
2620#endif
Tim Peters772747b2001-08-09 22:21:55 +00002621 /* Offsets from p for storing byte pairs in the right order. */
2622#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2623 int ihi = 1, ilo = 0;
2624#else
2625 int ihi = 0, ilo = 1;
2626#endif
2627
2628#define STORECHAR(CH) \
2629 do { \
2630 p[ihi] = ((CH) >> 8) & 0xff; \
2631 p[ilo] = (CH) & 0xff; \
2632 p += 2; \
2633 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002634
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002635#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002636 for (i = pairs = 0; i < size; i++)
2637 if (s[i] >= 0x10000)
2638 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002639#endif
Walter Dörwald3cc34522007-05-04 10:48:27 +00002640 v = PyBytes_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002641 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002642 if (v == NULL)
2643 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002644
Walter Dörwald3cc34522007-05-04 10:48:27 +00002645 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002646 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002647 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002648 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002649 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00002650
2651 if (byteorder == -1) {
2652 /* force LE */
2653 ihi = 1;
2654 ilo = 0;
2655 }
2656 else if (byteorder == 1) {
2657 /* force BE */
2658 ihi = 0;
2659 ilo = 1;
2660 }
2661
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002662 while (size-- > 0) {
2663 Py_UNICODE ch = *s++;
2664 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002665#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002666 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002667 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2668 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002669 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002670#endif
Tim Peters772747b2001-08-09 22:21:55 +00002671 STORECHAR(ch);
2672 if (ch2)
2673 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002674 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002675
2676 done:
Christian Heimes90aa7642007-12-19 02:45:37 +00002677 result = PyString_FromStringAndSize(PyBytes_AS_STRING(v), Py_SIZE(v));
Guido van Rossum98297ee2007-11-06 21:34:58 +00002678 Py_DECREF(v);
2679 return result;
Tim Peters772747b2001-08-09 22:21:55 +00002680#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002681}
2682
2683PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2684{
2685 if (!PyUnicode_Check(unicode)) {
2686 PyErr_BadArgument();
2687 return NULL;
2688 }
2689 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2690 PyUnicode_GET_SIZE(unicode),
2691 NULL,
2692 0);
2693}
2694
2695/* --- Unicode Escape Codec ----------------------------------------------- */
2696
Fredrik Lundh06d12682001-01-24 07:59:11 +00002697static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002698
Guido van Rossumd57fd912000-03-10 22:53:23 +00002699PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002700 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002701 const char *errors)
2702{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002703 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002704 Py_ssize_t startinpos;
2705 Py_ssize_t endinpos;
2706 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002707 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002708 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002709 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002710 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002711 char* message;
2712 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002713 PyObject *errorHandler = NULL;
2714 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002715
Guido van Rossumd57fd912000-03-10 22:53:23 +00002716 /* Escaped strings will always be longer than the resulting
2717 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002718 length after conversion to the true value.
2719 (but if the error callback returns a long replacement string
2720 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002721 v = _PyUnicode_New(size);
2722 if (v == NULL)
2723 goto onError;
2724 if (size == 0)
2725 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002726
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002727 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002728 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002729
Guido van Rossumd57fd912000-03-10 22:53:23 +00002730 while (s < end) {
2731 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002732 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002733 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002734
2735 /* Non-escape characters are interpreted as Unicode ordinals */
2736 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002737 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002738 continue;
2739 }
2740
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002741 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002742 /* \ - Escapes */
2743 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002744 c = *s++;
2745 if (s > end)
2746 c = '\0'; /* Invalid after \ */
2747 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002748
2749 /* \x escapes */
2750 case '\n': break;
2751 case '\\': *p++ = '\\'; break;
2752 case '\'': *p++ = '\''; break;
2753 case '\"': *p++ = '\"'; break;
2754 case 'b': *p++ = '\b'; break;
2755 case 'f': *p++ = '\014'; break; /* FF */
2756 case 't': *p++ = '\t'; break;
2757 case 'n': *p++ = '\n'; break;
2758 case 'r': *p++ = '\r'; break;
2759 case 'v': *p++ = '\013'; break; /* VT */
2760 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2761
2762 /* \OOO (octal) escapes */
2763 case '0': case '1': case '2': case '3':
2764 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002765 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002766 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002767 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002768 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002769 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002770 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002771 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002772 break;
2773
Fredrik Lundhccc74732001-02-18 22:13:49 +00002774 /* hex escapes */
2775 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002776 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002777 digits = 2;
2778 message = "truncated \\xXX escape";
2779 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002780
Fredrik Lundhccc74732001-02-18 22:13:49 +00002781 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002782 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002783 digits = 4;
2784 message = "truncated \\uXXXX escape";
2785 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002786
Fredrik Lundhccc74732001-02-18 22:13:49 +00002787 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002788 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002789 digits = 8;
2790 message = "truncated \\UXXXXXXXX escape";
2791 hexescape:
2792 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002793 outpos = p-PyUnicode_AS_UNICODE(v);
2794 if (s+digits>end) {
2795 endinpos = size;
2796 if (unicode_decode_call_errorhandler(
2797 errors, &errorHandler,
2798 "unicodeescape", "end of string in escape sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002799 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002800 (PyObject **)&v, &outpos, &p))
2801 goto onError;
2802 goto nextByte;
2803 }
2804 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002805 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00002806 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002807 endinpos = (s+i+1)-starts;
2808 if (unicode_decode_call_errorhandler(
2809 errors, &errorHandler,
2810 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002811 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002812 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002813 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002814 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002815 }
2816 chr = (chr<<4) & ~0xF;
2817 if (c >= '0' && c <= '9')
2818 chr += c - '0';
2819 else if (c >= 'a' && c <= 'f')
2820 chr += 10 + c - 'a';
2821 else
2822 chr += 10 + c - 'A';
2823 }
2824 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002825 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002826 /* _decoding_error will have already written into the
2827 target buffer. */
2828 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002829 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002830 /* when we get here, chr is a 32-bit unicode character */
2831 if (chr <= 0xffff)
2832 /* UCS-2 character */
2833 *p++ = (Py_UNICODE) chr;
2834 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002835 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002836 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002837#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002838 *p++ = chr;
2839#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002840 chr -= 0x10000L;
2841 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002842 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002843#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002844 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002845 endinpos = s-starts;
2846 outpos = p-PyUnicode_AS_UNICODE(v);
2847 if (unicode_decode_call_errorhandler(
2848 errors, &errorHandler,
2849 "unicodeescape", "illegal Unicode character",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002850 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002851 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002852 goto onError;
2853 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002854 break;
2855
2856 /* \N{name} */
2857 case 'N':
2858 message = "malformed \\N character escape";
2859 if (ucnhash_CAPI == NULL) {
2860 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002861 PyObject *m, *api;
Christian Heimes072c0f12008-01-03 23:01:04 +00002862 m = PyImport_ImportModuleNoBlock("unicodedata");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002863 if (m == NULL)
2864 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002865 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002866 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002867 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002868 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002869 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002870 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002871 if (ucnhash_CAPI == NULL)
2872 goto ucnhashError;
2873 }
2874 if (*s == '{') {
2875 const char *start = s+1;
2876 /* look for the closing brace */
2877 while (*s != '}' && s < end)
2878 s++;
2879 if (s > start && s < end && *s == '}') {
2880 /* found a name. look it up in the unicode database */
2881 message = "unknown Unicode character name";
2882 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002883 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002884 goto store;
2885 }
2886 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002887 endinpos = s-starts;
2888 outpos = p-PyUnicode_AS_UNICODE(v);
2889 if (unicode_decode_call_errorhandler(
2890 errors, &errorHandler,
2891 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002892 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002893 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002894 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002895 break;
2896
2897 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002898 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002899 message = "\\ at end of string";
2900 s--;
2901 endinpos = s-starts;
2902 outpos = p-PyUnicode_AS_UNICODE(v);
2903 if (unicode_decode_call_errorhandler(
2904 errors, &errorHandler,
2905 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002906 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002907 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002908 goto onError;
2909 }
2910 else {
2911 *p++ = '\\';
2912 *p++ = (unsigned char)s[-1];
2913 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002914 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002915 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002916 nextByte:
2917 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002918 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002919 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002920 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002921 Py_XDECREF(errorHandler);
2922 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002923 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002924
Fredrik Lundhccc74732001-02-18 22:13:49 +00002925ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002926 PyErr_SetString(
2927 PyExc_UnicodeError,
2928 "\\N escapes not supported (can't load unicodedata module)"
2929 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002930 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002931 Py_XDECREF(errorHandler);
2932 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002933 return NULL;
2934
Fredrik Lundhccc74732001-02-18 22:13:49 +00002935onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002936 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002937 Py_XDECREF(errorHandler);
2938 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002939 return NULL;
2940}
2941
2942/* Return a Unicode-Escape string version of the Unicode object.
2943
2944 If quotes is true, the string is enclosed in u"" or u'' quotes as
2945 appropriate.
2946
2947*/
2948
Thomas Wouters477c8d52006-05-27 19:21:47 +00002949Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2950 Py_ssize_t size,
2951 Py_UNICODE ch)
2952{
2953 /* like wcschr, but doesn't stop at NULL characters */
2954
2955 while (size-- > 0) {
2956 if (*s == ch)
2957 return s;
2958 s++;
2959 }
2960
2961 return NULL;
2962}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002963
Walter Dörwald79e913e2007-05-12 11:08:06 +00002964static const char *hexdigits = "0123456789abcdef";
2965
2966PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2967 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002968{
Guido van Rossum98297ee2007-11-06 21:34:58 +00002969 PyObject *repr, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002970 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002971
Thomas Wouters89f507f2006-12-13 04:49:30 +00002972 /* XXX(nnorwitz): rather than over-allocating, it would be
2973 better to choose a different scheme. Perhaps scan the
2974 first N-chars of the string and allocate based on that size.
2975 */
2976 /* Initial allocation is based on the longest-possible unichr
2977 escape.
2978
2979 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2980 unichr, so in this case it's the longest unichr escape. In
2981 narrow (UTF-16) builds this is five chars per source unichr
2982 since there are two unichrs in the surrogate pair, so in narrow
2983 (UTF-16) builds it's not the longest unichr escape.
2984
2985 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2986 so in the narrow (UTF-16) build case it's the longest unichr
2987 escape.
2988 */
2989
Walter Dörwald79e913e2007-05-12 11:08:06 +00002990 repr = PyBytes_FromStringAndSize(NULL,
Thomas Wouters89f507f2006-12-13 04:49:30 +00002991#ifdef Py_UNICODE_WIDE
2992 + 10*size
2993#else
2994 + 6*size
2995#endif
2996 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002997 if (repr == NULL)
2998 return NULL;
2999
Walter Dörwald79e913e2007-05-12 11:08:06 +00003000 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003001
Guido van Rossumd57fd912000-03-10 22:53:23 +00003002 while (size-- > 0) {
3003 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003004
Walter Dörwald79e913e2007-05-12 11:08:06 +00003005 /* Escape backslashes */
3006 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003007 *p++ = '\\';
3008 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003009 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003010 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003011
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003012#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003013 /* Map 21-bit characters to '\U00xxxxxx' */
3014 else if (ch >= 0x10000) {
3015 *p++ = '\\';
3016 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003017 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3018 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3019 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3020 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3021 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3022 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3023 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3024 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003025 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003026 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003027#else
3028 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003029 else if (ch >= 0xD800 && ch < 0xDC00) {
3030 Py_UNICODE ch2;
3031 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003032
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003033 ch2 = *s++;
3034 size--;
3035 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3036 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3037 *p++ = '\\';
3038 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003039 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3040 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3041 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3042 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3043 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3044 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3045 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3046 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003047 continue;
3048 }
3049 /* Fall through: isolated surrogates are copied as-is */
3050 s--;
3051 size++;
3052 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003053#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003054
Guido van Rossumd57fd912000-03-10 22:53:23 +00003055 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003056 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003057 *p++ = '\\';
3058 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003059 *p++ = hexdigits[(ch >> 12) & 0x000F];
3060 *p++ = hexdigits[(ch >> 8) & 0x000F];
3061 *p++ = hexdigits[(ch >> 4) & 0x000F];
3062 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003063 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003064
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003065 /* Map special whitespace to '\t', \n', '\r' */
3066 else if (ch == '\t') {
3067 *p++ = '\\';
3068 *p++ = 't';
3069 }
3070 else if (ch == '\n') {
3071 *p++ = '\\';
3072 *p++ = 'n';
3073 }
3074 else if (ch == '\r') {
3075 *p++ = '\\';
3076 *p++ = 'r';
3077 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003078
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003079 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003080 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003081 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003082 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003083 *p++ = hexdigits[(ch >> 4) & 0x000F];
3084 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003085 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003086
Guido van Rossumd57fd912000-03-10 22:53:23 +00003087 /* Copy everything else as-is */
3088 else
3089 *p++ = (char) ch;
3090 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003091
Guido van Rossum98297ee2007-11-06 21:34:58 +00003092 result = PyString_FromStringAndSize(PyBytes_AS_STRING(repr),
3093 p - PyBytes_AS_STRING(repr));
3094 Py_DECREF(repr);
3095 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003096}
3097
Guido van Rossumd57fd912000-03-10 22:53:23 +00003098PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3099{
Walter Dörwald79e913e2007-05-12 11:08:06 +00003100 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003101 if (!PyUnicode_Check(unicode)) {
3102 PyErr_BadArgument();
3103 return NULL;
3104 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003105 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3106 PyUnicode_GET_SIZE(unicode));
3107
3108 if (!s)
3109 return NULL;
3110 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
3111 PyBytes_GET_SIZE(s));
3112 Py_DECREF(s);
3113 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003114}
3115
3116/* --- Raw Unicode Escape Codec ------------------------------------------- */
3117
3118PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003119 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003120 const char *errors)
3121{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003122 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003123 Py_ssize_t startinpos;
3124 Py_ssize_t endinpos;
3125 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003126 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003127 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003128 const char *end;
3129 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003130 PyObject *errorHandler = NULL;
3131 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003132
Guido van Rossumd57fd912000-03-10 22:53:23 +00003133 /* Escaped strings will always be longer than the resulting
3134 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003135 length after conversion to the true value. (But decoding error
3136 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003137 v = _PyUnicode_New(size);
3138 if (v == NULL)
3139 goto onError;
3140 if (size == 0)
3141 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003142 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003143 end = s + size;
3144 while (s < end) {
3145 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00003146 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003147 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003148 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003149
3150 /* Non-escape characters are interpreted as Unicode ordinals */
3151 if (*s != '\\') {
3152 *p++ = (unsigned char)*s++;
3153 continue;
3154 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003155 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003156
3157 /* \u-escapes are only interpreted iff the number of leading
3158 backslashes if odd */
3159 bs = s;
3160 for (;s < end;) {
3161 if (*s != '\\')
3162 break;
3163 *p++ = (unsigned char)*s++;
3164 }
3165 if (((s - bs) & 1) == 0 ||
3166 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003167 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003168 continue;
3169 }
3170 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003171 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003172 s++;
3173
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003174 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003175 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003176 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003177 c = (unsigned char)*s;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003178 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003179 endinpos = s-starts;
3180 if (unicode_decode_call_errorhandler(
3181 errors, &errorHandler,
3182 "rawunicodeescape", "truncated \\uXXXX",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003183 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003184 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003185 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003186 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003187 }
3188 x = (x<<4) & ~0xF;
3189 if (c >= '0' && c <= '9')
3190 x += c - '0';
3191 else if (c >= 'a' && c <= 'f')
3192 x += 10 + c - 'a';
3193 else
3194 x += 10 + c - 'A';
3195 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003196#ifndef Py_UNICODE_WIDE
3197 if (x > 0x10000) {
3198 if (unicode_decode_call_errorhandler(
3199 errors, &errorHandler,
3200 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003201 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003202 (PyObject **)&v, &outpos, &p))
3203 goto onError;
3204 }
3205#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003206 *p++ = x;
3207 nextByte:
3208 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003209 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003210 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003211 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003212 Py_XDECREF(errorHandler);
3213 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003214 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003215
Guido van Rossumd57fd912000-03-10 22:53:23 +00003216 onError:
3217 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003218 Py_XDECREF(errorHandler);
3219 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003220 return NULL;
3221}
3222
3223PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003224 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003225{
Guido van Rossum98297ee2007-11-06 21:34:58 +00003226 PyObject *repr, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003227 char *p;
3228 char *q;
3229
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003230#ifdef Py_UNICODE_WIDE
Walter Dörwald711005d2007-05-12 12:03:26 +00003231 repr = PyBytes_FromStringAndSize(NULL, 10 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003232#else
Walter Dörwald711005d2007-05-12 12:03:26 +00003233 repr = PyBytes_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003234#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003235 if (repr == NULL)
3236 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003237 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003238 goto done;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003239
Walter Dörwald711005d2007-05-12 12:03:26 +00003240 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003241 while (size-- > 0) {
3242 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003243#ifdef Py_UNICODE_WIDE
3244 /* Map 32-bit characters to '\Uxxxxxxxx' */
3245 if (ch >= 0x10000) {
3246 *p++ = '\\';
3247 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003248 *p++ = hexdigits[(ch >> 28) & 0xf];
3249 *p++ = hexdigits[(ch >> 24) & 0xf];
3250 *p++ = hexdigits[(ch >> 20) & 0xf];
3251 *p++ = hexdigits[(ch >> 16) & 0xf];
3252 *p++ = hexdigits[(ch >> 12) & 0xf];
3253 *p++ = hexdigits[(ch >> 8) & 0xf];
3254 *p++ = hexdigits[(ch >> 4) & 0xf];
3255 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003256 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003257 else
3258#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003259 /* Map 16-bit characters to '\uxxxx' */
3260 if (ch >= 256) {
3261 *p++ = '\\';
3262 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003263 *p++ = hexdigits[(ch >> 12) & 0xf];
3264 *p++ = hexdigits[(ch >> 8) & 0xf];
3265 *p++ = hexdigits[(ch >> 4) & 0xf];
3266 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003267 }
3268 /* Copy everything else as-is */
3269 else
3270 *p++ = (char) ch;
3271 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003272 size = p - q;
3273
3274 done:
3275 result = PyString_FromStringAndSize(PyBytes_AS_STRING(repr), size);
3276 Py_DECREF(repr);
3277 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003278}
3279
3280PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3281{
Walter Dörwald711005d2007-05-12 12:03:26 +00003282 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003283 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003284 PyErr_BadArgument();
3285 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003286 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003287 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3288 PyUnicode_GET_SIZE(unicode));
3289
3290 if (!s)
3291 return NULL;
3292 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
3293 PyBytes_GET_SIZE(s));
3294 Py_DECREF(s);
3295 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003296}
3297
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003298/* --- Unicode Internal Codec ------------------------------------------- */
3299
3300PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003301 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003302 const char *errors)
3303{
3304 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003305 Py_ssize_t startinpos;
3306 Py_ssize_t endinpos;
3307 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003308 PyUnicodeObject *v;
3309 Py_UNICODE *p;
3310 const char *end;
3311 const char *reason;
3312 PyObject *errorHandler = NULL;
3313 PyObject *exc = NULL;
3314
Neal Norwitzd43069c2006-01-08 01:12:10 +00003315#ifdef Py_UNICODE_WIDE
3316 Py_UNICODE unimax = PyUnicode_GetMax();
3317#endif
3318
Thomas Wouters89f507f2006-12-13 04:49:30 +00003319 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003320 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3321 if (v == NULL)
3322 goto onError;
3323 if (PyUnicode_GetSize((PyObject *)v) == 0)
3324 return (PyObject *)v;
3325 p = PyUnicode_AS_UNICODE(v);
3326 end = s + size;
3327
3328 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00003329 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003330 /* We have to sanity check the raw data, otherwise doom looms for
3331 some malformed UCS-4 data. */
3332 if (
3333 #ifdef Py_UNICODE_WIDE
3334 *p > unimax || *p < 0 ||
3335 #endif
3336 end-s < Py_UNICODE_SIZE
3337 )
3338 {
3339 startinpos = s - starts;
3340 if (end-s < Py_UNICODE_SIZE) {
3341 endinpos = end-starts;
3342 reason = "truncated input";
3343 }
3344 else {
3345 endinpos = s - starts + Py_UNICODE_SIZE;
3346 reason = "illegal code point (> 0x10FFFF)";
3347 }
3348 outpos = p - PyUnicode_AS_UNICODE(v);
3349 if (unicode_decode_call_errorhandler(
3350 errors, &errorHandler,
3351 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003352 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003353 (PyObject **)&v, &outpos, &p)) {
3354 goto onError;
3355 }
3356 }
3357 else {
3358 p++;
3359 s += Py_UNICODE_SIZE;
3360 }
3361 }
3362
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003363 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003364 goto onError;
3365 Py_XDECREF(errorHandler);
3366 Py_XDECREF(exc);
3367 return (PyObject *)v;
3368
3369 onError:
3370 Py_XDECREF(v);
3371 Py_XDECREF(errorHandler);
3372 Py_XDECREF(exc);
3373 return NULL;
3374}
3375
Guido van Rossumd57fd912000-03-10 22:53:23 +00003376/* --- Latin-1 Codec ------------------------------------------------------ */
3377
3378PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003379 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003380 const char *errors)
3381{
3382 PyUnicodeObject *v;
3383 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003384
Guido van Rossumd57fd912000-03-10 22:53:23 +00003385 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003386 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003387 Py_UNICODE r = *(unsigned char*)s;
3388 return PyUnicode_FromUnicode(&r, 1);
3389 }
3390
Guido van Rossumd57fd912000-03-10 22:53:23 +00003391 v = _PyUnicode_New(size);
3392 if (v == NULL)
3393 goto onError;
3394 if (size == 0)
3395 return (PyObject *)v;
3396 p = PyUnicode_AS_UNICODE(v);
3397 while (size-- > 0)
3398 *p++ = (unsigned char)*s++;
3399 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003400
Guido van Rossumd57fd912000-03-10 22:53:23 +00003401 onError:
3402 Py_XDECREF(v);
3403 return NULL;
3404}
3405
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003406/* create or adjust a UnicodeEncodeError */
3407static void make_encode_exception(PyObject **exceptionObject,
3408 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003409 const Py_UNICODE *unicode, Py_ssize_t size,
3410 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003411 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003412{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003413 if (*exceptionObject == NULL) {
3414 *exceptionObject = PyUnicodeEncodeError_Create(
3415 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003416 }
3417 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003418 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3419 goto onError;
3420 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3421 goto onError;
3422 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3423 goto onError;
3424 return;
3425 onError:
3426 Py_DECREF(*exceptionObject);
3427 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003428 }
3429}
3430
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003431/* raises a UnicodeEncodeError */
3432static void raise_encode_exception(PyObject **exceptionObject,
3433 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003434 const Py_UNICODE *unicode, Py_ssize_t size,
3435 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003436 const char *reason)
3437{
3438 make_encode_exception(exceptionObject,
3439 encoding, unicode, size, startpos, endpos, reason);
3440 if (*exceptionObject != NULL)
3441 PyCodec_StrictErrors(*exceptionObject);
3442}
3443
3444/* error handling callback helper:
3445 build arguments, call the callback and check the arguments,
3446 put the result into newpos and return the replacement string, which
3447 has to be freed by the caller */
3448static PyObject *unicode_encode_call_errorhandler(const char *errors,
3449 PyObject **errorHandler,
3450 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003451 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3452 Py_ssize_t startpos, Py_ssize_t endpos,
3453 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003454{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003455 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003456
3457 PyObject *restuple;
3458 PyObject *resunicode;
3459
3460 if (*errorHandler == NULL) {
3461 *errorHandler = PyCodec_LookupError(errors);
3462 if (*errorHandler == NULL)
3463 return NULL;
3464 }
3465
3466 make_encode_exception(exceptionObject,
3467 encoding, unicode, size, startpos, endpos, reason);
3468 if (*exceptionObject == NULL)
3469 return NULL;
3470
3471 restuple = PyObject_CallFunctionObjArgs(
3472 *errorHandler, *exceptionObject, NULL);
3473 if (restuple == NULL)
3474 return NULL;
3475 if (!PyTuple_Check(restuple)) {
3476 PyErr_Format(PyExc_TypeError, &argparse[4]);
3477 Py_DECREF(restuple);
3478 return NULL;
3479 }
3480 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3481 &resunicode, newpos)) {
3482 Py_DECREF(restuple);
3483 return NULL;
3484 }
3485 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003486 *newpos = size+*newpos;
3487 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003488 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003489 Py_DECREF(restuple);
3490 return NULL;
3491 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003492 Py_INCREF(resunicode);
3493 Py_DECREF(restuple);
3494 return resunicode;
3495}
3496
3497static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003498 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003499 const char *errors,
3500 int limit)
3501{
3502 /* output object */
3503 PyObject *res;
3504 /* pointers to the beginning and end+1 of input */
3505 const Py_UNICODE *startp = p;
3506 const Py_UNICODE *endp = p + size;
3507 /* pointer to the beginning of the unencodable characters */
3508 /* const Py_UNICODE *badp = NULL; */
3509 /* pointer into the output */
3510 char *str;
3511 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003512 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003513 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3514 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003515 PyObject *errorHandler = NULL;
3516 PyObject *exc = NULL;
Guido van Rossum98297ee2007-11-06 21:34:58 +00003517 PyObject *result = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003518 /* the following variable is used for caching string comparisons
3519 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3520 int known_errorHandler = -1;
3521
3522 /* allocate enough for a simple encoding without
3523 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00003524 if (size == 0)
3525 return PyString_FromStringAndSize(NULL, 0);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003526 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003527 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003528 return NULL;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003529 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003530 ressize = size;
3531
3532 while (p<endp) {
3533 Py_UNICODE c = *p;
3534
3535 /* can we encode this? */
3536 if (c<limit) {
3537 /* no overflow check, because we know that the space is enough */
3538 *str++ = (char)c;
3539 ++p;
3540 }
3541 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003542 Py_ssize_t unicodepos = p-startp;
3543 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003544 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003545 Py_ssize_t repsize;
3546 Py_ssize_t newpos;
3547 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003548 Py_UNICODE *uni2;
3549 /* startpos for collecting unencodable chars */
3550 const Py_UNICODE *collstart = p;
3551 const Py_UNICODE *collend = p;
3552 /* find all unecodable characters */
3553 while ((collend < endp) && ((*collend)>=limit))
3554 ++collend;
3555 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3556 if (known_errorHandler==-1) {
3557 if ((errors==NULL) || (!strcmp(errors, "strict")))
3558 known_errorHandler = 1;
3559 else if (!strcmp(errors, "replace"))
3560 known_errorHandler = 2;
3561 else if (!strcmp(errors, "ignore"))
3562 known_errorHandler = 3;
3563 else if (!strcmp(errors, "xmlcharrefreplace"))
3564 known_errorHandler = 4;
3565 else
3566 known_errorHandler = 0;
3567 }
3568 switch (known_errorHandler) {
3569 case 1: /* strict */
3570 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3571 goto onError;
3572 case 2: /* replace */
3573 while (collstart++<collend)
3574 *str++ = '?'; /* fall through */
3575 case 3: /* ignore */
3576 p = collend;
3577 break;
3578 case 4: /* xmlcharrefreplace */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003579 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003580 /* determine replacement size (temporarily (mis)uses p) */
3581 for (p = collstart, repsize = 0; p < collend; ++p) {
3582 if (*p<10)
3583 repsize += 2+1+1;
3584 else if (*p<100)
3585 repsize += 2+2+1;
3586 else if (*p<1000)
3587 repsize += 2+3+1;
3588 else if (*p<10000)
3589 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003590#ifndef Py_UNICODE_WIDE
3591 else
3592 repsize += 2+5+1;
3593#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003594 else if (*p<100000)
3595 repsize += 2+5+1;
3596 else if (*p<1000000)
3597 repsize += 2+6+1;
3598 else
3599 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003600#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003601 }
3602 requiredsize = respos+repsize+(endp-collend);
3603 if (requiredsize > ressize) {
3604 if (requiredsize<2*ressize)
3605 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003606 if (PyBytes_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003607 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003608 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003609 ressize = requiredsize;
3610 }
3611 /* generate replacement (temporarily (mis)uses p) */
3612 for (p = collstart; p < collend; ++p) {
3613 str += sprintf(str, "&#%d;", (int)*p);
3614 }
3615 p = collend;
3616 break;
3617 default:
3618 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3619 encoding, reason, startp, size, &exc,
3620 collstart-startp, collend-startp, &newpos);
3621 if (repunicode == NULL)
3622 goto onError;
3623 /* need more space? (at least enough for what we
3624 have+the replacement+the rest of the string, so
3625 we won't have to check space for encodable characters) */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003626 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003627 repsize = PyUnicode_GET_SIZE(repunicode);
3628 requiredsize = respos+repsize+(endp-collend);
3629 if (requiredsize > ressize) {
3630 if (requiredsize<2*ressize)
3631 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003632 if (PyBytes_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003633 Py_DECREF(repunicode);
3634 goto onError;
3635 }
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003636 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003637 ressize = requiredsize;
3638 }
3639 /* check if there is anything unencodable in the replacement
3640 and copy it to the output */
3641 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3642 c = *uni2;
3643 if (c >= limit) {
3644 raise_encode_exception(&exc, encoding, startp, size,
3645 unicodepos, unicodepos+1, reason);
3646 Py_DECREF(repunicode);
3647 goto onError;
3648 }
3649 *str = (char)c;
3650 }
3651 p = startp + newpos;
3652 Py_DECREF(repunicode);
3653 }
3654 }
3655 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003656 result = PyString_FromStringAndSize(PyBytes_AS_STRING(res),
3657 str - PyBytes_AS_STRING(res));
3658 onError:
3659 Py_DECREF(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003660 Py_XDECREF(errorHandler);
3661 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003662 return result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003663}
3664
Guido van Rossumd57fd912000-03-10 22:53:23 +00003665PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003666 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003667 const char *errors)
3668{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003669 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003670}
3671
3672PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3673{
3674 if (!PyUnicode_Check(unicode)) {
3675 PyErr_BadArgument();
3676 return NULL;
3677 }
3678 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3679 PyUnicode_GET_SIZE(unicode),
3680 NULL);
3681}
3682
3683/* --- 7-bit ASCII Codec -------------------------------------------------- */
3684
Guido van Rossumd57fd912000-03-10 22:53:23 +00003685PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003686 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003687 const char *errors)
3688{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003689 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003690 PyUnicodeObject *v;
3691 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003692 Py_ssize_t startinpos;
3693 Py_ssize_t endinpos;
3694 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003695 const char *e;
3696 PyObject *errorHandler = NULL;
3697 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003698
Guido van Rossumd57fd912000-03-10 22:53:23 +00003699 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003700 if (size == 1 && *(unsigned char*)s < 128) {
3701 Py_UNICODE r = *(unsigned char*)s;
3702 return PyUnicode_FromUnicode(&r, 1);
3703 }
Tim Petersced69f82003-09-16 20:30:58 +00003704
Guido van Rossumd57fd912000-03-10 22:53:23 +00003705 v = _PyUnicode_New(size);
3706 if (v == NULL)
3707 goto onError;
3708 if (size == 0)
3709 return (PyObject *)v;
3710 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003711 e = s + size;
3712 while (s < e) {
3713 register unsigned char c = (unsigned char)*s;
3714 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003715 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003716 ++s;
3717 }
3718 else {
3719 startinpos = s-starts;
3720 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003721 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003722 if (unicode_decode_call_errorhandler(
3723 errors, &errorHandler,
3724 "ascii", "ordinal not in range(128)",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003725 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003726 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003727 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003728 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003729 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00003730 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003731 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003732 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003733 Py_XDECREF(errorHandler);
3734 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003735 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003736
Guido van Rossumd57fd912000-03-10 22:53:23 +00003737 onError:
3738 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003739 Py_XDECREF(errorHandler);
3740 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003741 return NULL;
3742}
3743
Guido van Rossumd57fd912000-03-10 22:53:23 +00003744PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003745 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003746 const char *errors)
3747{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003748 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003749}
3750
3751PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3752{
3753 if (!PyUnicode_Check(unicode)) {
3754 PyErr_BadArgument();
3755 return NULL;
3756 }
3757 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3758 PyUnicode_GET_SIZE(unicode),
3759 NULL);
3760}
3761
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003762#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003763
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003764/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003765
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003766#if SIZEOF_INT < SIZEOF_SSIZE_T
3767#define NEED_RETRY
3768#endif
3769
3770/* XXX This code is limited to "true" double-byte encodings, as
3771 a) it assumes an incomplete character consists of a single byte, and
3772 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3773 encodings, see IsDBCSLeadByteEx documentation. */
3774
/* Report whether s[offset] should be treated as a DBCS lead byte.

   Returns 0 when IsDBCSLeadByte() rejects the byte.  Otherwise returns 1
   if CharPrev() could not step back from it, if the previous character
   does not begin with a lead byte, or if the previous character begins
   exactly two bytes earlier; returns 0 in the remaining case. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *target = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*target))
        return 0;

    before = CharPrev(s, target);
    if (before == target)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return target - before == 2;
}
3785
3786/*
3787 * Decode MBCS string into unicode object. If 'final' is set, converts
3788 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3789 */
3790static int decode_mbcs(PyUnicodeObject **v,
3791 const char *s, /* MBCS string */
3792 int size, /* sizeof MBCS string */
3793 int final)
3794{
3795 Py_UNICODE *p;
3796 Py_ssize_t n = 0;
3797 int usize = 0;
3798
3799 assert(size >= 0);
3800
3801 /* Skip trailing lead-byte unless 'final' is set */
3802 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3803 --size;
3804
3805 /* First get the size of the result */
3806 if (size > 0) {
3807 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3808 if (usize == 0) {
3809 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3810 return -1;
3811 }
3812 }
3813
3814 if (*v == NULL) {
3815 /* Create unicode object */
3816 *v = _PyUnicode_New(usize);
3817 if (*v == NULL)
3818 return -1;
3819 }
3820 else {
3821 /* Extend unicode object */
3822 n = PyUnicode_GET_SIZE(*v);
3823 if (_PyUnicode_Resize(v, n + usize) < 0)
3824 return -1;
3825 }
3826
3827 /* Do the conversion */
3828 if (size > 0) {
3829 p = PyUnicode_AS_UNICODE(*v) + n;
3830 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3831 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3832 return -1;
3833 }
3834 }
3835
3836 return size;
3837}
3838
3839PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3840 Py_ssize_t size,
3841 const char *errors,
3842 Py_ssize_t *consumed)
3843{
3844 PyUnicodeObject *v = NULL;
3845 int done;
3846
3847 if (consumed)
3848 *consumed = 0;
3849
3850#ifdef NEED_RETRY
3851 retry:
3852 if (size > INT_MAX)
3853 done = decode_mbcs(&v, s, INT_MAX, 0);
3854 else
3855#endif
3856 done = decode_mbcs(&v, s, (int)size, !consumed);
3857
3858 if (done < 0) {
3859 Py_XDECREF(v);
3860 return NULL;
3861 }
3862
3863 if (consumed)
3864 *consumed += done;
3865
3866#ifdef NEED_RETRY
3867 if (size > INT_MAX) {
3868 s += done;
3869 size -= done;
3870 goto retry;
3871 }
3872#endif
3873
3874 return (PyObject *)v;
3875}
3876
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003877PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003878 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003879 const char *errors)
3880{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003881 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3882}
3883
3884/*
3885 * Convert unicode into string object (MBCS).
3886 * Returns 0 if succeed, -1 otherwise.
3887 */
3888static int encode_mbcs(PyObject **repr,
3889 const Py_UNICODE *p, /* unicode */
3890 int size) /* size of unicode */
3891{
3892 int mbcssize = 0;
3893 Py_ssize_t n = 0;
3894
3895 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003896
3897 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003898 if (size > 0) {
3899 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3900 if (mbcssize == 0) {
3901 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3902 return -1;
3903 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003904 }
3905
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003906 if (*repr == NULL) {
3907 /* Create string object */
Guido van Rossum98297ee2007-11-06 21:34:58 +00003908 *repr = PyString_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003909 if (*repr == NULL)
3910 return -1;
3911 }
3912 else {
3913 /* Extend string object */
Guido van Rossum98297ee2007-11-06 21:34:58 +00003914 n = PyString_Size(*repr);
3915 if (_PyString_Resize(repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003916 return -1;
3917 }
3918
3919 /* Do the conversion */
3920 if (size > 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00003921 char *s = PyString_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003922 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3923 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3924 return -1;
3925 }
3926 }
3927
3928 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003929}
3930
3931PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003932 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003933 const char *errors)
3934{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003935 PyObject *repr = NULL;
3936 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003937
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003938#ifdef NEED_RETRY
3939 retry:
3940 if (size > INT_MAX)
3941 ret = encode_mbcs(&repr, p, INT_MAX);
3942 else
3943#endif
3944 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003945
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003946 if (ret < 0) {
3947 Py_XDECREF(repr);
3948 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003949 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003950
3951#ifdef NEED_RETRY
3952 if (size > INT_MAX) {
3953 p += INT_MAX;
3954 size -= INT_MAX;
3955 goto retry;
3956 }
3957#endif
3958
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003959 return repr;
3960}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003961
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003962PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3963{
3964 if (!PyUnicode_Check(unicode)) {
3965 PyErr_BadArgument();
3966 return NULL;
3967 }
3968 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3969 PyUnicode_GET_SIZE(unicode),
3970 NULL);
3971}
3972
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003973#undef NEED_RETRY
3974
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003975#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003976
Guido van Rossumd57fd912000-03-10 22:53:23 +00003977/* --- Character Mapping Codec -------------------------------------------- */
3978
Guido van Rossumd57fd912000-03-10 22:53:23 +00003979PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003980 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003981 PyObject *mapping,
3982 const char *errors)
3983{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003984 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003985 Py_ssize_t startinpos;
3986 Py_ssize_t endinpos;
3987 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003988 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003989 PyUnicodeObject *v;
3990 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003991 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003992 PyObject *errorHandler = NULL;
3993 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003994 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003995 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003996
Guido van Rossumd57fd912000-03-10 22:53:23 +00003997 /* Default to Latin-1 */
3998 if (mapping == NULL)
3999 return PyUnicode_DecodeLatin1(s, size, errors);
4000
4001 v = _PyUnicode_New(size);
4002 if (v == NULL)
4003 goto onError;
4004 if (size == 0)
4005 return (PyObject *)v;
4006 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004007 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004008 if (PyUnicode_CheckExact(mapping)) {
4009 mapstring = PyUnicode_AS_UNICODE(mapping);
4010 maplen = PyUnicode_GET_SIZE(mapping);
4011 while (s < e) {
4012 unsigned char ch = *s;
4013 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004014
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004015 if (ch < maplen)
4016 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004017
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004018 if (x == 0xfffe) {
4019 /* undefined mapping */
4020 outpos = p-PyUnicode_AS_UNICODE(v);
4021 startinpos = s-starts;
4022 endinpos = startinpos+1;
4023 if (unicode_decode_call_errorhandler(
4024 errors, &errorHandler,
4025 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00004026 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004027 (PyObject **)&v, &outpos, &p)) {
4028 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00004029 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004030 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00004031 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004032 *p++ = x;
4033 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004034 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004035 }
4036 else {
4037 while (s < e) {
4038 unsigned char ch = *s;
4039 PyObject *w, *x;
4040
4041 /* Get mapping (char ordinal -> integer, Unicode char or None) */
Christian Heimes217cfd12007-12-02 14:31:20 +00004042 w = PyLong_FromLong((long)ch);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004043 if (w == NULL)
4044 goto onError;
4045 x = PyObject_GetItem(mapping, w);
4046 Py_DECREF(w);
4047 if (x == NULL) {
4048 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4049 /* No mapping found means: mapping is undefined. */
4050 PyErr_Clear();
4051 x = Py_None;
4052 Py_INCREF(x);
4053 } else
4054 goto onError;
4055 }
4056
4057 /* Apply mapping */
Christian Heimes217cfd12007-12-02 14:31:20 +00004058 if (PyLong_Check(x)) {
4059 long value = PyLong_AS_LONG(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004060 if (value < 0 || value > 65535) {
4061 PyErr_SetString(PyExc_TypeError,
4062 "character mapping must be in range(65536)");
4063 Py_DECREF(x);
4064 goto onError;
4065 }
4066 *p++ = (Py_UNICODE)value;
4067 }
4068 else if (x == Py_None) {
4069 /* undefined mapping */
4070 outpos = p-PyUnicode_AS_UNICODE(v);
4071 startinpos = s-starts;
4072 endinpos = startinpos+1;
4073 if (unicode_decode_call_errorhandler(
4074 errors, &errorHandler,
4075 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00004076 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004077 (PyObject **)&v, &outpos, &p)) {
4078 Py_DECREF(x);
4079 goto onError;
4080 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00004081 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004082 continue;
4083 }
4084 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004085 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004086
4087 if (targetsize == 1)
4088 /* 1-1 mapping */
4089 *p++ = *PyUnicode_AS_UNICODE(x);
4090
4091 else if (targetsize > 1) {
4092 /* 1-n mapping */
4093 if (targetsize > extrachars) {
4094 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004095 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4096 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004097 (targetsize << 2);
4098 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00004099 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004100 if (_PyUnicode_Resize(&v,
4101 PyUnicode_GET_SIZE(v) + needed) < 0) {
4102 Py_DECREF(x);
4103 goto onError;
4104 }
4105 p = PyUnicode_AS_UNICODE(v) + oldpos;
4106 }
4107 Py_UNICODE_COPY(p,
4108 PyUnicode_AS_UNICODE(x),
4109 targetsize);
4110 p += targetsize;
4111 extrachars -= targetsize;
4112 }
4113 /* 1-0 mapping: skip the character */
4114 }
4115 else {
4116 /* wrong return value */
4117 PyErr_SetString(PyExc_TypeError,
4118 "character mapping must return integer, None or unicode");
4119 Py_DECREF(x);
4120 goto onError;
4121 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004122 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004123 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004124 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004125 }
4126 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004127 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004128 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004129 Py_XDECREF(errorHandler);
4130 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004131 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004132
Guido van Rossumd57fd912000-03-10 22:53:23 +00004133 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004134 Py_XDECREF(errorHandler);
4135 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004136 Py_XDECREF(v);
4137 return NULL;
4138}
4139
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004140/* Charmap encoding: the lookup table */
4141
4142struct encoding_map{
4143 PyObject_HEAD
4144 unsigned char level1[32];
4145 int count2, count3;
4146 unsigned char level23[1];
4147};
4148
4149static PyObject*
4150encoding_map_size(PyObject *obj, PyObject* args)
4151{
4152 struct encoding_map *map = (struct encoding_map*)obj;
Christian Heimes217cfd12007-12-02 14:31:20 +00004153 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004154 128*map->count3);
4155}
4156
4157static PyMethodDef encoding_map_methods[] = {
4158 {"size", encoding_map_size, METH_NOARGS,
4159 PyDoc_STR("Return the size (in bytes) of this object") },
4160 { 0 }
4161};
4162
4163static void
4164encoding_map_dealloc(PyObject* o)
4165{
4166 PyObject_FREE(o);
4167}
4168
4169static PyTypeObject EncodingMapType = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004170 PyVarObject_HEAD_INIT(NULL, 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004171 "EncodingMap", /*tp_name*/
4172 sizeof(struct encoding_map), /*tp_basicsize*/
4173 0, /*tp_itemsize*/
4174 /* methods */
4175 encoding_map_dealloc, /*tp_dealloc*/
4176 0, /*tp_print*/
4177 0, /*tp_getattr*/
4178 0, /*tp_setattr*/
4179 0, /*tp_compare*/
4180 0, /*tp_repr*/
4181 0, /*tp_as_number*/
4182 0, /*tp_as_sequence*/
4183 0, /*tp_as_mapping*/
4184 0, /*tp_hash*/
4185 0, /*tp_call*/
4186 0, /*tp_str*/
4187 0, /*tp_getattro*/
4188 0, /*tp_setattro*/
4189 0, /*tp_as_buffer*/
4190 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4191 0, /*tp_doc*/
4192 0, /*tp_traverse*/
4193 0, /*tp_clear*/
4194 0, /*tp_richcompare*/
4195 0, /*tp_weaklistoffset*/
4196 0, /*tp_iter*/
4197 0, /*tp_iternext*/
4198 encoding_map_methods, /*tp_methods*/
4199 0, /*tp_members*/
4200 0, /*tp_getset*/
4201 0, /*tp_base*/
4202 0, /*tp_dict*/
4203 0, /*tp_descr_get*/
4204 0, /*tp_descr_set*/
4205 0, /*tp_dictoffset*/
4206 0, /*tp_init*/
4207 0, /*tp_alloc*/
4208 0, /*tp_new*/
4209 0, /*tp_free*/
4210 0, /*tp_is_gc*/
4211};
4212
4213PyObject*
4214PyUnicode_BuildEncodingMap(PyObject* string)
4215{
4216 Py_UNICODE *decode;
4217 PyObject *result;
4218 struct encoding_map *mresult;
4219 int i;
4220 int need_dict = 0;
4221 unsigned char level1[32];
4222 unsigned char level2[512];
4223 unsigned char *mlevel1, *mlevel2, *mlevel3;
4224 int count2 = 0, count3 = 0;
4225
4226 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4227 PyErr_BadArgument();
4228 return NULL;
4229 }
4230 decode = PyUnicode_AS_UNICODE(string);
4231 memset(level1, 0xFF, sizeof level1);
4232 memset(level2, 0xFF, sizeof level2);
4233
4234 /* If there isn't a one-to-one mapping of NULL to \0,
4235 or if there are non-BMP characters, we need to use
4236 a mapping dictionary. */
4237 if (decode[0] != 0)
4238 need_dict = 1;
4239 for (i = 1; i < 256; i++) {
4240 int l1, l2;
4241 if (decode[i] == 0
4242 #ifdef Py_UNICODE_WIDE
4243 || decode[i] > 0xFFFF
4244 #endif
4245 ) {
4246 need_dict = 1;
4247 break;
4248 }
4249 if (decode[i] == 0xFFFE)
4250 /* unmapped character */
4251 continue;
4252 l1 = decode[i] >> 11;
4253 l2 = decode[i] >> 7;
4254 if (level1[l1] == 0xFF)
4255 level1[l1] = count2++;
4256 if (level2[l2] == 0xFF)
4257 level2[l2] = count3++;
4258 }
4259
4260 if (count2 >= 0xFF || count3 >= 0xFF)
4261 need_dict = 1;
4262
4263 if (need_dict) {
4264 PyObject *result = PyDict_New();
4265 PyObject *key, *value;
4266 if (!result)
4267 return NULL;
4268 for (i = 0; i < 256; i++) {
4269 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00004270 key = PyLong_FromLong(decode[i]);
4271 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004272 if (!key || !value)
4273 goto failed1;
4274 if (PyDict_SetItem(result, key, value) == -1)
4275 goto failed1;
4276 Py_DECREF(key);
4277 Py_DECREF(value);
4278 }
4279 return result;
4280 failed1:
4281 Py_XDECREF(key);
4282 Py_XDECREF(value);
4283 Py_DECREF(result);
4284 return NULL;
4285 }
4286
4287 /* Create a three-level trie */
4288 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4289 16*count2 + 128*count3 - 1);
4290 if (!result)
4291 return PyErr_NoMemory();
4292 PyObject_Init(result, &EncodingMapType);
4293 mresult = (struct encoding_map*)result;
4294 mresult->count2 = count2;
4295 mresult->count3 = count3;
4296 mlevel1 = mresult->level1;
4297 mlevel2 = mresult->level23;
4298 mlevel3 = mresult->level23 + 16*count2;
4299 memcpy(mlevel1, level1, 32);
4300 memset(mlevel2, 0xFF, 16*count2);
4301 memset(mlevel3, 0, 128*count3);
4302 count3 = 0;
4303 for (i = 1; i < 256; i++) {
4304 int o1, o2, o3, i2, i3;
4305 if (decode[i] == 0xFFFE)
4306 /* unmapped character */
4307 continue;
4308 o1 = decode[i]>>11;
4309 o2 = (decode[i]>>7) & 0xF;
4310 i2 = 16*mlevel1[o1] + o2;
4311 if (mlevel2[i2] == 0xFF)
4312 mlevel2[i2] = count3++;
4313 o3 = decode[i] & 0x7F;
4314 i3 = 128*mlevel2[i2] + o3;
4315 mlevel3[i3] = i;
4316 }
4317 return result;
4318}
4319
4320static int
4321encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4322{
4323 struct encoding_map *map = (struct encoding_map*)mapping;
4324 int l1 = c>>11;
4325 int l2 = (c>>7) & 0xF;
4326 int l3 = c & 0x7F;
4327 int i;
4328
4329#ifdef Py_UNICODE_WIDE
4330 if (c > 0xFFFF) {
4331 return -1;
4332 }
4333#endif
4334 if (c == 0)
4335 return 0;
4336 /* level 1*/
4337 i = map->level1[l1];
4338 if (i == 0xFF) {
4339 return -1;
4340 }
4341 /* level 2*/
4342 i = map->level23[16*i+l2];
4343 if (i == 0xFF) {
4344 return -1;
4345 }
4346 /* level 3 */
4347 i = map->level23[16*map->count2 + 128*i + l3];
4348 if (i == 0) {
4349 return -1;
4350 }
4351 return i;
4352}
4353
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004354/* Lookup the character ch in the mapping. If the character
4355 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004356 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004357static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004358{
Christian Heimes217cfd12007-12-02 14:31:20 +00004359 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004360 PyObject *x;
4361
4362 if (w == NULL)
4363 return NULL;
4364 x = PyObject_GetItem(mapping, w);
4365 Py_DECREF(w);
4366 if (x == NULL) {
4367 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4368 /* No mapping found means: mapping is undefined. */
4369 PyErr_Clear();
4370 x = Py_None;
4371 Py_INCREF(x);
4372 return x;
4373 } else
4374 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004375 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004376 else if (x == Py_None)
4377 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00004378 else if (PyLong_Check(x)) {
4379 long value = PyLong_AS_LONG(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004380 if (value < 0 || value > 255) {
4381 PyErr_SetString(PyExc_TypeError,
4382 "character mapping must be in range(256)");
4383 Py_DECREF(x);
4384 return NULL;
4385 }
4386 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004387 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004388 else if (PyString_Check(x))
4389 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004390 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004391 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00004392 PyErr_Format(PyExc_TypeError,
Christian Heimesf3863112007-11-22 07:46:41 +00004393 "character mapping must return integer, bytes or None, not %.400s",
Walter Dörwald580ceed2007-05-09 10:39:19 +00004394 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004395 Py_DECREF(x);
4396 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004397 }
4398}
4399
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004400static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00004401charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004402{
Guido van Rossum98297ee2007-11-06 21:34:58 +00004403 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004404 /* exponentially overallocate to minimize reallocations */
4405 if (requiredsize < 2*outsize)
4406 requiredsize = 2*outsize;
Guido van Rossum98297ee2007-11-06 21:34:58 +00004407 if (_PyString_Resize(outobj, requiredsize))
Walter Dörwald827b0552007-05-12 13:23:53 +00004408 return -1;
Walter Dörwald827b0552007-05-12 13:23:53 +00004409 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004410}
4411
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* the lookup or an output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004415/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00004416 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004417 space is available. Return a new reference to the object that
4418 was put in the output buffer, or Py_None, if the mapping was undefined
4419 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004420 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004421static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004422charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004423 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004424{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004425 PyObject *rep;
4426 char *outstart;
Guido van Rossum98297ee2007-11-06 21:34:58 +00004427 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004428
Christian Heimes90aa7642007-12-19 02:45:37 +00004429 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004430 int res = encoding_map_lookup(c, mapping);
4431 Py_ssize_t requiredsize = *outpos+1;
4432 if (res == -1)
4433 return enc_FAILED;
Guido van Rossum98297ee2007-11-06 21:34:58 +00004434 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004435 if (charmapencode_resize(outobj, outpos, requiredsize))
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004436 return enc_EXCEPTION;
Guido van Rossum98297ee2007-11-06 21:34:58 +00004437 outstart = PyString_AS_STRING(*outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004438 outstart[(*outpos)++] = (char)res;
4439 return enc_SUCCESS;
4440 }
4441
4442 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004443 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004444 return enc_EXCEPTION;
4445 else if (rep==Py_None) {
4446 Py_DECREF(rep);
4447 return enc_FAILED;
4448 } else {
Christian Heimes217cfd12007-12-02 14:31:20 +00004449 if (PyLong_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004450 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004451 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004452 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004453 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004454 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004455 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004456 outstart = PyString_AS_STRING(*outobj);
Christian Heimes217cfd12007-12-02 14:31:20 +00004457 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004458 }
4459 else {
4460 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004461 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4462 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004463 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004464 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004465 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004466 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004467 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004468 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004469 memcpy(outstart + *outpos, repchars, repsize);
4470 *outpos += repsize;
4471 }
4472 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004473 Py_DECREF(rep);
4474 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004475}
4476
4477/* handle an error in PyUnicode_EncodeCharmap
4478 Return 0 on success, -1 on error */
4479static
4480int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004481 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004482 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004483 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004484 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004485{
4486 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004487 Py_ssize_t repsize;
4488 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004489 Py_UNICODE *uni2;
4490 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004491 Py_ssize_t collstartpos = *inpos;
4492 Py_ssize_t collendpos = *inpos+1;
4493 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004494 char *encoding = "charmap";
4495 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004496 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004497
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004498 /* find all unencodable characters */
4499 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004500 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00004501 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004502 int res = encoding_map_lookup(p[collendpos], mapping);
4503 if (res != -1)
4504 break;
4505 ++collendpos;
4506 continue;
4507 }
4508
4509 rep = charmapencode_lookup(p[collendpos], mapping);
4510 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004511 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004512 else if (rep!=Py_None) {
4513 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004514 break;
4515 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004516 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004517 ++collendpos;
4518 }
4519 /* cache callback name lookup
4520 * (if not done yet, i.e. it's the first error) */
4521 if (*known_errorHandler==-1) {
4522 if ((errors==NULL) || (!strcmp(errors, "strict")))
4523 *known_errorHandler = 1;
4524 else if (!strcmp(errors, "replace"))
4525 *known_errorHandler = 2;
4526 else if (!strcmp(errors, "ignore"))
4527 *known_errorHandler = 3;
4528 else if (!strcmp(errors, "xmlcharrefreplace"))
4529 *known_errorHandler = 4;
4530 else
4531 *known_errorHandler = 0;
4532 }
4533 switch (*known_errorHandler) {
4534 case 1: /* strict */
4535 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4536 return -1;
4537 case 2: /* replace */
4538 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4539 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004540 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004541 return -1;
4542 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004543 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004544 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4545 return -1;
4546 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004547 }
4548 /* fall through */
4549 case 3: /* ignore */
4550 *inpos = collendpos;
4551 break;
4552 case 4: /* xmlcharrefreplace */
4553 /* generate replacement (temporarily (mis)uses p) */
4554 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4555 char buffer[2+29+1+1];
4556 char *cp;
4557 sprintf(buffer, "&#%d;", (int)p[collpos]);
4558 for (cp = buffer; *cp; ++cp) {
4559 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004560 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004561 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004562 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004563 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4564 return -1;
4565 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004566 }
4567 }
4568 *inpos = collendpos;
4569 break;
4570 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004571 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004572 encoding, reason, p, size, exceptionObject,
4573 collstartpos, collendpos, &newpos);
4574 if (repunicode == NULL)
4575 return -1;
4576 /* generate replacement */
4577 repsize = PyUnicode_GET_SIZE(repunicode);
4578 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4579 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004580 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004581 return -1;
4582 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004583 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004584 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004585 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4586 return -1;
4587 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004588 }
4589 *inpos = newpos;
4590 Py_DECREF(repunicode);
4591 }
4592 return 0;
4593}
4594
Guido van Rossumd57fd912000-03-10 22:53:23 +00004595PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004596 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004597 PyObject *mapping,
4598 const char *errors)
4599{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004600 /* output object */
4601 PyObject *res = NULL;
4602 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004603 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004604 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004605 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004606 PyObject *errorHandler = NULL;
4607 PyObject *exc = NULL;
4608 /* the following variable is used for caching string comparisons
4609 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4610 * 3=ignore, 4=xmlcharrefreplace */
4611 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004612
4613 /* Default to Latin-1 */
4614 if (mapping == NULL)
4615 return PyUnicode_EncodeLatin1(p, size, errors);
4616
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004617 /* allocate enough for a simple encoding without
4618 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004619 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004620 if (res == NULL)
4621 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004622 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004623 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004624
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004625 while (inpos<size) {
4626 /* try to encode it */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004627 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004628 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004629 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004630 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004631 if (charmap_encoding_error(p, size, &inpos, mapping,
4632 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004633 &known_errorHandler, &errorHandler, errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004634 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004635 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004636 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004637 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004638 else
4639 /* done with this character => adjust input position */
4640 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004641 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004642
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004643 /* Resize if we allocated to much */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004644 if (respos<PyString_GET_SIZE(res))
4645 _PyString_Resize(&res, respos);
4646
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004647 Py_XDECREF(exc);
4648 Py_XDECREF(errorHandler);
4649 return res;
4650
4651 onError:
4652 Py_XDECREF(res);
4653 Py_XDECREF(exc);
4654 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004655 return NULL;
4656}
4657
4658PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4659 PyObject *mapping)
4660{
4661 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4662 PyErr_BadArgument();
4663 return NULL;
4664 }
4665 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4666 PyUnicode_GET_SIZE(unicode),
4667 mapping,
4668 NULL);
4669}
4670
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004671/* create or adjust a UnicodeTranslateError */
4672static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004673 const Py_UNICODE *unicode, Py_ssize_t size,
4674 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004675 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004676{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004677 if (*exceptionObject == NULL) {
4678 *exceptionObject = PyUnicodeTranslateError_Create(
4679 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004680 }
4681 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004682 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4683 goto onError;
4684 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4685 goto onError;
4686 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4687 goto onError;
4688 return;
4689 onError:
4690 Py_DECREF(*exceptionObject);
4691 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004692 }
4693}
4694
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004695/* raises a UnicodeTranslateError */
4696static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004697 const Py_UNICODE *unicode, Py_ssize_t size,
4698 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004699 const char *reason)
4700{
4701 make_translate_exception(exceptionObject,
4702 unicode, size, startpos, endpos, reason);
4703 if (*exceptionObject != NULL)
4704 PyCodec_StrictErrors(*exceptionObject);
4705}
4706
4707/* error handling callback helper:
4708 build arguments, call the callback and check the arguments,
4709 put the result into newpos and return the replacement string, which
4710 has to be freed by the caller */
4711static PyObject *unicode_translate_call_errorhandler(const char *errors,
4712 PyObject **errorHandler,
4713 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004714 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4715 Py_ssize_t startpos, Py_ssize_t endpos,
4716 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004717{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004718 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004719
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004720 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004721 PyObject *restuple;
4722 PyObject *resunicode;
4723
4724 if (*errorHandler == NULL) {
4725 *errorHandler = PyCodec_LookupError(errors);
4726 if (*errorHandler == NULL)
4727 return NULL;
4728 }
4729
4730 make_translate_exception(exceptionObject,
4731 unicode, size, startpos, endpos, reason);
4732 if (*exceptionObject == NULL)
4733 return NULL;
4734
4735 restuple = PyObject_CallFunctionObjArgs(
4736 *errorHandler, *exceptionObject, NULL);
4737 if (restuple == NULL)
4738 return NULL;
4739 if (!PyTuple_Check(restuple)) {
4740 PyErr_Format(PyExc_TypeError, &argparse[4]);
4741 Py_DECREF(restuple);
4742 return NULL;
4743 }
4744 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004745 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004746 Py_DECREF(restuple);
4747 return NULL;
4748 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004749 if (i_newpos<0)
4750 *newpos = size+i_newpos;
4751 else
4752 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004753 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004754 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004755 Py_DECREF(restuple);
4756 return NULL;
4757 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004758 Py_INCREF(resunicode);
4759 Py_DECREF(restuple);
4760 return resunicode;
4761}
4762
4763/* Lookup the character ch in the mapping and put the result in result,
4764 which must be decrefed by the caller.
4765 Return 0 on success, -1 on error */
4766static
4767int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4768{
Christian Heimes217cfd12007-12-02 14:31:20 +00004769 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004770 PyObject *x;
4771
4772 if (w == NULL)
4773 return -1;
4774 x = PyObject_GetItem(mapping, w);
4775 Py_DECREF(w);
4776 if (x == NULL) {
4777 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4778 /* No mapping found means: use 1:1 mapping. */
4779 PyErr_Clear();
4780 *result = NULL;
4781 return 0;
4782 } else
4783 return -1;
4784 }
4785 else if (x == Py_None) {
4786 *result = x;
4787 return 0;
4788 }
Christian Heimes217cfd12007-12-02 14:31:20 +00004789 else if (PyLong_Check(x)) {
4790 long value = PyLong_AS_LONG(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004791 long max = PyUnicode_GetMax();
4792 if (value < 0 || value > max) {
4793 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00004794 "character mapping must be in range(0x%x)", max+1);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004795 Py_DECREF(x);
4796 return -1;
4797 }
4798 *result = x;
4799 return 0;
4800 }
4801 else if (PyUnicode_Check(x)) {
4802 *result = x;
4803 return 0;
4804 }
4805 else {
4806 /* wrong return value */
4807 PyErr_SetString(PyExc_TypeError,
4808 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004809 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004810 return -1;
4811 }
4812}
4813/* ensure that *outobj is at least requiredsize characters long,
4814if not reallocate and adjust various state variables.
4815Return 0 on success, -1 on error */
4816static
Walter Dörwald4894c302003-10-24 14:25:28 +00004817int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004818 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004819{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004820 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004821 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004822 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004823 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004824 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004825 if (requiredsize < 2 * oldsize)
4826 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004827 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004828 return -1;
4829 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004830 }
4831 return 0;
4832}
4833/* lookup the character, put the result in the output string and adjust
4834 various state variables. Return a new reference to the object that
4835 was put in the output buffer in *result, or Py_None, if the mapping was
4836 undefined (in which case no character was written).
4837 The called must decref result.
4838 Return 0 on success, -1 on error. */
4839static
Walter Dörwald4894c302003-10-24 14:25:28 +00004840int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004841 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004842 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004843{
Walter Dörwald4894c302003-10-24 14:25:28 +00004844 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004845 return -1;
4846 if (*res==NULL) {
4847 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004848 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004849 }
4850 else if (*res==Py_None)
4851 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00004852 else if (PyLong_Check(*res)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004853 /* no overflow check, because we know that the space is enough */
Christian Heimes217cfd12007-12-02 14:31:20 +00004854 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004855 }
4856 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004857 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004858 if (repsize==1) {
4859 /* no overflow check, because we know that the space is enough */
4860 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4861 }
4862 else if (repsize!=0) {
4863 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004864 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004865 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004866 repsize - 1;
4867 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004868 return -1;
4869 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4870 *outp += repsize;
4871 }
4872 }
4873 else
4874 return -1;
4875 return 0;
4876}
4877
4878PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004879 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004880 PyObject *mapping,
4881 const char *errors)
4882{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004883 /* output object */
4884 PyObject *res = NULL;
4885 /* pointers to the beginning and end+1 of input */
4886 const Py_UNICODE *startp = p;
4887 const Py_UNICODE *endp = p + size;
4888 /* pointer into the output */
4889 Py_UNICODE *str;
4890 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004891 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004892 char *reason = "character maps to <undefined>";
4893 PyObject *errorHandler = NULL;
4894 PyObject *exc = NULL;
4895 /* the following variable is used for caching string comparisons
4896 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4897 * 3=ignore, 4=xmlcharrefreplace */
4898 int known_errorHandler = -1;
4899
Guido van Rossumd57fd912000-03-10 22:53:23 +00004900 if (mapping == NULL) {
4901 PyErr_BadArgument();
4902 return NULL;
4903 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004904
4905 /* allocate enough for a simple 1:1 translation without
4906 replacements, if we need more, we'll resize */
4907 res = PyUnicode_FromUnicode(NULL, size);
4908 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004909 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004910 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004911 return res;
4912 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004913
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004914 while (p<endp) {
4915 /* try to encode it */
4916 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004917 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004918 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004919 goto onError;
4920 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004921 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004922 if (x!=Py_None) /* it worked => adjust input pointer */
4923 ++p;
4924 else { /* untranslatable character */
4925 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004926 Py_ssize_t repsize;
4927 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004928 Py_UNICODE *uni2;
4929 /* startpos for collecting untranslatable chars */
4930 const Py_UNICODE *collstart = p;
4931 const Py_UNICODE *collend = p+1;
4932 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004933
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004934 /* find all untranslatable characters */
4935 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004936 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004937 goto onError;
4938 Py_XDECREF(x);
4939 if (x!=Py_None)
4940 break;
4941 ++collend;
4942 }
4943 /* cache callback name lookup
4944 * (if not done yet, i.e. it's the first error) */
4945 if (known_errorHandler==-1) {
4946 if ((errors==NULL) || (!strcmp(errors, "strict")))
4947 known_errorHandler = 1;
4948 else if (!strcmp(errors, "replace"))
4949 known_errorHandler = 2;
4950 else if (!strcmp(errors, "ignore"))
4951 known_errorHandler = 3;
4952 else if (!strcmp(errors, "xmlcharrefreplace"))
4953 known_errorHandler = 4;
4954 else
4955 known_errorHandler = 0;
4956 }
4957 switch (known_errorHandler) {
4958 case 1: /* strict */
4959 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4960 goto onError;
4961 case 2: /* replace */
4962 /* No need to check for space, this is a 1:1 replacement */
4963 for (coll = collstart; coll<collend; ++coll)
4964 *str++ = '?';
4965 /* fall through */
4966 case 3: /* ignore */
4967 p = collend;
4968 break;
4969 case 4: /* xmlcharrefreplace */
4970 /* generate replacement (temporarily (mis)uses p) */
4971 for (p = collstart; p < collend; ++p) {
4972 char buffer[2+29+1+1];
4973 char *cp;
4974 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004975 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004976 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4977 goto onError;
4978 for (cp = buffer; *cp; ++cp)
4979 *str++ = *cp;
4980 }
4981 p = collend;
4982 break;
4983 default:
4984 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4985 reason, startp, size, &exc,
4986 collstart-startp, collend-startp, &newpos);
4987 if (repunicode == NULL)
4988 goto onError;
4989 /* generate replacement */
4990 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004991 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004992 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4993 Py_DECREF(repunicode);
4994 goto onError;
4995 }
4996 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4997 *str++ = *uni2;
4998 p = startp + newpos;
4999 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005000 }
5001 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005002 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005003 /* Resize if we allocated to much */
5004 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005005 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00005006 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005007 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005008 }
5009 Py_XDECREF(exc);
5010 Py_XDECREF(errorHandler);
5011 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005012
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005013 onError:
5014 Py_XDECREF(res);
5015 Py_XDECREF(exc);
5016 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005017 return NULL;
5018}
5019
5020PyObject *PyUnicode_Translate(PyObject *str,
5021 PyObject *mapping,
5022 const char *errors)
5023{
5024 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005025
Guido van Rossumd57fd912000-03-10 22:53:23 +00005026 str = PyUnicode_FromObject(str);
5027 if (str == NULL)
5028 goto onError;
5029 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
5030 PyUnicode_GET_SIZE(str),
5031 mapping,
5032 errors);
5033 Py_DECREF(str);
5034 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005035
Guido van Rossumd57fd912000-03-10 22:53:23 +00005036 onError:
5037 Py_XDECREF(str);
5038 return NULL;
5039}
Tim Petersced69f82003-09-16 20:30:58 +00005040
Guido van Rossum9e896b32000-04-05 20:11:21 +00005041/* --- Decimal Encoder ---------------------------------------------------- */
5042
5043int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005044 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00005045 char *output,
5046 const char *errors)
5047{
5048 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005049 PyObject *errorHandler = NULL;
5050 PyObject *exc = NULL;
5051 const char *encoding = "decimal";
5052 const char *reason = "invalid decimal Unicode string";
5053 /* the following variable is used for caching string comparisons
5054 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5055 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005056
5057 if (output == NULL) {
5058 PyErr_BadArgument();
5059 return -1;
5060 }
5061
5062 p = s;
5063 end = s + length;
5064 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005065 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005066 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005067 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005068 Py_ssize_t repsize;
5069 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005070 Py_UNICODE *uni2;
5071 Py_UNICODE *collstart;
5072 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005073
Guido van Rossum9e896b32000-04-05 20:11:21 +00005074 if (Py_UNICODE_ISSPACE(ch)) {
5075 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005076 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005077 continue;
5078 }
5079 decimal = Py_UNICODE_TODECIMAL(ch);
5080 if (decimal >= 0) {
5081 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005082 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005083 continue;
5084 }
Guido van Rossumba477042000-04-06 18:18:10 +00005085 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00005086 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005087 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005088 continue;
5089 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005090 /* All other characters are considered unencodable */
5091 collstart = p;
5092 collend = p+1;
5093 while (collend < end) {
5094 if ((0 < *collend && *collend < 256) ||
5095 !Py_UNICODE_ISSPACE(*collend) ||
5096 Py_UNICODE_TODECIMAL(*collend))
5097 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005098 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005099 /* cache callback name lookup
5100 * (if not done yet, i.e. it's the first error) */
5101 if (known_errorHandler==-1) {
5102 if ((errors==NULL) || (!strcmp(errors, "strict")))
5103 known_errorHandler = 1;
5104 else if (!strcmp(errors, "replace"))
5105 known_errorHandler = 2;
5106 else if (!strcmp(errors, "ignore"))
5107 known_errorHandler = 3;
5108 else if (!strcmp(errors, "xmlcharrefreplace"))
5109 known_errorHandler = 4;
5110 else
5111 known_errorHandler = 0;
5112 }
5113 switch (known_errorHandler) {
5114 case 1: /* strict */
5115 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5116 goto onError;
5117 case 2: /* replace */
5118 for (p = collstart; p < collend; ++p)
5119 *output++ = '?';
5120 /* fall through */
5121 case 3: /* ignore */
5122 p = collend;
5123 break;
5124 case 4: /* xmlcharrefreplace */
5125 /* generate replacement (temporarily (mis)uses p) */
5126 for (p = collstart; p < collend; ++p)
5127 output += sprintf(output, "&#%d;", (int)*p);
5128 p = collend;
5129 break;
5130 default:
5131 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5132 encoding, reason, s, length, &exc,
5133 collstart-s, collend-s, &newpos);
5134 if (repunicode == NULL)
5135 goto onError;
5136 /* generate replacement */
5137 repsize = PyUnicode_GET_SIZE(repunicode);
5138 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5139 Py_UNICODE ch = *uni2;
5140 if (Py_UNICODE_ISSPACE(ch))
5141 *output++ = ' ';
5142 else {
5143 decimal = Py_UNICODE_TODECIMAL(ch);
5144 if (decimal >= 0)
5145 *output++ = '0' + decimal;
5146 else if (0 < ch && ch < 256)
5147 *output++ = (char)ch;
5148 else {
5149 Py_DECREF(repunicode);
5150 raise_encode_exception(&exc, encoding,
5151 s, length, collstart-s, collend-s, reason);
5152 goto onError;
5153 }
5154 }
5155 }
5156 p = s + newpos;
5157 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005158 }
5159 }
5160 /* 0-terminate the output string */
5161 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005162 Py_XDECREF(exc);
5163 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005164 return 0;
5165
5166 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005167 Py_XDECREF(exc);
5168 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005169 return -1;
5170}
5171
Guido van Rossumd57fd912000-03-10 22:53:23 +00005172/* --- Helpers ------------------------------------------------------------ */
5173
Eric Smith8c663262007-08-25 02:26:07 +00005174#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005175#include "stringlib/fastsearch.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005176#include "stringlib/count.h"
Christian Heimes9cd17752007-11-18 19:35:23 +00005177/* Include _ParseTupleFinds from find.h */
5178#define FROM_UNICODE
Thomas Wouters477c8d52006-05-27 19:21:47 +00005179#include "stringlib/find.h"
5180#include "stringlib/partition.h"
5181
/* Helper macro clamping slice bounds to an object's length.
 *
 * Operates on the variables named `start` and `end` in the enclosing
 * scope and reads (obj)->length.  A negative `start` is taken relative to
 * the length and then floored at 0.  An `end` beyond the length is cut to
 * the length; a negative `end` is taken relative to the length and then
 * floored at 0.
 *
 * The expansion is a sequence of statements, so use it only as a
 * free-standing statement (not as the unbraced body of an if/else). */
#define FIX_START_END(obj)                      \
    if (start < 0) {                            \
        start += (obj)->length;                 \
        if (start < 0)                          \
            start = 0;                          \
    }                                           \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    else if (end < 0) {                         \
        end += (obj)->length;                   \
        if (end < 0)                            \
            end = 0;                            \
    }
5194
Martin v. Löwis18e16552006-02-15 17:27:45 +00005195Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005196 PyObject *substr,
5197 Py_ssize_t start,
5198 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005199{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005200 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005201 PyUnicodeObject* str_obj;
5202 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005203
Thomas Wouters477c8d52006-05-27 19:21:47 +00005204 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5205 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005206 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005207 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5208 if (!sub_obj) {
5209 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005210 return -1;
5211 }
Tim Petersced69f82003-09-16 20:30:58 +00005212
Thomas Wouters477c8d52006-05-27 19:21:47 +00005213 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005214
Thomas Wouters477c8d52006-05-27 19:21:47 +00005215 result = stringlib_count(
5216 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5217 );
5218
5219 Py_DECREF(sub_obj);
5220 Py_DECREF(str_obj);
5221
Guido van Rossumd57fd912000-03-10 22:53:23 +00005222 return result;
5223}
5224
Martin v. Löwis18e16552006-02-15 17:27:45 +00005225Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005226 PyObject *sub,
5227 Py_ssize_t start,
5228 Py_ssize_t end,
5229 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005230{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005231 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005232
Guido van Rossumd57fd912000-03-10 22:53:23 +00005233 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005234 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005235 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005236 sub = PyUnicode_FromObject(sub);
5237 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00005238 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005239 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005240 }
Tim Petersced69f82003-09-16 20:30:58 +00005241
Thomas Wouters477c8d52006-05-27 19:21:47 +00005242 if (direction > 0)
5243 result = stringlib_find_slice(
5244 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5245 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5246 start, end
5247 );
5248 else
5249 result = stringlib_rfind_slice(
5250 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5251 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5252 start, end
5253 );
5254
Guido van Rossumd57fd912000-03-10 22:53:23 +00005255 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005256 Py_DECREF(sub);
5257
Guido van Rossumd57fd912000-03-10 22:53:23 +00005258 return result;
5259}
5260
Tim Petersced69f82003-09-16 20:30:58 +00005261static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005262int tailmatch(PyUnicodeObject *self,
5263 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005264 Py_ssize_t start,
5265 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005266 int direction)
5267{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005268 if (substring->length == 0)
5269 return 1;
5270
Thomas Wouters477c8d52006-05-27 19:21:47 +00005271 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005272
5273 end -= substring->length;
5274 if (end < start)
5275 return 0;
5276
5277 if (direction > 0) {
5278 if (Py_UNICODE_MATCH(self, end, substring))
5279 return 1;
5280 } else {
5281 if (Py_UNICODE_MATCH(self, start, substring))
5282 return 1;
5283 }
5284
5285 return 0;
5286}
5287
Martin v. Löwis18e16552006-02-15 17:27:45 +00005288Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005289 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005290 Py_ssize_t start,
5291 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005292 int direction)
5293{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005294 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005295
Guido van Rossumd57fd912000-03-10 22:53:23 +00005296 str = PyUnicode_FromObject(str);
5297 if (str == NULL)
5298 return -1;
5299 substr = PyUnicode_FromObject(substr);
5300 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005301 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005302 return -1;
5303 }
Tim Petersced69f82003-09-16 20:30:58 +00005304
Guido van Rossumd57fd912000-03-10 22:53:23 +00005305 result = tailmatch((PyUnicodeObject *)str,
5306 (PyUnicodeObject *)substr,
5307 start, end, direction);
5308 Py_DECREF(str);
5309 Py_DECREF(substr);
5310 return result;
5311}
5312
Guido van Rossumd57fd912000-03-10 22:53:23 +00005313/* Apply fixfct filter to the Unicode object self and return a
5314 reference to the modified object */
5315
Tim Petersced69f82003-09-16 20:30:58 +00005316static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005317PyObject *fixup(PyUnicodeObject *self,
5318 int (*fixfct)(PyUnicodeObject *s))
5319{
5320
5321 PyUnicodeObject *u;
5322
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005323 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005324 if (u == NULL)
5325 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005326
5327 Py_UNICODE_COPY(u->str, self->str, self->length);
5328
Tim Peters7a29bd52001-09-12 03:03:31 +00005329 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005330 /* fixfct should return TRUE if it modified the buffer. If
5331 FALSE, return a reference to the original buffer instead
5332 (to save space, not time) */
5333 Py_INCREF(self);
5334 Py_DECREF(u);
5335 return (PyObject*) self;
5336 }
5337 return (PyObject*) u;
5338}
5339
Tim Petersced69f82003-09-16 20:30:58 +00005340static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005341int fixupper(PyUnicodeObject *self)
5342{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005343 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005344 Py_UNICODE *s = self->str;
5345 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005346
Guido van Rossumd57fd912000-03-10 22:53:23 +00005347 while (len-- > 0) {
5348 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005349
Guido van Rossumd57fd912000-03-10 22:53:23 +00005350 ch = Py_UNICODE_TOUPPER(*s);
5351 if (ch != *s) {
5352 status = 1;
5353 *s = ch;
5354 }
5355 s++;
5356 }
5357
5358 return status;
5359}
5360
Tim Petersced69f82003-09-16 20:30:58 +00005361static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005362int fixlower(PyUnicodeObject *self)
5363{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005364 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005365 Py_UNICODE *s = self->str;
5366 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005367
Guido van Rossumd57fd912000-03-10 22:53:23 +00005368 while (len-- > 0) {
5369 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005370
Guido van Rossumd57fd912000-03-10 22:53:23 +00005371 ch = Py_UNICODE_TOLOWER(*s);
5372 if (ch != *s) {
5373 status = 1;
5374 *s = ch;
5375 }
5376 s++;
5377 }
5378
5379 return status;
5380}
5381
Tim Petersced69f82003-09-16 20:30:58 +00005382static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005383int fixswapcase(PyUnicodeObject *self)
5384{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005385 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005386 Py_UNICODE *s = self->str;
5387 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005388
Guido van Rossumd57fd912000-03-10 22:53:23 +00005389 while (len-- > 0) {
5390 if (Py_UNICODE_ISUPPER(*s)) {
5391 *s = Py_UNICODE_TOLOWER(*s);
5392 status = 1;
5393 } else if (Py_UNICODE_ISLOWER(*s)) {
5394 *s = Py_UNICODE_TOUPPER(*s);
5395 status = 1;
5396 }
5397 s++;
5398 }
5399
5400 return status;
5401}
5402
Tim Petersced69f82003-09-16 20:30:58 +00005403static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005404int fixcapitalize(PyUnicodeObject *self)
5405{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005406 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005407 Py_UNICODE *s = self->str;
5408 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005409
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005410 if (len == 0)
5411 return 0;
5412 if (Py_UNICODE_ISLOWER(*s)) {
5413 *s = Py_UNICODE_TOUPPER(*s);
5414 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005415 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005416 s++;
5417 while (--len > 0) {
5418 if (Py_UNICODE_ISUPPER(*s)) {
5419 *s = Py_UNICODE_TOLOWER(*s);
5420 status = 1;
5421 }
5422 s++;
5423 }
5424 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005425}
5426
5427static
5428int fixtitle(PyUnicodeObject *self)
5429{
5430 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5431 register Py_UNICODE *e;
5432 int previous_is_cased;
5433
5434 /* Shortcut for single character strings */
5435 if (PyUnicode_GET_SIZE(self) == 1) {
5436 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5437 if (*p != ch) {
5438 *p = ch;
5439 return 1;
5440 }
5441 else
5442 return 0;
5443 }
Tim Petersced69f82003-09-16 20:30:58 +00005444
Guido van Rossumd57fd912000-03-10 22:53:23 +00005445 e = p + PyUnicode_GET_SIZE(self);
5446 previous_is_cased = 0;
5447 for (; p < e; p++) {
5448 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005449
Guido van Rossumd57fd912000-03-10 22:53:23 +00005450 if (previous_is_cased)
5451 *p = Py_UNICODE_TOLOWER(ch);
5452 else
5453 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005454
5455 if (Py_UNICODE_ISLOWER(ch) ||
5456 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005457 Py_UNICODE_ISTITLE(ch))
5458 previous_is_cased = 1;
5459 else
5460 previous_is_cased = 0;
5461 }
5462 return 1;
5463}
5464
Tim Peters8ce9f162004-08-27 01:49:32 +00005465PyObject *
5466PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005467{
Tim Peters8ce9f162004-08-27 01:49:32 +00005468 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005469 const Py_UNICODE blank = ' ';
5470 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005471 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005472 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005473 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5474 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005475 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5476 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005477 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005478 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005479 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005480
Tim Peters05eba1f2004-08-27 21:32:02 +00005481 fseq = PySequence_Fast(seq, "");
5482 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005483 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005484 }
5485
Tim Peters91879ab2004-08-27 22:35:44 +00005486 /* Grrrr. A codec may be invoked to convert str objects to
5487 * Unicode, and so it's possible to call back into Python code
5488 * during PyUnicode_FromObject(), and so it's possible for a sick
5489 * codec to change the size of fseq (if seq is a list). Therefore
5490 * we have to keep refetching the size -- can't assume seqlen
5491 * is invariant.
5492 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005493 seqlen = PySequence_Fast_GET_SIZE(fseq);
5494 /* If empty sequence, return u"". */
5495 if (seqlen == 0) {
5496 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5497 goto Done;
5498 }
5499 /* If singleton sequence with an exact Unicode, return that. */
5500 if (seqlen == 1) {
5501 item = PySequence_Fast_GET_ITEM(fseq, 0);
5502 if (PyUnicode_CheckExact(item)) {
5503 Py_INCREF(item);
5504 res = (PyUnicodeObject *)item;
5505 goto Done;
5506 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005507 }
5508
Tim Peters05eba1f2004-08-27 21:32:02 +00005509 /* At least two items to join, or one that isn't exact Unicode. */
5510 if (seqlen > 1) {
5511 /* Set up sep and seplen -- they're needed. */
5512 if (separator == NULL) {
5513 sep = &blank;
5514 seplen = 1;
5515 }
5516 else {
5517 internal_separator = PyUnicode_FromObject(separator);
5518 if (internal_separator == NULL)
5519 goto onError;
5520 sep = PyUnicode_AS_UNICODE(internal_separator);
5521 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00005522 /* In case PyUnicode_FromObject() mutated seq. */
5523 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005524 }
5525 }
5526
5527 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005528 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005529 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005530 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005531 res_p = PyUnicode_AS_UNICODE(res);
5532 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005533
Tim Peters05eba1f2004-08-27 21:32:02 +00005534 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005535 Py_ssize_t itemlen;
5536 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005537
5538 item = PySequence_Fast_GET_ITEM(fseq, i);
5539 /* Convert item to Unicode. */
Guido van Rossum98297ee2007-11-06 21:34:58 +00005540 if (!PyUnicode_Check(item)) {
5541 PyErr_Format(PyExc_TypeError,
5542 "sequence item %zd: expected str instance,"
5543 " %.80s found",
Christian Heimes90aa7642007-12-19 02:45:37 +00005544 i, Py_TYPE(item)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005545 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005546 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005547 item = PyUnicode_FromObject(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005548 if (item == NULL)
5549 goto onError;
5550 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005551
Tim Peters91879ab2004-08-27 22:35:44 +00005552 /* In case PyUnicode_FromObject() mutated seq. */
5553 seqlen = PySequence_Fast_GET_SIZE(fseq);
5554
Tim Peters8ce9f162004-08-27 01:49:32 +00005555 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005556 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005557 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005558 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005559 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005560 if (i < seqlen - 1) {
5561 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005562 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00005563 goto Overflow;
5564 }
5565 if (new_res_used > res_alloc) {
5566 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00005567 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00005568 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005569 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005570 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005571 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005572 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005573 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005574 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005575 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005576 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005577 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005578
5579 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005580 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005581 res_p += itemlen;
5582 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005583 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005584 res_p += seplen;
5585 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005586 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005587 res_used = new_res_used;
5588 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005589
Tim Peters05eba1f2004-08-27 21:32:02 +00005590 /* Shrink res to match the used area; this probably can't fail,
5591 * but it's cheap to check.
5592 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005593 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005594 goto onError;
5595
5596 Done:
5597 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005598 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005599 return (PyObject *)res;
5600
Tim Peters8ce9f162004-08-27 01:49:32 +00005601 Overflow:
5602 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005603 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005604 Py_DECREF(item);
5605 /* fall through */
5606
Guido van Rossumd57fd912000-03-10 22:53:23 +00005607 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005608 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005609 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005610 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005611 return NULL;
5612}
5613
Tim Petersced69f82003-09-16 20:30:58 +00005614static
5615PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005616 Py_ssize_t left,
5617 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005618 Py_UNICODE fill)
5619{
5620 PyUnicodeObject *u;
5621
5622 if (left < 0)
5623 left = 0;
5624 if (right < 0)
5625 right = 0;
5626
Tim Peters7a29bd52001-09-12 03:03:31 +00005627 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005628 Py_INCREF(self);
5629 return self;
5630 }
5631
5632 u = _PyUnicode_New(left + self->length + right);
5633 if (u) {
5634 if (left)
5635 Py_UNICODE_FILL(u->str, fill, left);
5636 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5637 if (right)
5638 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5639 }
5640
5641 return u;
5642}
5643
/* Append data[left:right] to `list` as a new unicode object.

   Relies on names from the enclosing function: a `PyObject *str`
   scratch variable, the `list` being built, and an `onError` label that
   is jumped to on failure.  Wrapped in do { } while (0) so that it
   behaves as a single statement; invoke it with a trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
5654
5655static
5656PyObject *split_whitespace(PyUnicodeObject *self,
5657 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005658 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005659{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005660 register Py_ssize_t i;
5661 register Py_ssize_t j;
5662 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005663 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005664 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005665
5666 for (i = j = 0; i < len; ) {
5667 /* find a token */
Christian Heimes190d79e2008-01-30 11:58:22 +00005668 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005669 i++;
5670 j = i;
Christian Heimes190d79e2008-01-30 11:58:22 +00005671 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005672 i++;
5673 if (j < i) {
5674 if (maxcount-- <= 0)
5675 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005676 SPLIT_APPEND(buf, j, i);
5677 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005678 i++;
5679 j = i;
5680 }
5681 }
5682 if (j < len) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005683 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005684 }
5685 return list;
5686
5687 onError:
5688 Py_DECREF(list);
5689 return NULL;
5690}
5691
5692PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005693 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005694{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005695 register Py_ssize_t i;
5696 register Py_ssize_t j;
5697 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005698 PyObject *list;
5699 PyObject *str;
5700 Py_UNICODE *data;
5701
5702 string = PyUnicode_FromObject(string);
5703 if (string == NULL)
5704 return NULL;
5705 data = PyUnicode_AS_UNICODE(string);
5706 len = PyUnicode_GET_SIZE(string);
5707
Guido van Rossumd57fd912000-03-10 22:53:23 +00005708 list = PyList_New(0);
5709 if (!list)
5710 goto onError;
5711
5712 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005713 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005714
Guido van Rossumd57fd912000-03-10 22:53:23 +00005715 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005716 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005717 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005718
5719 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005720 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005721 if (i < len) {
5722 if (data[i] == '\r' && i + 1 < len &&
5723 data[i+1] == '\n')
5724 i += 2;
5725 else
5726 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005727 if (keepends)
5728 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005729 }
Guido van Rossum86662912000-04-11 15:38:46 +00005730 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005731 j = i;
5732 }
5733 if (j < len) {
5734 SPLIT_APPEND(data, j, len);
5735 }
5736
5737 Py_DECREF(string);
5738 return list;
5739
5740 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005741 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005742 Py_DECREF(string);
5743 return NULL;
5744}
5745
Tim Petersced69f82003-09-16 20:30:58 +00005746static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005747PyObject *split_char(PyUnicodeObject *self,
5748 PyObject *list,
5749 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005750 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005751{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005752 register Py_ssize_t i;
5753 register Py_ssize_t j;
5754 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005755 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005756 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005757
5758 for (i = j = 0; i < len; ) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005759 if (buf[i] == ch) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005760 if (maxcount-- <= 0)
5761 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005762 SPLIT_APPEND(buf, j, i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005763 i = j = i + 1;
5764 } else
5765 i++;
5766 }
5767 if (j <= len) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005768 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005769 }
5770 return list;
5771
5772 onError:
5773 Py_DECREF(list);
5774 return NULL;
5775}
5776
Tim Petersced69f82003-09-16 20:30:58 +00005777static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005778PyObject *split_substring(PyUnicodeObject *self,
5779 PyObject *list,
5780 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005781 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005782{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005783 register Py_ssize_t i;
5784 register Py_ssize_t j;
5785 Py_ssize_t len = self->length;
5786 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005787 PyObject *str;
5788
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005789 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005790 if (Py_UNICODE_MATCH(self, i, substring)) {
5791 if (maxcount-- <= 0)
5792 break;
5793 SPLIT_APPEND(self->str, j, i);
5794 i = j = i + sublen;
5795 } else
5796 i++;
5797 }
5798 if (j <= len) {
5799 SPLIT_APPEND(self->str, j, len);
5800 }
5801 return list;
5802
5803 onError:
5804 Py_DECREF(list);
5805 return NULL;
5806}
5807
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005808static
5809PyObject *rsplit_whitespace(PyUnicodeObject *self,
5810 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005811 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005812{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005813 register Py_ssize_t i;
5814 register Py_ssize_t j;
5815 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005816 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005817 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005818
5819 for (i = j = len - 1; i >= 0; ) {
5820 /* find a token */
Christian Heimes190d79e2008-01-30 11:58:22 +00005821 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005822 i--;
5823 j = i;
Christian Heimes190d79e2008-01-30 11:58:22 +00005824 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005825 i--;
5826 if (j > i) {
5827 if (maxcount-- <= 0)
5828 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005829 SPLIT_APPEND(buf, i + 1, j + 1);
5830 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005831 i--;
5832 j = i;
5833 }
5834 }
5835 if (j >= 0) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005836 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005837 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005838 if (PyList_Reverse(list) < 0)
5839 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005840 return list;
5841
5842 onError:
5843 Py_DECREF(list);
5844 return NULL;
5845}
5846
5847static
5848PyObject *rsplit_char(PyUnicodeObject *self,
5849 PyObject *list,
5850 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005851 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005852{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005853 register Py_ssize_t i;
5854 register Py_ssize_t j;
5855 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005856 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005857 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005858
5859 for (i = j = len - 1; i >= 0; ) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005860 if (buf[i] == ch) {
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005861 if (maxcount-- <= 0)
5862 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005863 SPLIT_APPEND(buf, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005864 j = i = i - 1;
5865 } else
5866 i--;
5867 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005868 if (j >= -1) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005869 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005870 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005871 if (PyList_Reverse(list) < 0)
5872 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005873 return list;
5874
5875 onError:
5876 Py_DECREF(list);
5877 return NULL;
5878}
5879
5880static
5881PyObject *rsplit_substring(PyUnicodeObject *self,
5882 PyObject *list,
5883 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005884 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005885{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005886 register Py_ssize_t i;
5887 register Py_ssize_t j;
5888 Py_ssize_t len = self->length;
5889 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005890 PyObject *str;
5891
5892 for (i = len - sublen, j = len; i >= 0; ) {
5893 if (Py_UNICODE_MATCH(self, i, substring)) {
5894 if (maxcount-- <= 0)
5895 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005896 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005897 j = i;
5898 i -= sublen;
5899 } else
5900 i--;
5901 }
5902 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005903 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005904 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005905 if (PyList_Reverse(list) < 0)
5906 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005907 return list;
5908
5909 onError:
5910 Py_DECREF(list);
5911 return NULL;
5912}
5913
Guido van Rossumd57fd912000-03-10 22:53:23 +00005914#undef SPLIT_APPEND
5915
5916static
5917PyObject *split(PyUnicodeObject *self,
5918 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005919 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005920{
5921 PyObject *list;
5922
5923 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005924 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005925
5926 list = PyList_New(0);
5927 if (!list)
5928 return NULL;
5929
5930 if (substring == NULL)
5931 return split_whitespace(self,list,maxcount);
5932
5933 else if (substring->length == 1)
5934 return split_char(self,list,substring->str[0],maxcount);
5935
5936 else if (substring->length == 0) {
5937 Py_DECREF(list);
5938 PyErr_SetString(PyExc_ValueError, "empty separator");
5939 return NULL;
5940 }
5941 else
5942 return split_substring(self,list,substring,maxcount);
5943}
5944
Tim Petersced69f82003-09-16 20:30:58 +00005945static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005946PyObject *rsplit(PyUnicodeObject *self,
5947 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005948 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005949{
5950 PyObject *list;
5951
5952 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005953 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005954
5955 list = PyList_New(0);
5956 if (!list)
5957 return NULL;
5958
5959 if (substring == NULL)
5960 return rsplit_whitespace(self,list,maxcount);
5961
5962 else if (substring->length == 1)
5963 return rsplit_char(self,list,substring->str[0],maxcount);
5964
5965 else if (substring->length == 0) {
5966 Py_DECREF(list);
5967 PyErr_SetString(PyExc_ValueError, "empty separator");
5968 return NULL;
5969 }
5970 else
5971 return rsplit_substring(self,list,substring,maxcount);
5972}
5973
5974static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005975PyObject *replace(PyUnicodeObject *self,
5976 PyUnicodeObject *str1,
5977 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005978 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005979{
5980 PyUnicodeObject *u;
5981
5982 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005983 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005984
Thomas Wouters477c8d52006-05-27 19:21:47 +00005985 if (str1->length == str2->length) {
5986 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005987 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005988 if (str1->length == 1) {
5989 /* replace characters */
5990 Py_UNICODE u1, u2;
5991 if (!findchar(self->str, self->length, str1->str[0]))
5992 goto nothing;
5993 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5994 if (!u)
5995 return NULL;
5996 Py_UNICODE_COPY(u->str, self->str, self->length);
5997 u1 = str1->str[0];
5998 u2 = str2->str[0];
5999 for (i = 0; i < u->length; i++)
6000 if (u->str[i] == u1) {
6001 if (--maxcount < 0)
6002 break;
6003 u->str[i] = u2;
6004 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006005 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006006 i = fastsearch(
6007 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00006008 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006009 if (i < 0)
6010 goto nothing;
6011 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6012 if (!u)
6013 return NULL;
6014 Py_UNICODE_COPY(u->str, self->str, self->length);
6015 while (i <= self->length - str1->length)
6016 if (Py_UNICODE_MATCH(self, i, str1)) {
6017 if (--maxcount < 0)
6018 break;
6019 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6020 i += str1->length;
6021 } else
6022 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006024 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006025
6026 Py_ssize_t n, i, j, e;
6027 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028 Py_UNICODE *p;
6029
6030 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006031 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006032 if (n > maxcount)
6033 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006034 if (n == 0)
6035 goto nothing;
6036 /* new_size = self->length + n * (str2->length - str1->length)); */
6037 delta = (str2->length - str1->length);
6038 if (delta == 0) {
6039 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006040 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006041 product = n * (str2->length - str1->length);
6042 if ((product / (str2->length - str1->length)) != n) {
6043 PyErr_SetString(PyExc_OverflowError,
6044 "replace string is too long");
6045 return NULL;
6046 }
6047 new_size = self->length + product;
6048 if (new_size < 0) {
6049 PyErr_SetString(PyExc_OverflowError,
6050 "replace string is too long");
6051 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006052 }
6053 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006054 u = _PyUnicode_New(new_size);
6055 if (!u)
6056 return NULL;
6057 i = 0;
6058 p = u->str;
6059 e = self->length - str1->length;
6060 if (str1->length > 0) {
6061 while (n-- > 0) {
6062 /* look for next match */
6063 j = i;
6064 while (j <= e) {
6065 if (Py_UNICODE_MATCH(self, j, str1))
6066 break;
6067 j++;
6068 }
6069 if (j > i) {
6070 if (j > e)
6071 break;
6072 /* copy unchanged part [i:j] */
6073 Py_UNICODE_COPY(p, self->str+i, j-i);
6074 p += j - i;
6075 }
6076 /* copy substitution string */
6077 if (str2->length > 0) {
6078 Py_UNICODE_COPY(p, str2->str, str2->length);
6079 p += str2->length;
6080 }
6081 i = j + str1->length;
6082 }
6083 if (i < self->length)
6084 /* copy tail [i:] */
6085 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6086 } else {
6087 /* interleave */
6088 while (n > 0) {
6089 Py_UNICODE_COPY(p, str2->str, str2->length);
6090 p += str2->length;
6091 if (--n <= 0)
6092 break;
6093 *p++ = self->str[i++];
6094 }
6095 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6096 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006097 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006098 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006099
6100nothing:
6101 /* nothing to replace; return original string (when possible) */
6102 if (PyUnicode_CheckExact(self)) {
6103 Py_INCREF(self);
6104 return (PyObject *) self;
6105 }
6106 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006107}
6108
6109/* --- Unicode Object Methods --------------------------------------------- */
6110
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006111PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006112"S.title() -> unicode\n\
6113\n\
6114Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006115characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006116
6117static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006118unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006119{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006120 return fixup(self, fixtitle);
6121}
6122
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006123PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006124"S.capitalize() -> unicode\n\
6125\n\
6126Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006127have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006128
6129static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006130unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006131{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006132 return fixup(self, fixcapitalize);
6133}
6134
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled method implementation: split self on whitespace, capitalize
   each word, and join the words with the default separator. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (!words)
        return NULL;

    /* Capitalize each word, swapping it into the list in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *old = PyList_GET_ITEM(words, idx);
        PyObject *fixed = fixup((PyUnicodeObject *)old, fixcapitalize);

        if (fixed == NULL)
            goto done;
        Py_DECREF(old);
        PyList_SET_ITEM(words, idx, fixed);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return result;
}
#endif
6172
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006173/* Argument converter. Coerces to a single unicode character */
6174
6175static int
6176convert_uc(PyObject *obj, void *addr)
6177{
6178 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6179 PyObject *uniobj;
6180 Py_UNICODE *unistr;
6181
6182 uniobj = PyUnicode_FromObject(obj);
6183 if (uniobj == NULL) {
6184 PyErr_SetString(PyExc_TypeError,
6185 "The fill character cannot be converted to Unicode");
6186 return 0;
6187 }
6188 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6189 PyErr_SetString(PyExc_TypeError,
6190 "The fill character must be exactly one character long");
6191 Py_DECREF(uniobj);
6192 return 0;
6193 }
6194 unistr = PyUnicode_AS_UNICODE(uniobj);
6195 *fillcharloc = unistr[0];
6196 Py_DECREF(uniobj);
6197 return 1;
6198}
6199
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006200PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006201"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006202\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006203Return S centered in a Unicode string of length width. Padding is\n\
6204done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006205
6206static PyObject *
6207unicode_center(PyUnicodeObject *self, PyObject *args)
6208{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006209 Py_ssize_t marg, left;
6210 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006211 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006212
Thomas Woutersde017742006-02-16 19:34:37 +00006213 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006214 return NULL;
6215
Tim Peters7a29bd52001-09-12 03:03:31 +00006216 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006217 Py_INCREF(self);
6218 return (PyObject*) self;
6219 }
6220
6221 marg = width - self->length;
6222 left = marg / 2 + (marg & width & 1);
6223
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006224 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006225}
6226
Marc-André Lemburge5034372000-08-08 08:04:29 +00006227#if 0
6228
6229/* This code should go into some future Unicode collation support
6230 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006231 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006232
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006233/* speedy UTF-16 code point order comparison */
6234/* gleaned from: */
6235/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6236
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006237static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006238{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006239 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006240 0, 0, 0, 0, 0, 0, 0, 0,
6241 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006242 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006243};
6244
Guido van Rossumd57fd912000-03-10 22:53:23 +00006245static int
6246unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6247{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006248 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006249
Guido van Rossumd57fd912000-03-10 22:53:23 +00006250 Py_UNICODE *s1 = str1->str;
6251 Py_UNICODE *s2 = str2->str;
6252
6253 len1 = str1->length;
6254 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006255
Guido van Rossumd57fd912000-03-10 22:53:23 +00006256 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006257 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006258
6259 c1 = *s1++;
6260 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006261
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006262 if (c1 > (1<<11) * 26)
6263 c1 += utf16Fixup[c1>>11];
6264 if (c2 > (1<<11) * 26)
6265 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006266 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006267
6268 if (c1 != c2)
6269 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006270
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006271 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006272 }
6273
6274 return (len1 < len2) ? -1 : (len1 != len2);
6275}
6276
Marc-André Lemburge5034372000-08-08 08:04:29 +00006277#else
6278
6279static int
6280unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6281{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006282 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006283
6284 Py_UNICODE *s1 = str1->str;
6285 Py_UNICODE *s2 = str2->str;
6286
6287 len1 = str1->length;
6288 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006289
Marc-André Lemburge5034372000-08-08 08:04:29 +00006290 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006291 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006292
Fredrik Lundh45714e92001-06-26 16:39:36 +00006293 c1 = *s1++;
6294 c2 = *s2++;
6295
6296 if (c1 != c2)
6297 return (c1 < c2) ? -1 : 1;
6298
Marc-André Lemburge5034372000-08-08 08:04:29 +00006299 len1--; len2--;
6300 }
6301
6302 return (len1 < len2) ? -1 : (len1 != len2);
6303}
6304
6305#endif
6306
Guido van Rossumd57fd912000-03-10 22:53:23 +00006307int PyUnicode_Compare(PyObject *left,
6308 PyObject *right)
6309{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006310 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6311 return unicode_compare((PyUnicodeObject *)left,
6312 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006313 PyErr_Format(PyExc_TypeError,
6314 "Can't compare %.100s and %.100s",
6315 left->ob_type->tp_name,
6316 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006317 return -1;
6318}
6319
Martin v. Löwis5b222132007-06-10 09:51:05 +00006320int
6321PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6322{
6323 int i;
6324 Py_UNICODE *id;
6325 assert(PyUnicode_Check(uni));
6326 id = PyUnicode_AS_UNICODE(uni);
6327 /* Compare Unicode string and source character set string */
6328 for (i = 0; id[i] && str[i]; i++)
6329 if (id[i] != str[i])
6330 return ((int)id[i] < (int)str[i]) ? -1 : 1;
6331 if (id[i])
6332 return 1; /* uni is longer */
6333 if (str[i])
6334 return -1; /* str is longer */
6335 return 0;
6336}
6337
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006338PyObject *PyUnicode_RichCompare(PyObject *left,
6339 PyObject *right,
6340 int op)
6341{
6342 int result;
6343
6344 result = PyUnicode_Compare(left, right);
6345 if (result == -1 && PyErr_Occurred())
6346 goto onError;
6347
6348 /* Convert the return value to a Boolean */
6349 switch (op) {
6350 case Py_EQ:
6351 result = (result == 0);
6352 break;
6353 case Py_NE:
6354 result = (result != 0);
6355 break;
6356 case Py_LE:
6357 result = (result <= 0);
6358 break;
6359 case Py_GE:
6360 result = (result >= 0);
6361 break;
6362 case Py_LT:
6363 result = (result == -1);
6364 break;
6365 case Py_GT:
6366 result = (result == 1);
6367 break;
6368 }
6369 return PyBool_FromLong(result);
6370
6371 onError:
6372
6373 /* Standard case
6374
6375 Type errors mean that PyUnicode_FromObject() could not convert
6376 one of the arguments (usually the right hand side) to Unicode,
6377 ie. we can't handle the comparison request. However, it is
6378 possible that the other object knows a comparison method, which
6379 is why we return Py_NotImplemented to give the other object a
6380 chance.
6381
6382 */
6383 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6384 PyErr_Clear();
6385 Py_INCREF(Py_NotImplemented);
6386 return Py_NotImplemented;
6387 }
6388 if (op != Py_EQ && op != Py_NE)
6389 return NULL;
6390
6391 /* Equality comparison.
6392
6393 This is a special case: we silence any PyExc_UnicodeDecodeError
6394 and instead turn it into a PyErr_UnicodeWarning.
6395
6396 */
6397 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6398 return NULL;
6399 PyErr_Clear();
Skip Montanaro46fc3372007-08-12 11:44:53 +00006400 if (PyErr_WarnEx(PyExc_UnicodeWarning,
6401 (op == Py_EQ) ?
6402 "Unicode equal comparison "
6403 "failed to convert both arguments to Unicode - "
6404 "interpreting them as being unequal"
6405 :
6406 "Unicode unequal comparison "
6407 "failed to convert both arguments to Unicode - "
6408 "interpreting them as being unequal",
6409 1) < 0)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006410 return NULL;
6411 result = (op == Py_NE);
6412 return PyBool_FromLong(result);
6413}
6414
Guido van Rossum403d68b2000-03-13 15:55:09 +00006415int PyUnicode_Contains(PyObject *container,
6416 PyObject *element)
6417{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006418 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006419 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006420
6421 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006422 sub = PyUnicode_FromObject(element);
6423 if (!sub) {
Walter Dörwald26e0f512007-06-12 16:51:31 +00006424 PyErr_Format(PyExc_TypeError,
6425 "'in <string>' requires string as left operand, not %s",
6426 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006427 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006428 }
6429
Thomas Wouters477c8d52006-05-27 19:21:47 +00006430 str = PyUnicode_FromObject(container);
6431 if (!str) {
6432 Py_DECREF(sub);
6433 return -1;
6434 }
6435
6436 result = stringlib_contains_obj(str, sub);
6437
6438 Py_DECREF(str);
6439 Py_DECREF(sub);
6440
Guido van Rossum403d68b2000-03-13 15:55:09 +00006441 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006442}
6443
Guido van Rossumd57fd912000-03-10 22:53:23 +00006444/* Concat to string or Unicode object giving a new Unicode object. */
6445
6446PyObject *PyUnicode_Concat(PyObject *left,
6447 PyObject *right)
6448{
6449 PyUnicodeObject *u = NULL, *v = NULL, *w;
6450
6451 /* Coerce the two arguments */
6452 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6453 if (u == NULL)
6454 goto onError;
6455 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6456 if (v == NULL)
6457 goto onError;
6458
6459 /* Shortcuts */
6460 if (v == unicode_empty) {
6461 Py_DECREF(v);
6462 return (PyObject *)u;
6463 }
6464 if (u == unicode_empty) {
6465 Py_DECREF(u);
6466 return (PyObject *)v;
6467 }
6468
6469 /* Concat the two Unicode strings */
6470 w = _PyUnicode_New(u->length + v->length);
6471 if (w == NULL)
6472 goto onError;
6473 Py_UNICODE_COPY(w->str, u->str, u->length);
6474 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6475
6476 Py_DECREF(u);
6477 Py_DECREF(v);
6478 return (PyObject *)w;
6479
6480onError:
6481 Py_XDECREF(u);
6482 Py_XDECREF(v);
6483 return NULL;
6484}
6485
Walter Dörwald1ab83302007-05-18 17:15:44 +00006486void
6487PyUnicode_Append(PyObject **pleft, PyObject *right)
6488{
6489 PyObject *new;
6490 if (*pleft == NULL)
6491 return;
6492 if (right == NULL || !PyUnicode_Check(*pleft)) {
6493 Py_DECREF(*pleft);
6494 *pleft = NULL;
6495 return;
6496 }
6497 new = PyUnicode_Concat(*pleft, right);
6498 Py_DECREF(*pleft);
6499 *pleft = new;
6500}
6501
6502void
6503PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6504{
6505 PyUnicode_Append(pleft, right);
6506 Py_XDECREF(right);
6507}
6508
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006509PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006510"S.count(sub[, start[, end]]) -> int\n\
6511\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006512Return the number of non-overlapping occurrences of substring sub in\n\
6513Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006514interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006515
6516static PyObject *
6517unicode_count(PyUnicodeObject *self, PyObject *args)
6518{
6519 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006520 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006521 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006522 PyObject *result;
6523
Guido van Rossumb8872e62000-05-09 14:14:27 +00006524 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6525 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006526 return NULL;
6527
6528 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006529 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006530 if (substring == NULL)
6531 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006532
Thomas Wouters477c8d52006-05-27 19:21:47 +00006533 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006534
Christian Heimes217cfd12007-12-02 14:31:20 +00006535 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006536 stringlib_count(self->str + start, end - start,
6537 substring->str, substring->length)
6538 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006539
6540 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006541
Guido van Rossumd57fd912000-03-10 22:53:23 +00006542 return result;
6543}
6544
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006545PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006546"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006547\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006548Encodes S using the codec registered for encoding. encoding defaults\n\
6549to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006550handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006551a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6552'xmlcharrefreplace' as well as any other name registered with\n\
6553codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006554
6555static PyObject *
6556unicode_encode(PyUnicodeObject *self, PyObject *args)
6557{
6558 char *encoding = NULL;
6559 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006560 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00006561
Guido van Rossumd57fd912000-03-10 22:53:23 +00006562 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6563 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006564 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006565 if (v == NULL)
6566 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00006567 if (!PyString_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006568 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006569 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006570 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00006571 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006572 Py_DECREF(v);
6573 return NULL;
6574 }
6575 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006576
6577 onError:
6578 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006579}
6580
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006581PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006582"S.expandtabs([tabsize]) -> unicode\n\
6583\n\
6584Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006585If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006586
6587static PyObject*
6588unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6589{
6590 Py_UNICODE *e;
6591 Py_UNICODE *p;
6592 Py_UNICODE *q;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006593 Py_ssize_t i, j, old_j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006594 PyUnicodeObject *u;
6595 int tabsize = 8;
6596
6597 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6598 return NULL;
6599
Thomas Wouters7e474022000-07-16 12:04:32 +00006600 /* First pass: determine size of output string */
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006601 i = j = old_j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006602 e = self->str + self->length;
6603 for (p = self->str; p < e; p++)
6604 if (*p == '\t') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006605 if (tabsize > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006606 j += tabsize - (j % tabsize);
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006607 if (old_j > j) {
6608 PyErr_SetString(PyExc_OverflowError,
6609 "new string is too long");
6610 return NULL;
6611 }
6612 old_j = j;
6613 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006614 }
6615 else {
6616 j++;
6617 if (*p == '\n' || *p == '\r') {
6618 i += j;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006619 old_j = j = 0;
6620 if (i < 0) {
6621 PyErr_SetString(PyExc_OverflowError,
6622 "new string is too long");
6623 return NULL;
6624 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006625 }
6626 }
6627
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006628 if ((i + j) < 0) {
6629 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6630 return NULL;
6631 }
6632
Guido van Rossumd57fd912000-03-10 22:53:23 +00006633 /* Second pass: create output string and fill it */
6634 u = _PyUnicode_New(i + j);
6635 if (!u)
6636 return NULL;
6637
6638 j = 0;
6639 q = u->str;
6640
6641 for (p = self->str; p < e; p++)
6642 if (*p == '\t') {
6643 if (tabsize > 0) {
6644 i = tabsize - (j % tabsize);
6645 j += i;
6646 while (i--)
6647 *q++ = ' ';
6648 }
6649 }
6650 else {
6651 j++;
6652 *q++ = *p;
6653 if (*p == '\n' || *p == '\r')
6654 j = 0;
6655 }
6656
6657 return (PyObject*) u;
6658}
6659
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006660PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006661"S.find(sub [,start [,end]]) -> int\n\
6662\n\
6663Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00006664such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006665arguments start and end are interpreted as in slice notation.\n\
6666\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006667Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006668
6669static PyObject *
6670unicode_find(PyUnicodeObject *self, PyObject *args)
6671{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006672 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00006673 Py_ssize_t start;
6674 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006675 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006676
Christian Heimes9cd17752007-11-18 19:35:23 +00006677 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006678 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006679
Thomas Wouters477c8d52006-05-27 19:21:47 +00006680 result = stringlib_find_slice(
6681 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6682 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6683 start, end
6684 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006685
6686 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006687
Christian Heimes217cfd12007-12-02 14:31:20 +00006688 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006689}
6690
6691static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006692unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006693{
6694 if (index < 0 || index >= self->length) {
6695 PyErr_SetString(PyExc_IndexError, "string index out of range");
6696 return NULL;
6697 }
6698
6699 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6700}
6701
Guido van Rossumc2504932007-09-18 19:42:40 +00006702/* Believe it or not, this produces the same value for ASCII strings
6703 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006704static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00006705unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006706{
Guido van Rossumc2504932007-09-18 19:42:40 +00006707 Py_ssize_t len;
6708 Py_UNICODE *p;
6709 long x;
6710
6711 if (self->hash != -1)
6712 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00006713 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00006714 p = self->str;
6715 x = *p << 7;
6716 while (--len >= 0)
6717 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00006718 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00006719 if (x == -1)
6720 x = -2;
6721 self->hash = x;
6722 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006723}
6724
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006725PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006726"S.index(sub [,start [,end]]) -> int\n\
6727\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006728Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006729
6730static PyObject *
6731unicode_index(PyUnicodeObject *self, PyObject *args)
6732{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006733 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006734 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00006735 Py_ssize_t start;
6736 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006737
Christian Heimes9cd17752007-11-18 19:35:23 +00006738 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006739 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006740
Thomas Wouters477c8d52006-05-27 19:21:47 +00006741 result = stringlib_find_slice(
6742 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6743 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6744 start, end
6745 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006746
6747 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006748
Guido van Rossumd57fd912000-03-10 22:53:23 +00006749 if (result < 0) {
6750 PyErr_SetString(PyExc_ValueError, "substring not found");
6751 return NULL;
6752 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006753
Christian Heimes217cfd12007-12-02 14:31:20 +00006754 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006755}
6756
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006757PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006758"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006759\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006760Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006761at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006762
6763static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006764unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006765{
6766 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6767 register const Py_UNICODE *e;
6768 int cased;
6769
Guido van Rossumd57fd912000-03-10 22:53:23 +00006770 /* Shortcut for single character strings */
6771 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006772 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006773
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006774 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006775 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006776 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006777
Guido van Rossumd57fd912000-03-10 22:53:23 +00006778 e = p + PyUnicode_GET_SIZE(self);
6779 cased = 0;
6780 for (; p < e; p++) {
6781 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006782
Guido van Rossumd57fd912000-03-10 22:53:23 +00006783 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006784 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006785 else if (!cased && Py_UNICODE_ISLOWER(ch))
6786 cased = 1;
6787 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006788 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006789}
6790
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006791PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006792"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006793\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006794Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006795at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006796
6797static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006798unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006799{
6800 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6801 register const Py_UNICODE *e;
6802 int cased;
6803
Guido van Rossumd57fd912000-03-10 22:53:23 +00006804 /* Shortcut for single character strings */
6805 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006806 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006807
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006808 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006809 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006810 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006811
Guido van Rossumd57fd912000-03-10 22:53:23 +00006812 e = p + PyUnicode_GET_SIZE(self);
6813 cased = 0;
6814 for (; p < e; p++) {
6815 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006816
Guido van Rossumd57fd912000-03-10 22:53:23 +00006817 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006818 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006819 else if (!cased && Py_UNICODE_ISUPPER(ch))
6820 cased = 1;
6821 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006822 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006823}
6824
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006825PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006826"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006827\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006828Return True if S is a titlecased string and there is at least one\n\
6829character in S, i.e. upper- and titlecase characters may only\n\
6830follow uncased characters and lowercase characters only cased ones.\n\
6831Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006832
6833static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006834unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006835{
6836 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6837 register const Py_UNICODE *e;
6838 int cased, previous_is_cased;
6839
Guido van Rossumd57fd912000-03-10 22:53:23 +00006840 /* Shortcut for single character strings */
6841 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006842 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6843 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006844
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006845 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006846 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006847 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006848
Guido van Rossumd57fd912000-03-10 22:53:23 +00006849 e = p + PyUnicode_GET_SIZE(self);
6850 cased = 0;
6851 previous_is_cased = 0;
6852 for (; p < e; p++) {
6853 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006854
Guido van Rossumd57fd912000-03-10 22:53:23 +00006855 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6856 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006857 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006858 previous_is_cased = 1;
6859 cased = 1;
6860 }
6861 else if (Py_UNICODE_ISLOWER(ch)) {
6862 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006863 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006864 previous_is_cased = 1;
6865 cased = 1;
6866 }
6867 else
6868 previous_is_cased = 0;
6869 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006870 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006871}
6872
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006873PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006874"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006875\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006876Return True if all characters in S are whitespace\n\
6877and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006878
6879static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006880unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006881{
6882 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6883 register const Py_UNICODE *e;
6884
Guido van Rossumd57fd912000-03-10 22:53:23 +00006885 /* Shortcut for single character strings */
6886 if (PyUnicode_GET_SIZE(self) == 1 &&
6887 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006888 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006889
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006890 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006891 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006892 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006893
Guido van Rossumd57fd912000-03-10 22:53:23 +00006894 e = p + PyUnicode_GET_SIZE(self);
6895 for (; p < e; p++) {
6896 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006897 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006898 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006899 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006900}
6901
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006902PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006903"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006904\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006905Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006906and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006907
6908static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006909unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006910{
6911 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6912 register const Py_UNICODE *e;
6913
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006914 /* Shortcut for single character strings */
6915 if (PyUnicode_GET_SIZE(self) == 1 &&
6916 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006917 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006918
6919 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006920 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006921 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006922
6923 e = p + PyUnicode_GET_SIZE(self);
6924 for (; p < e; p++) {
6925 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006926 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006927 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006928 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006929}
6930
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006931PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006932"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006933\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006934Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006935and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006936
6937static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006938unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006939{
6940 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6941 register const Py_UNICODE *e;
6942
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006943 /* Shortcut for single character strings */
6944 if (PyUnicode_GET_SIZE(self) == 1 &&
6945 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006946 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006947
6948 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006949 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006950 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006951
6952 e = p + PyUnicode_GET_SIZE(self);
6953 for (; p < e; p++) {
6954 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006955 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006956 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006957 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006958}
6959
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006960PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006961"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006962\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006963Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006964False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006965
6966static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006967unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006968{
6969 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6970 register const Py_UNICODE *e;
6971
Guido van Rossumd57fd912000-03-10 22:53:23 +00006972 /* Shortcut for single character strings */
6973 if (PyUnicode_GET_SIZE(self) == 1 &&
6974 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006975 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006976
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006977 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006978 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006979 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006980
Guido van Rossumd57fd912000-03-10 22:53:23 +00006981 e = p + PyUnicode_GET_SIZE(self);
6982 for (; p < e; p++) {
6983 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006984 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006985 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006986 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006987}
6988
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006989PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006990"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006991\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006992Return True if all characters in S are digits\n\
6993and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006994
6995static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006996unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006997{
6998 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6999 register const Py_UNICODE *e;
7000
Guido van Rossumd57fd912000-03-10 22:53:23 +00007001 /* Shortcut for single character strings */
7002 if (PyUnicode_GET_SIZE(self) == 1 &&
7003 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007004 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007005
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007006 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007007 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007008 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007009
Guido van Rossumd57fd912000-03-10 22:53:23 +00007010 e = p + PyUnicode_GET_SIZE(self);
7011 for (; p < e; p++) {
7012 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007013 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007014 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007015 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007016}
7017
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007018PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007019"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007020\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007021Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007022False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007023
7024static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007025unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007026{
7027 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7028 register const Py_UNICODE *e;
7029
Guido van Rossumd57fd912000-03-10 22:53:23 +00007030 /* Shortcut for single character strings */
7031 if (PyUnicode_GET_SIZE(self) == 1 &&
7032 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007033 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007034
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007035 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007036 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007037 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007038
Guido van Rossumd57fd912000-03-10 22:53:23 +00007039 e = p + PyUnicode_GET_SIZE(self);
7040 for (; p < e; p++) {
7041 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007042 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007043 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007044 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007045}
7046
Martin v. Löwis47383402007-08-15 07:32:56 +00007047int
7048PyUnicode_IsIdentifier(PyObject *self)
7049{
7050 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7051 register const Py_UNICODE *e;
7052
7053 /* Special case for empty strings */
7054 if (PyUnicode_GET_SIZE(self) == 0)
7055 return 0;
7056
7057 /* PEP 3131 says that the first character must be in
7058 XID_Start and subsequent characters in XID_Continue,
7059 and for the ASCII range, the 2.x rules apply (i.e
7060 start with letters and underscore, continue with
7061 letters, digits, underscore). However, given the current
7062 definition of XID_Start and XID_Continue, it is sufficient
7063 to check just for these, except that _ must be allowed
7064 as starting an identifier. */
7065 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7066 return 0;
7067
7068 e = p + PyUnicode_GET_SIZE(self);
7069 for (p++; p < e; p++) {
7070 if (!_PyUnicode_IsXidContinue(*p))
7071 return 0;
7072 }
7073 return 1;
7074}
7075
7076PyDoc_STRVAR(isidentifier__doc__,
7077"S.isidentifier() -> bool\n\
7078\n\
7079Return True if S is a valid identifier according\n\
7080to the language definition.");
7081
7082static PyObject*
7083unicode_isidentifier(PyObject *self)
7084{
7085 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7086}
7087
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007088PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007089"S.join(sequence) -> unicode\n\
7090\n\
7091Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007092sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007093
7094static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007095unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007096{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007097 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007098}
7099
Martin v. Löwis18e16552006-02-15 17:27:45 +00007100static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007101unicode_length(PyUnicodeObject *self)
7102{
7103 return self->length;
7104}
7105
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007106PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00007107"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007108\n\
7109Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007110done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007111
7112static PyObject *
7113unicode_ljust(PyUnicodeObject *self, PyObject *args)
7114{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007115 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007116 Py_UNICODE fillchar = ' ';
7117
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007118 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007119 return NULL;
7120
Tim Peters7a29bd52001-09-12 03:03:31 +00007121 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007122 Py_INCREF(self);
7123 return (PyObject*) self;
7124 }
7125
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007126 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007127}
7128
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007129PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007130"S.lower() -> unicode\n\
7131\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007132Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007133
7134static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007135unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007136{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007137 return fixup(self, fixlower);
7138}
7139
/* Strip modes.  Each value doubles as an index into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, indexed by the strip modes above */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for mode i: the format string minus its 3-character
   "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)

7148
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007149/* externally visible for str.strip(unicode) */
7150PyObject *
7151_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7152{
7153 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007154 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007155 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007156 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7157 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007158
Thomas Wouters477c8d52006-05-27 19:21:47 +00007159 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7160
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007161 i = 0;
7162 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007163 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7164 i++;
7165 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007166 }
7167
7168 j = len;
7169 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007170 do {
7171 j--;
7172 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7173 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007174 }
7175
7176 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007177 Py_INCREF(self);
7178 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007179 }
7180 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00007181 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007182}
7183
Guido van Rossumd57fd912000-03-10 22:53:23 +00007184
7185static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007186do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007187{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007188 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007189 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007190
7191 i = 0;
7192 if (striptype != RIGHTSTRIP) {
7193 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7194 i++;
7195 }
7196 }
7197
7198 j = len;
7199 if (striptype != LEFTSTRIP) {
7200 do {
7201 j--;
7202 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7203 j++;
7204 }
7205
7206 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7207 Py_INCREF(self);
7208 return (PyObject*)self;
7209 }
7210 else
7211 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007212}
7213
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007214
7215static PyObject *
7216do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7217{
7218 PyObject *sep = NULL;
7219
7220 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7221 return NULL;
7222
7223 if (sep != NULL && sep != Py_None) {
7224 if (PyUnicode_Check(sep))
7225 return _PyUnicode_XStrip(self, striptype, sep);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007226 else {
7227 PyErr_Format(PyExc_TypeError,
7228 "%s arg must be None, unicode or str",
7229 STRIPNAME(striptype));
7230 return NULL;
7231 }
7232 }
7233
7234 return do_strip(self, striptype);
7235}
7236
7237
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007238PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007239"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007240\n\
7241Return a copy of the string S with leading and trailing\n\
7242whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007243If chars is given and not None, remove characters in chars instead.\n\
7244If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007245
7246static PyObject *
7247unicode_strip(PyUnicodeObject *self, PyObject *args)
7248{
7249 if (PyTuple_GET_SIZE(args) == 0)
7250 return do_strip(self, BOTHSTRIP); /* Common case */
7251 else
7252 return do_argstrip(self, BOTHSTRIP, args);
7253}
7254
7255
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007256PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007257"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007258\n\
7259Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007260If chars is given and not None, remove characters in chars instead.\n\
7261If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007262
7263static PyObject *
7264unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7265{
7266 if (PyTuple_GET_SIZE(args) == 0)
7267 return do_strip(self, LEFTSTRIP); /* Common case */
7268 else
7269 return do_argstrip(self, LEFTSTRIP, args);
7270}
7271
7272
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007273PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007274"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007275\n\
7276Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007277If chars is given and not None, remove characters in chars instead.\n\
7278If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007279
7280static PyObject *
7281unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7282{
7283 if (PyTuple_GET_SIZE(args) == 0)
7284 return do_strip(self, RIGHTSTRIP); /* Common case */
7285 else
7286 return do_argstrip(self, RIGHTSTRIP, args);
7287}
7288
7289
Guido van Rossumd57fd912000-03-10 22:53:23 +00007290static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007291unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007292{
7293 PyUnicodeObject *u;
7294 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007295 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007296 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007297
7298 if (len < 0)
7299 len = 0;
7300
Tim Peters7a29bd52001-09-12 03:03:31 +00007301 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007302 /* no repeat, return original string */
7303 Py_INCREF(str);
7304 return (PyObject*) str;
7305 }
Tim Peters8f422462000-09-09 06:13:41 +00007306
7307 /* ensure # of chars needed doesn't overflow int and # of bytes
7308 * needed doesn't overflow size_t
7309 */
7310 nchars = len * str->length;
7311 if (len && nchars / len != str->length) {
7312 PyErr_SetString(PyExc_OverflowError,
7313 "repeated string is too long");
7314 return NULL;
7315 }
7316 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7317 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7318 PyErr_SetString(PyExc_OverflowError,
7319 "repeated string is too long");
7320 return NULL;
7321 }
7322 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007323 if (!u)
7324 return NULL;
7325
7326 p = u->str;
7327
Thomas Wouters477c8d52006-05-27 19:21:47 +00007328 if (str->length == 1 && len > 0) {
7329 Py_UNICODE_FILL(p, str->str[0], len);
7330 } else {
7331 Py_ssize_t done = 0; /* number of characters copied this far */
7332 if (done < nchars) {
7333 Py_UNICODE_COPY(p, str->str, str->length);
7334 done = str->length;
7335 }
7336 while (done < nchars) {
7337 int n = (done <= nchars-done) ? done : nchars-done;
7338 Py_UNICODE_COPY(p+done, p, n);
7339 done += n;
7340 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007341 }
7342
7343 return (PyObject*) u;
7344}
7345
7346PyObject *PyUnicode_Replace(PyObject *obj,
7347 PyObject *subobj,
7348 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007349 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007350{
7351 PyObject *self;
7352 PyObject *str1;
7353 PyObject *str2;
7354 PyObject *result;
7355
7356 self = PyUnicode_FromObject(obj);
7357 if (self == NULL)
7358 return NULL;
7359 str1 = PyUnicode_FromObject(subobj);
7360 if (str1 == NULL) {
7361 Py_DECREF(self);
7362 return NULL;
7363 }
7364 str2 = PyUnicode_FromObject(replobj);
7365 if (str2 == NULL) {
7366 Py_DECREF(self);
7367 Py_DECREF(str1);
7368 return NULL;
7369 }
Tim Petersced69f82003-09-16 20:30:58 +00007370 result = replace((PyUnicodeObject *)self,
7371 (PyUnicodeObject *)str1,
7372 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007373 maxcount);
7374 Py_DECREF(self);
7375 Py_DECREF(str1);
7376 Py_DECREF(str2);
7377 return result;
7378}
7379
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007380PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007381"S.replace (old, new[, maxsplit]) -> unicode\n\
7382\n\
7383Return a copy of S with all occurrences of substring\n\
7384old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007385given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007386
7387static PyObject*
7388unicode_replace(PyUnicodeObject *self, PyObject *args)
7389{
7390 PyUnicodeObject *str1;
7391 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007392 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007393 PyObject *result;
7394
Martin v. Löwis18e16552006-02-15 17:27:45 +00007395 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007396 return NULL;
7397 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7398 if (str1 == NULL)
7399 return NULL;
7400 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007401 if (str2 == NULL) {
7402 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007403 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007404 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007405
7406 result = replace(self, str1, str2, maxcount);
7407
7408 Py_DECREF(str1);
7409 Py_DECREF(str2);
7410 return result;
7411}
7412
7413static
7414PyObject *unicode_repr(PyObject *unicode)
7415{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007416 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007417 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007418 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7419 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7420
7421 /* XXX(nnorwitz): rather than over-allocating, it would be
7422 better to choose a different scheme. Perhaps scan the
7423 first N-chars of the string and allocate based on that size.
7424 */
7425 /* Initial allocation is based on the longest-possible unichr
7426 escape.
7427
7428 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7429 unichr, so in this case it's the longest unichr escape. In
7430 narrow (UTF-16) builds this is five chars per source unichr
7431 since there are two unichrs in the surrogate pair, so in narrow
7432 (UTF-16) builds it's not the longest unichr escape.
7433
7434 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7435 so in the narrow (UTF-16) build case it's the longest unichr
7436 escape.
7437 */
7438
Walter Dörwald1ab83302007-05-18 17:15:44 +00007439 repr = PyUnicode_FromUnicode(NULL,
Walter Dörwald79e913e2007-05-12 11:08:06 +00007440 2 /* quotes */
7441#ifdef Py_UNICODE_WIDE
7442 + 10*size
7443#else
7444 + 6*size
7445#endif
7446 + 1);
7447 if (repr == NULL)
7448 return NULL;
7449
Walter Dörwald1ab83302007-05-18 17:15:44 +00007450 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007451
7452 /* Add quote */
7453 *p++ = (findchar(s, size, '\'') &&
7454 !findchar(s, size, '"')) ? '"' : '\'';
7455 while (size-- > 0) {
7456 Py_UNICODE ch = *s++;
7457
7458 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007459 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007460 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007461 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007462 continue;
7463 }
7464
7465#ifdef Py_UNICODE_WIDE
7466 /* Map 21-bit characters to '\U00xxxxxx' */
7467 else if (ch >= 0x10000) {
7468 *p++ = '\\';
7469 *p++ = 'U';
7470 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
7471 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
7472 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
7473 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
7474 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
7475 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
7476 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
7477 *p++ = hexdigits[ch & 0x0000000F];
7478 continue;
7479 }
7480#else
7481 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
7482 else if (ch >= 0xD800 && ch < 0xDC00) {
7483 Py_UNICODE ch2;
7484 Py_UCS4 ucs;
7485
7486 ch2 = *s++;
7487 size--;
7488 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
7489 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
7490 *p++ = '\\';
7491 *p++ = 'U';
7492 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7493 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7494 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7495 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7496 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7497 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7498 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7499 *p++ = hexdigits[ucs & 0x0000000F];
7500 continue;
7501 }
7502 /* Fall through: isolated surrogates are copied as-is */
7503 s--;
7504 size++;
7505 }
7506#endif
7507
7508 /* Map 16-bit characters to '\uxxxx' */
7509 if (ch >= 256) {
7510 *p++ = '\\';
7511 *p++ = 'u';
7512 *p++ = hexdigits[(ch >> 12) & 0x000F];
7513 *p++ = hexdigits[(ch >> 8) & 0x000F];
7514 *p++ = hexdigits[(ch >> 4) & 0x000F];
7515 *p++ = hexdigits[ch & 0x000F];
7516 }
7517
7518 /* Map special whitespace to '\t', \n', '\r' */
7519 else if (ch == '\t') {
7520 *p++ = '\\';
7521 *p++ = 't';
7522 }
7523 else if (ch == '\n') {
7524 *p++ = '\\';
7525 *p++ = 'n';
7526 }
7527 else if (ch == '\r') {
7528 *p++ = '\\';
7529 *p++ = 'r';
7530 }
7531
7532 /* Map non-printable US ASCII to '\xhh' */
7533 else if (ch < ' ' || ch >= 0x7F) {
7534 *p++ = '\\';
7535 *p++ = 'x';
7536 *p++ = hexdigits[(ch >> 4) & 0x000F];
7537 *p++ = hexdigits[ch & 0x000F];
7538 }
7539
7540 /* Copy everything else as-is */
7541 else
7542 *p++ = (char) ch;
7543 }
7544 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007545 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00007546
7547 *p = '\0';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007548 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00007549 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007550}
7551
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007552PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007553"S.rfind(sub [,start [,end]]) -> int\n\
7554\n\
7555Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007556such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007557arguments start and end are interpreted as in slice notation.\n\
7558\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007559Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007560
7561static PyObject *
7562unicode_rfind(PyUnicodeObject *self, PyObject *args)
7563{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007564 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007565 Py_ssize_t start;
7566 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007567 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007568
Christian Heimes9cd17752007-11-18 19:35:23 +00007569 if (!_ParseTupleFinds(args, &substring, &start, &end))
7570 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007571
Thomas Wouters477c8d52006-05-27 19:21:47 +00007572 result = stringlib_rfind_slice(
7573 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7574 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7575 start, end
7576 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007577
7578 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007579
Christian Heimes217cfd12007-12-02 14:31:20 +00007580 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007581}
7582
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007583PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007584"S.rindex(sub [,start [,end]]) -> int\n\
7585\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007586Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007587
7588static PyObject *
7589unicode_rindex(PyUnicodeObject *self, PyObject *args)
7590{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007591 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007592 Py_ssize_t start;
7593 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007594 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007595
Christian Heimes9cd17752007-11-18 19:35:23 +00007596 if (!_ParseTupleFinds(args, &substring, &start, &end))
7597 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007598
Thomas Wouters477c8d52006-05-27 19:21:47 +00007599 result = stringlib_rfind_slice(
7600 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7601 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7602 start, end
7603 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007604
7605 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007606
Guido van Rossumd57fd912000-03-10 22:53:23 +00007607 if (result < 0) {
7608 PyErr_SetString(PyExc_ValueError, "substring not found");
7609 return NULL;
7610 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007611 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007612}
7613
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007614PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007615"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007616\n\
7617Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007618done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007619
7620static PyObject *
7621unicode_rjust(PyUnicodeObject *self, PyObject *args)
7622{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007623 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007624 Py_UNICODE fillchar = ' ';
7625
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007626 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007627 return NULL;
7628
Tim Peters7a29bd52001-09-12 03:03:31 +00007629 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007630 Py_INCREF(self);
7631 return (PyObject*) self;
7632 }
7633
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007634 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007635}
7636
Guido van Rossumd57fd912000-03-10 22:53:23 +00007637PyObject *PyUnicode_Split(PyObject *s,
7638 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007639 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007640{
7641 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007642
Guido van Rossumd57fd912000-03-10 22:53:23 +00007643 s = PyUnicode_FromObject(s);
7644 if (s == NULL)
7645 return NULL;
7646 if (sep != NULL) {
7647 sep = PyUnicode_FromObject(sep);
7648 if (sep == NULL) {
7649 Py_DECREF(s);
7650 return NULL;
7651 }
7652 }
7653
7654 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7655
7656 Py_DECREF(s);
7657 Py_XDECREF(sep);
7658 return result;
7659}
7660
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007661PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007662"S.split([sep [,maxsplit]]) -> list of strings\n\
7663\n\
7664Return a list of the words in S, using sep as the\n\
7665delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00007666splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00007667any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007668
7669static PyObject*
7670unicode_split(PyUnicodeObject *self, PyObject *args)
7671{
7672 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007673 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007674
Martin v. Löwis18e16552006-02-15 17:27:45 +00007675 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007676 return NULL;
7677
7678 if (substring == Py_None)
7679 return split(self, NULL, maxcount);
7680 else if (PyUnicode_Check(substring))
7681 return split(self, (PyUnicodeObject *)substring, maxcount);
7682 else
7683 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7684}
7685
Thomas Wouters477c8d52006-05-27 19:21:47 +00007686PyObject *
7687PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7688{
7689 PyObject* str_obj;
7690 PyObject* sep_obj;
7691 PyObject* out;
7692
7693 str_obj = PyUnicode_FromObject(str_in);
7694 if (!str_obj)
7695 return NULL;
7696 sep_obj = PyUnicode_FromObject(sep_in);
7697 if (!sep_obj) {
7698 Py_DECREF(str_obj);
7699 return NULL;
7700 }
7701
7702 out = stringlib_partition(
7703 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7704 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7705 );
7706
7707 Py_DECREF(sep_obj);
7708 Py_DECREF(str_obj);
7709
7710 return out;
7711}
7712
7713
7714PyObject *
7715PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7716{
7717 PyObject* str_obj;
7718 PyObject* sep_obj;
7719 PyObject* out;
7720
7721 str_obj = PyUnicode_FromObject(str_in);
7722 if (!str_obj)
7723 return NULL;
7724 sep_obj = PyUnicode_FromObject(sep_in);
7725 if (!sep_obj) {
7726 Py_DECREF(str_obj);
7727 return NULL;
7728 }
7729
7730 out = stringlib_rpartition(
7731 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7732 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7733 );
7734
7735 Py_DECREF(sep_obj);
7736 Py_DECREF(str_obj);
7737
7738 return out;
7739}
7740
7741PyDoc_STRVAR(partition__doc__,
7742"S.partition(sep) -> (head, sep, tail)\n\
7743\n\
7744Searches for the separator sep in S, and returns the part before it,\n\
7745the separator itself, and the part after it. If the separator is not\n\
7746found, returns S and two empty strings.");
7747
7748static PyObject*
7749unicode_partition(PyUnicodeObject *self, PyObject *separator)
7750{
7751 return PyUnicode_Partition((PyObject *)self, separator);
7752}
7753
7754PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007755"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007756\n\
7757Searches for the separator sep in S, starting at the end of S, and returns\n\
7758the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00007759separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007760
7761static PyObject*
7762unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7763{
7764 return PyUnicode_RPartition((PyObject *)self, separator);
7765}
7766
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007767PyObject *PyUnicode_RSplit(PyObject *s,
7768 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007769 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007770{
7771 PyObject *result;
7772
7773 s = PyUnicode_FromObject(s);
7774 if (s == NULL)
7775 return NULL;
7776 if (sep != NULL) {
7777 sep = PyUnicode_FromObject(sep);
7778 if (sep == NULL) {
7779 Py_DECREF(s);
7780 return NULL;
7781 }
7782 }
7783
7784 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7785
7786 Py_DECREF(s);
7787 Py_XDECREF(sep);
7788 return result;
7789}
7790
7791PyDoc_STRVAR(rsplit__doc__,
7792"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7793\n\
7794Return a list of the words in S, using sep as the\n\
7795delimiter string, starting at the end of the string and\n\
7796working to the front. If maxsplit is given, at most maxsplit\n\
7797splits are done. If sep is not specified, any whitespace string\n\
7798is a separator.");
7799
7800static PyObject*
7801unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7802{
7803 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007804 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007805
Martin v. Löwis18e16552006-02-15 17:27:45 +00007806 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007807 return NULL;
7808
7809 if (substring == Py_None)
7810 return rsplit(self, NULL, maxcount);
7811 else if (PyUnicode_Check(substring))
7812 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7813 else
7814 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7815}
7816
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007817PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007818"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007819\n\
7820Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007821Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007822is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007823
7824static PyObject*
7825unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7826{
Guido van Rossum86662912000-04-11 15:38:46 +00007827 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007828
Guido van Rossum86662912000-04-11 15:38:46 +00007829 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007830 return NULL;
7831
Guido van Rossum86662912000-04-11 15:38:46 +00007832 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007833}
7834
7835static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007836PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007837{
Walter Dörwald346737f2007-05-31 10:44:43 +00007838 if (PyUnicode_CheckExact(self)) {
7839 Py_INCREF(self);
7840 return self;
7841 } else
7842 /* Subtype -- return genuine unicode string with the same value. */
7843 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
7844 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007845}
7846
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007847PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007848"S.swapcase() -> unicode\n\
7849\n\
7850Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007851and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007852
7853static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007854unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007855{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007856 return fixup(self, fixswapcase);
7857}
7858
Georg Brandlceee0772007-11-27 23:48:05 +00007859PyDoc_STRVAR(maketrans__doc__,
7860"str.maketrans(x[, y[, z]]) -> dict (static method)\n\
7861\n\
7862Return a translation table usable for str.translate().\n\
7863If there is only one argument, it must be a dictionary mapping Unicode\n\
7864ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
7865Character keys will then be converted to ordinals.\n\
7866If there are two arguments, they must be strings of equal length, and\n\
7867in the resulting dictionary, each character in x will be mapped to the\n\
7868character at the same position in y. If there is a third argument, it\n\
7869must be a string, whose characters will be mapped to None in the result.");
7870
7871static PyObject*
7872unicode_maketrans(PyUnicodeObject *null, PyObject *args)
7873{
7874 PyObject *x, *y = NULL, *z = NULL;
7875 PyObject *new = NULL, *key, *value;
7876 Py_ssize_t i = 0;
7877 int res;
7878
7879 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
7880 return NULL;
7881 new = PyDict_New();
7882 if (!new)
7883 return NULL;
7884 if (y != NULL) {
7885 /* x must be a string too, of equal length */
7886 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
7887 if (!PyUnicode_Check(x)) {
7888 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
7889 "be a string if there is a second argument");
7890 goto err;
7891 }
7892 if (PyUnicode_GET_SIZE(x) != ylen) {
7893 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
7894 "arguments must have equal length");
7895 goto err;
7896 }
7897 /* create entries for translating chars in x to those in y */
7898 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00007899 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
7900 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00007901 if (!key || !value)
7902 goto err;
7903 res = PyDict_SetItem(new, key, value);
7904 Py_DECREF(key);
7905 Py_DECREF(value);
7906 if (res < 0)
7907 goto err;
7908 }
7909 /* create entries for deleting chars in z */
7910 if (z != NULL) {
7911 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00007912 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00007913 if (!key)
7914 goto err;
7915 res = PyDict_SetItem(new, key, Py_None);
7916 Py_DECREF(key);
7917 if (res < 0)
7918 goto err;
7919 }
7920 }
7921 } else {
7922 /* x must be a dict */
7923 if (!PyDict_Check(x)) {
7924 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
7925 "to maketrans it must be a dict");
7926 goto err;
7927 }
7928 /* copy entries into the new dict, converting string keys to int keys */
7929 while (PyDict_Next(x, &i, &key, &value)) {
7930 if (PyUnicode_Check(key)) {
7931 /* convert string keys to integer keys */
7932 PyObject *newkey;
7933 if (PyUnicode_GET_SIZE(key) != 1) {
7934 PyErr_SetString(PyExc_ValueError, "string keys in translate "
7935 "table must be of length 1");
7936 goto err;
7937 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007938 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00007939 if (!newkey)
7940 goto err;
7941 res = PyDict_SetItem(new, newkey, value);
7942 Py_DECREF(newkey);
7943 if (res < 0)
7944 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00007945 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00007946 /* just keep integer keys */
7947 if (PyDict_SetItem(new, key, value) < 0)
7948 goto err;
7949 } else {
7950 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
7951 "be strings or integers");
7952 goto err;
7953 }
7954 }
7955 }
7956 return new;
7957 err:
7958 Py_DECREF(new);
7959 return NULL;
7960}
7961
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007962PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007963"S.translate(table) -> unicode\n\
7964\n\
7965Return a copy of the string S, where all characters have been mapped\n\
7966through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007967Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7968Unmapped characters are left untouched. Characters mapped to None\n\
7969are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007970
7971static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007972unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007973{
Georg Brandlceee0772007-11-27 23:48:05 +00007974 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007975}
7976
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007977PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007978"S.upper() -> unicode\n\
7979\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007980Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007981
7982static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007983unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007984{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007985 return fixup(self, fixupper);
7986}
7987
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007988PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007989"S.zfill(width) -> unicode\n\
7990\n\
7991Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007992of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007993
7994static PyObject *
7995unicode_zfill(PyUnicodeObject *self, PyObject *args)
7996{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007997 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007998 PyUnicodeObject *u;
7999
Martin v. Löwis18e16552006-02-15 17:27:45 +00008000 Py_ssize_t width;
8001 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008002 return NULL;
8003
8004 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008005 if (PyUnicode_CheckExact(self)) {
8006 Py_INCREF(self);
8007 return (PyObject*) self;
8008 }
8009 else
8010 return PyUnicode_FromUnicode(
8011 PyUnicode_AS_UNICODE(self),
8012 PyUnicode_GET_SIZE(self)
8013 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008014 }
8015
8016 fill = width - self->length;
8017
8018 u = pad(self, fill, 0, '0');
8019
Walter Dörwald068325e2002-04-15 13:36:47 +00008020 if (u == NULL)
8021 return NULL;
8022
Guido van Rossumd57fd912000-03-10 22:53:23 +00008023 if (u->str[fill] == '+' || u->str[fill] == '-') {
8024 /* move sign to beginning of string */
8025 u->str[0] = u->str[fill];
8026 u->str[fill] = '0';
8027 }
8028
8029 return (PyObject*) u;
8030}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008031
#if 0
/* Debugging aid (compiled out): report the value of numfree as an int
   object. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    PyObject *count = PyLong_FromLong(numfree);

    return count;
}
#endif
8039
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008040PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00008041"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008042\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008043Return True if S starts with the specified prefix, False otherwise.\n\
8044With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008045With optional end, stop comparing S at that position.\n\
8046prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008047
8048static PyObject *
8049unicode_startswith(PyUnicodeObject *self,
8050 PyObject *args)
8051{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008052 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008053 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008054 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008055 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008056 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008057
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008058 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00008059 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008060 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008061 if (PyTuple_Check(subobj)) {
8062 Py_ssize_t i;
8063 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8064 substring = (PyUnicodeObject *)PyUnicode_FromObject(
8065 PyTuple_GET_ITEM(subobj, i));
8066 if (substring == NULL)
8067 return NULL;
8068 result = tailmatch(self, substring, start, end, -1);
8069 Py_DECREF(substring);
8070 if (result) {
8071 Py_RETURN_TRUE;
8072 }
8073 }
8074 /* nothing matched */
8075 Py_RETURN_FALSE;
8076 }
8077 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008078 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008079 return NULL;
8080 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008081 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008082 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008083}
8084
8085
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008086PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00008087"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008088\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008089Return True if S ends with the specified suffix, False otherwise.\n\
8090With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008091With optional end, stop comparing S at that position.\n\
8092suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008093
8094static PyObject *
8095unicode_endswith(PyUnicodeObject *self,
8096 PyObject *args)
8097{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008098 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008099 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008100 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008101 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008102 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008103
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008104 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
8105 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008106 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008107 if (PyTuple_Check(subobj)) {
8108 Py_ssize_t i;
8109 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8110 substring = (PyUnicodeObject *)PyUnicode_FromObject(
8111 PyTuple_GET_ITEM(subobj, i));
8112 if (substring == NULL)
8113 return NULL;
8114 result = tailmatch(self, substring, start, end, +1);
8115 Py_DECREF(substring);
8116 if (result) {
8117 Py_RETURN_TRUE;
8118 }
8119 }
8120 Py_RETURN_FALSE;
8121 }
8122 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008123 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008124 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008125
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008126 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008127 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008128 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008129}
8130
Eric Smith8c663262007-08-25 02:26:07 +00008131#include "stringlib/string_format.h"
8132
8133PyDoc_STRVAR(format__doc__,
8134"S.format(*args, **kwargs) -> unicode\n\
8135\n\
8136");
8137
Eric Smith8c663262007-08-25 02:26:07 +00008138PyDoc_STRVAR(p_format__doc__,
8139"S.__format__(format_spec) -> unicode\n\
8140\n\
8141");
8142
8143static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008144unicode_getnewargs(PyUnicodeObject *v)
8145{
8146 return Py_BuildValue("(u#)", v->str, v->length);
8147}
8148
8149
Guido van Rossumd57fd912000-03-10 22:53:23 +00008150static PyMethodDef unicode_methods[] = {
8151
8152 /* Order is according to common usage: often used methods should
8153 appear first, since lookup is done sequentially. */
8154
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008155 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
8156 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8157 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008158 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008159 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8160 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8161 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8162 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8163 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8164 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8165 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008166 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008167 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8168 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8169 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008170 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008171 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8172 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8173 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008174 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008175 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008176 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008177 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008178 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8179 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8180 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8181 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8182 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8183 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8184 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8185 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8186 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8187 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8188 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8189 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8190 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8191 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008192 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008193 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008194 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
8195 {"__format__", (PyCFunction) unicode_unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008196 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8197 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Georg Brandlceee0772007-11-27 23:48:05 +00008198 {"maketrans", (PyCFunction) unicode_maketrans,
8199 METH_VARARGS | METH_STATIC, maketrans__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008200#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008201 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008202#endif
8203
8204#if 0
8205 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008206 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008207#endif
8208
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008209 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008210 {NULL, NULL}
8211};
8212
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008213static PyObject *
8214unicode_mod(PyObject *v, PyObject *w)
8215{
8216 if (!PyUnicode_Check(v)) {
8217 Py_INCREF(Py_NotImplemented);
8218 return Py_NotImplemented;
8219 }
8220 return PyUnicode_Format(v, w);
8221}
8222
8223static PyNumberMethods unicode_as_number = {
8224 0, /*nb_add*/
8225 0, /*nb_subtract*/
8226 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008227 unicode_mod, /*nb_remainder*/
8228};
8229
Guido van Rossumd57fd912000-03-10 22:53:23 +00008230static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008231 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008232 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008233 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8234 (ssizeargfunc) unicode_getitem, /* sq_item */
Thomas Woutersd2cf20e2007-08-30 22:57:53 +00008235 0, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008236 0, /* sq_ass_item */
8237 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008238 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008239};
8240
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008241static PyObject*
8242unicode_subscript(PyUnicodeObject* self, PyObject* item)
8243{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008244 if (PyIndex_Check(item)) {
8245 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008246 if (i == -1 && PyErr_Occurred())
8247 return NULL;
8248 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008249 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008250 return unicode_getitem(self, i);
8251 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008252 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008253 Py_UNICODE* source_buf;
8254 Py_UNICODE* result_buf;
8255 PyObject* result;
8256
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008257 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008258 &start, &stop, &step, &slicelength) < 0) {
8259 return NULL;
8260 }
8261
8262 if (slicelength <= 0) {
8263 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008264 } else if (start == 0 && step == 1 && slicelength == self->length &&
8265 PyUnicode_CheckExact(self)) {
8266 Py_INCREF(self);
8267 return (PyObject *)self;
8268 } else if (step == 1) {
8269 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008270 } else {
8271 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008272 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
8273 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008274
8275 if (result_buf == NULL)
8276 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008277
8278 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8279 result_buf[i] = source_buf[cur];
8280 }
Tim Petersced69f82003-09-16 20:30:58 +00008281
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008282 result = PyUnicode_FromUnicode(result_buf, slicelength);
8283 PyMem_FREE(result_buf);
8284 return result;
8285 }
8286 } else {
8287 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8288 return NULL;
8289 }
8290}
8291
8292static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008293 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008294 (binaryfunc)unicode_subscript, /* mp_subscript */
8295 (objobjargproc)0, /* mp_ass_subscript */
8296};
8297
Guido van Rossumd57fd912000-03-10 22:53:23 +00008298
Guido van Rossumd57fd912000-03-10 22:53:23 +00008299/* Helpers for PyUnicode_Format() */
8300
8301static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008302getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008303{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008304 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008305 if (argidx < arglen) {
8306 (*p_argidx)++;
8307 if (arglen < 0)
8308 return args;
8309 else
8310 return PyTuple_GetItem(args, argidx);
8311 }
8312 PyErr_SetString(PyExc_TypeError,
8313 "not enough arguments for format string");
8314 return NULL;
8315}
8316
Martin v. Löwis18e16552006-02-15 17:27:45 +00008317static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008318strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008319{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008320 register Py_ssize_t i;
8321 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008322 for (i = len - 1; i >= 0; i--)
8323 buffer[i] = (Py_UNICODE) charbuffer[i];
8324
Guido van Rossumd57fd912000-03-10 22:53:23 +00008325 return len;
8326}
8327
Neal Norwitzfc76d632006-01-10 06:03:13 +00008328static int
8329doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8330{
Tim Peters15231542006-02-16 01:08:01 +00008331 Py_ssize_t result;
8332
Neal Norwitzfc76d632006-01-10 06:03:13 +00008333 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008334 result = strtounicode(buffer, (char *)buffer);
8335 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008336}
8337
8338static int
8339longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8340{
Tim Peters15231542006-02-16 01:08:01 +00008341 Py_ssize_t result;
8342
Neal Norwitzfc76d632006-01-10 06:03:13 +00008343 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008344 result = strtounicode(buffer, (char *)buffer);
8345 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008346}
8347
Guido van Rossum078151d2002-08-11 04:24:12 +00008348/* XXX To save some code duplication, formatfloat/long/int could have been
8349 shared with stringobject.c, converting from 8-bit to Unicode after the
8350 formatting is done. */
8351
Guido van Rossumd57fd912000-03-10 22:53:23 +00008352static int
8353formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008354 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008355 int flags,
8356 int prec,
8357 int type,
8358 PyObject *v)
8359{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008360 /* fmt = '%#.' + `prec` + `type`
8361 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00008362 char fmt[20];
8363 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008364
Guido van Rossumd57fd912000-03-10 22:53:23 +00008365 x = PyFloat_AsDouble(v);
8366 if (x == -1.0 && PyErr_Occurred())
8367 return -1;
8368 if (prec < 0)
8369 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008370 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8371 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008372 /* Worst case length calc to ensure no buffer overrun:
8373
8374 'g' formats:
8375 fmt = %#.<prec>g
8376 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8377 for any double rep.)
8378 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8379
8380 'f' formats:
8381 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8382 len = 1 + 50 + 1 + prec = 52 + prec
8383
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008384 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008385 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008386
8387 */
Guido van Rossumb5a755e2007-07-18 18:15:48 +00008388 if (((type == 'g' || type == 'G') &&
8389 buflen <= (size_t)10 + (size_t)prec) ||
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008390 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008391 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008392 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008393 return -1;
8394 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008395 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8396 (flags&F_ALT) ? "#" : "",
8397 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008398 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008399}
8400
Tim Peters38fd5b62000-09-21 05:43:11 +00008401static PyObject*
8402formatlong(PyObject *val, int flags, int prec, int type)
8403{
8404 char *buf;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008405 int len;
Tim Peters38fd5b62000-09-21 05:43:11 +00008406 PyObject *str; /* temporary string object. */
Walter Dörwald63a28be2007-06-20 15:11:12 +00008407 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008408
8409 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8410 if (!str)
8411 return NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008412 result = PyUnicode_FromStringAndSize(buf, len);
Tim Peters38fd5b62000-09-21 05:43:11 +00008413 Py_DECREF(str);
Walter Dörwald63a28be2007-06-20 15:11:12 +00008414 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008415}
8416
Guido van Rossumd57fd912000-03-10 22:53:23 +00008417static int
8418formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008419 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008420 int flags,
8421 int prec,
8422 int type,
8423 PyObject *v)
8424{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008425 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008426 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8427 * + 1 + 1
8428 * = 24
8429 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008430 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008431 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008432 long x;
8433
Christian Heimes217cfd12007-12-02 14:31:20 +00008434 x = PyLong_AsLong(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008435 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008436 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008437 if (x < 0 && type == 'u') {
8438 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008439 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008440 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8441 sign = "-";
8442 else
8443 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008444 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008445 prec = 1;
8446
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008447 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8448 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008449 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008450 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008451 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008452 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008453 return -1;
8454 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008455
8456 if ((flags & F_ALT) &&
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008457 (type == 'x' || type == 'X' || type == 'o')) {
8458 /* When converting under %#o, %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008459 * of issues that cause pain:
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008460 * - for %#o, we want a different base marker than C
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008461 * - when 0 is being converted, the C standard leaves off
8462 * the '0x' or '0X', which is inconsistent with other
8463 * %#x/%#X conversions and inconsistent with Python's
8464 * hex() function
8465 * - there are platforms that violate the standard and
8466 * convert 0 with the '0x' or '0X'
8467 * (Metrowerks, Compaq Tru64)
8468 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008469 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008470 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008471 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008472 * We can achieve the desired consistency by inserting our
8473 * own '0x' or '0X' prefix, and substituting %x/%X in place
8474 * of %#x/%#X.
8475 *
8476 * Note that this is the same approach as used in
8477 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008478 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008479 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8480 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008481 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008482 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008483 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8484 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008485 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008486 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008487 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008488 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008489 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008490 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008491}
8492
8493static int
8494formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008495 size_t buflen,
8496 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008497{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008498 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008499 if (PyUnicode_Check(v)) {
8500 if (PyUnicode_GET_SIZE(v) != 1)
8501 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008502 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008503 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008504 else {
8505 /* Integer input truncated to a character */
8506 long x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008507 x = PyLong_AsLong(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008508 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008509 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008510#ifdef Py_UNICODE_WIDE
8511 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008512 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008513 "%c arg not in range(0x110000) "
8514 "(wide Python build)");
8515 return -1;
8516 }
8517#else
8518 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008519 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008520 "%c arg not in range(0x10000) "
8521 "(narrow Python build)");
8522 return -1;
8523 }
8524#endif
8525 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008526 }
8527 buf[1] = '\0';
8528 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008529
8530 onError:
8531 PyErr_SetString(PyExc_TypeError,
8532 "%c requires int or char");
8533 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008534}
8535
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008536/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8537
8538 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
8539 chars are formatted. XXX This is a magic number. Each formatting
8540 routine does bounds checking to ensure no overflow, but a better
8541 solution may be to malloc a buffer of appropriate size for each
8542 format. For now, the current solution is sufficient.
8543*/
8544#define FORMATBUFLEN (size_t)120
8545
Guido van Rossumd57fd912000-03-10 22:53:23 +00008546PyObject *PyUnicode_Format(PyObject *format,
8547 PyObject *args)
8548{
8549 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008550 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008551 int args_owned = 0;
8552 PyUnicodeObject *result = NULL;
8553 PyObject *dict = NULL;
8554 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008555
Guido van Rossumd57fd912000-03-10 22:53:23 +00008556 if (format == NULL || args == NULL) {
8557 PyErr_BadInternalCall();
8558 return NULL;
8559 }
8560 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008561 if (uformat == NULL)
8562 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008563 fmt = PyUnicode_AS_UNICODE(uformat);
8564 fmtcnt = PyUnicode_GET_SIZE(uformat);
8565
8566 reslen = rescnt = fmtcnt + 100;
8567 result = _PyUnicode_New(reslen);
8568 if (result == NULL)
8569 goto onError;
8570 res = PyUnicode_AS_UNICODE(result);
8571
8572 if (PyTuple_Check(args)) {
8573 arglen = PyTuple_Size(args);
8574 argidx = 0;
8575 }
8576 else {
8577 arglen = -1;
8578 argidx = -2;
8579 }
Christian Heimes90aa7642007-12-19 02:45:37 +00008580 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00008581 !PyUnicode_Check(args))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008582 dict = args;
8583
8584 while (--fmtcnt >= 0) {
8585 if (*fmt != '%') {
8586 if (--rescnt < 0) {
8587 rescnt = fmtcnt + 100;
8588 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008589 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008590 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008591 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8592 --rescnt;
8593 }
8594 *res++ = *fmt++;
8595 }
8596 else {
8597 /* Got a format specifier */
8598 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008599 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008600 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008601 Py_UNICODE c = '\0';
8602 Py_UNICODE fill;
8603 PyObject *v = NULL;
8604 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008605 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008606 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008607 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008608 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008609
8610 fmt++;
8611 if (*fmt == '(') {
8612 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008613 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008614 PyObject *key;
8615 int pcount = 1;
8616
8617 if (dict == NULL) {
8618 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008619 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008620 goto onError;
8621 }
8622 ++fmt;
8623 --fmtcnt;
8624 keystart = fmt;
8625 /* Skip over balanced parentheses */
8626 while (pcount > 0 && --fmtcnt >= 0) {
8627 if (*fmt == ')')
8628 --pcount;
8629 else if (*fmt == '(')
8630 ++pcount;
8631 fmt++;
8632 }
8633 keylen = fmt - keystart - 1;
8634 if (fmtcnt < 0 || pcount > 0) {
8635 PyErr_SetString(PyExc_ValueError,
8636 "incomplete format key");
8637 goto onError;
8638 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008639#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008640 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008641 then looked up since Python uses strings to hold
8642 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008643 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008644 key = PyUnicode_EncodeUTF8(keystart,
8645 keylen,
8646 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008647#else
8648 key = PyUnicode_FromUnicode(keystart, keylen);
8649#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008650 if (key == NULL)
8651 goto onError;
8652 if (args_owned) {
8653 Py_DECREF(args);
8654 args_owned = 0;
8655 }
8656 args = PyObject_GetItem(dict, key);
8657 Py_DECREF(key);
8658 if (args == NULL) {
8659 goto onError;
8660 }
8661 args_owned = 1;
8662 arglen = -1;
8663 argidx = -2;
8664 }
8665 while (--fmtcnt >= 0) {
8666 switch (c = *fmt++) {
8667 case '-': flags |= F_LJUST; continue;
8668 case '+': flags |= F_SIGN; continue;
8669 case ' ': flags |= F_BLANK; continue;
8670 case '#': flags |= F_ALT; continue;
8671 case '0': flags |= F_ZERO; continue;
8672 }
8673 break;
8674 }
8675 if (c == '*') {
8676 v = getnextarg(args, arglen, &argidx);
8677 if (v == NULL)
8678 goto onError;
Christian Heimes217cfd12007-12-02 14:31:20 +00008679 if (!PyLong_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008680 PyErr_SetString(PyExc_TypeError,
8681 "* wants int");
8682 goto onError;
8683 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008684 width = PyLong_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008685 if (width == -1 && PyErr_Occurred())
8686 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008687 if (width < 0) {
8688 flags |= F_LJUST;
8689 width = -width;
8690 }
8691 if (--fmtcnt >= 0)
8692 c = *fmt++;
8693 }
8694 else if (c >= '0' && c <= '9') {
8695 width = c - '0';
8696 while (--fmtcnt >= 0) {
8697 c = *fmt++;
8698 if (c < '0' || c > '9')
8699 break;
8700 if ((width*10) / 10 != width) {
8701 PyErr_SetString(PyExc_ValueError,
8702 "width too big");
8703 goto onError;
8704 }
8705 width = width*10 + (c - '0');
8706 }
8707 }
8708 if (c == '.') {
8709 prec = 0;
8710 if (--fmtcnt >= 0)
8711 c = *fmt++;
8712 if (c == '*') {
8713 v = getnextarg(args, arglen, &argidx);
8714 if (v == NULL)
8715 goto onError;
Christian Heimes217cfd12007-12-02 14:31:20 +00008716 if (!PyLong_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008717 PyErr_SetString(PyExc_TypeError,
8718 "* wants int");
8719 goto onError;
8720 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008721 prec = PyLong_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008722 if (prec == -1 && PyErr_Occurred())
8723 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008724 if (prec < 0)
8725 prec = 0;
8726 if (--fmtcnt >= 0)
8727 c = *fmt++;
8728 }
8729 else if (c >= '0' && c <= '9') {
8730 prec = c - '0';
8731 while (--fmtcnt >= 0) {
8732 c = Py_CHARMASK(*fmt++);
8733 if (c < '0' || c > '9')
8734 break;
8735 if ((prec*10) / 10 != prec) {
8736 PyErr_SetString(PyExc_ValueError,
8737 "prec too big");
8738 goto onError;
8739 }
8740 prec = prec*10 + (c - '0');
8741 }
8742 }
8743 } /* prec */
8744 if (fmtcnt >= 0) {
8745 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008746 if (--fmtcnt >= 0)
8747 c = *fmt++;
8748 }
8749 }
8750 if (fmtcnt < 0) {
8751 PyErr_SetString(PyExc_ValueError,
8752 "incomplete format");
8753 goto onError;
8754 }
8755 if (c != '%') {
8756 v = getnextarg(args, arglen, &argidx);
8757 if (v == NULL)
8758 goto onError;
8759 }
8760 sign = 0;
8761 fill = ' ';
8762 switch (c) {
8763
8764 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008765 pbuf = formatbuf;
8766 /* presume that buffer length is at least 1 */
8767 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008768 len = 1;
8769 break;
8770
8771 case 's':
8772 case 'r':
8773 if (PyUnicode_Check(v) && c == 's') {
8774 temp = v;
8775 Py_INCREF(temp);
8776 }
8777 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008778 if (c == 's')
Thomas Heller519a0422007-11-15 20:48:54 +00008779 temp = PyObject_Str(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008780 else
8781 temp = PyObject_Repr(v);
8782 if (temp == NULL)
8783 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008784 if (PyUnicode_Check(temp))
8785 /* nothing to do */;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008786 else {
8787 Py_DECREF(temp);
8788 PyErr_SetString(PyExc_TypeError,
8789 "%s argument has non-string str()");
8790 goto onError;
8791 }
8792 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008793 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008794 len = PyUnicode_GET_SIZE(temp);
8795 if (prec >= 0 && len > prec)
8796 len = prec;
8797 break;
8798
8799 case 'i':
8800 case 'd':
8801 case 'u':
8802 case 'o':
8803 case 'x':
8804 case 'X':
8805 if (c == 'i')
8806 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00008807 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008808 temp = formatlong(v, flags, prec, c);
8809 if (!temp)
8810 goto onError;
8811 pbuf = PyUnicode_AS_UNICODE(temp);
8812 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00008813 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008814 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008815 else {
8816 pbuf = formatbuf;
8817 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8818 flags, prec, c, v);
8819 if (len < 0)
8820 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008821 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008822 }
8823 if (flags & F_ZERO)
8824 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008825 break;
8826
8827 case 'e':
8828 case 'E':
8829 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008830 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008831 case 'g':
8832 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008833 if (c == 'F')
8834 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008835 pbuf = formatbuf;
8836 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8837 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008838 if (len < 0)
8839 goto onError;
8840 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008841 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008842 fill = '0';
8843 break;
8844
8845 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008846 pbuf = formatbuf;
8847 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008848 if (len < 0)
8849 goto onError;
8850 break;
8851
8852 default:
8853 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008854 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00008855 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008856 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008857 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00008858 (Py_ssize_t)(fmt - 1 -
8859 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008860 goto onError;
8861 }
8862 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008863 if (*pbuf == '-' || *pbuf == '+') {
8864 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008865 len--;
8866 }
8867 else if (flags & F_SIGN)
8868 sign = '+';
8869 else if (flags & F_BLANK)
8870 sign = ' ';
8871 else
8872 sign = 0;
8873 }
8874 if (width < len)
8875 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008876 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008877 reslen -= rescnt;
8878 rescnt = width + fmtcnt + 100;
8879 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008880 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008881 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008882 PyErr_NoMemory();
8883 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008884 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008885 if (_PyUnicode_Resize(&result, reslen) < 0) {
8886 Py_XDECREF(temp);
8887 goto onError;
8888 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008889 res = PyUnicode_AS_UNICODE(result)
8890 + reslen - rescnt;
8891 }
8892 if (sign) {
8893 if (fill != ' ')
8894 *res++ = sign;
8895 rescnt--;
8896 if (width > len)
8897 width--;
8898 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008899 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008900 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008901 assert(pbuf[1] == c);
8902 if (fill != ' ') {
8903 *res++ = *pbuf++;
8904 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008905 }
Tim Petersfff53252001-04-12 18:38:48 +00008906 rescnt -= 2;
8907 width -= 2;
8908 if (width < 0)
8909 width = 0;
8910 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008911 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008912 if (width > len && !(flags & F_LJUST)) {
8913 do {
8914 --rescnt;
8915 *res++ = fill;
8916 } while (--width > len);
8917 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008918 if (fill == ' ') {
8919 if (sign)
8920 *res++ = sign;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008921 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008922 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008923 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008924 *res++ = *pbuf++;
8925 *res++ = *pbuf++;
8926 }
8927 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008928 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008929 res += len;
8930 rescnt -= len;
8931 while (--width >= len) {
8932 --rescnt;
8933 *res++ = ' ';
8934 }
8935 if (dict && (argidx < arglen) && c != '%') {
8936 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008937 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008938 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008939 goto onError;
8940 }
8941 Py_XDECREF(temp);
8942 } /* '%' */
8943 } /* until end */
8944 if (argidx < arglen && !dict) {
8945 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008946 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008947 goto onError;
8948 }
8949
Thomas Woutersa96affe2006-03-12 00:29:36 +00008950 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8951 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008952 if (args_owned) {
8953 Py_DECREF(args);
8954 }
8955 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008956 return (PyObject *)result;
8957
8958 onError:
8959 Py_XDECREF(result);
8960 Py_DECREF(uformat);
8961 if (args_owned) {
8962 Py_DECREF(args);
8963 }
8964 return NULL;
8965}
8966
Jeremy Hylton938ace62002-07-17 16:30:39 +00008967static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008968unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8969
Tim Peters6d6c1a32001-08-02 04:15:00 +00008970static PyObject *
8971unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8972{
8973 PyObject *x = NULL;
Guido van Rossum55b4a7b2007-07-11 09:28:11 +00008974 static char *kwlist[] = {"object", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008975 char *encoding = NULL;
8976 char *errors = NULL;
8977
Guido van Rossume023fe02001-08-30 03:12:59 +00008978 if (type != &PyUnicode_Type)
8979 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008980 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8981 kwlist, &x, &encoding, &errors))
8982 return NULL;
8983 if (x == NULL)
8984 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008985 if (encoding == NULL && errors == NULL)
Thomas Heller519a0422007-11-15 20:48:54 +00008986 return PyObject_Str(x);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008987 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008988 return PyUnicode_FromEncodedObject(x, encoding, errors);
8989}
8990
Guido van Rossume023fe02001-08-30 03:12:59 +00008991static PyObject *
8992unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8993{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008994 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008995 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008996
8997 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8998 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8999 if (tmp == NULL)
9000 return NULL;
9001 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00009002 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00009003 if (pnew == NULL) {
9004 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00009005 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00009006 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00009007 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
9008 if (pnew->str == NULL) {
9009 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00009010 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00009011 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00009012 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00009013 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00009014 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9015 pnew->length = n;
9016 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00009017 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00009018 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009019}
9020
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009021PyDoc_STRVAR(unicode_doc,
Collin Winterd474ce82007-08-07 19:42:11 +00009022"str(string [, encoding[, errors]]) -> object\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009023\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009024Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009025encoding defaults to the current default string encoding.\n\
9026errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009027
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009028static PyObject *unicode_iter(PyObject *seq);
9029
Guido van Rossumd57fd912000-03-10 22:53:23 +00009030PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009031 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossum84fc66d2007-05-03 17:18:26 +00009032 "str", /* tp_name */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009033 sizeof(PyUnicodeObject), /* tp_size */
9034 0, /* tp_itemsize */
9035 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00009036 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009037 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009038 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009039 0, /* tp_setattr */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009040 0, /* tp_compare */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009041 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009042 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009043 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009044 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009045 (hashfunc) unicode_hash, /* tp_hash*/
9046 0, /* tp_call*/
9047 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009048 PyObject_GenericGetAttr, /* tp_getattro */
9049 0, /* tp_setattro */
Alexandre Vassalotti70a23712007-10-14 02:05:51 +00009050 0, /* tp_as_buffer */
Thomas Wouters27d517b2007-02-25 20:39:11 +00009051 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
9052 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009053 unicode_doc, /* tp_doc */
9054 0, /* tp_traverse */
9055 0, /* tp_clear */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009056 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009057 0, /* tp_weaklistoffset */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009058 unicode_iter, /* tp_iter */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009059 0, /* tp_iternext */
9060 unicode_methods, /* tp_methods */
9061 0, /* tp_members */
9062 0, /* tp_getset */
Guido van Rossum3172c5d2007-10-16 18:12:55 +00009063 &PyBaseObject_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009064 0, /* tp_dict */
9065 0, /* tp_descr_get */
9066 0, /* tp_descr_set */
9067 0, /* tp_dictoffset */
9068 0, /* tp_init */
9069 0, /* tp_alloc */
9070 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00009071 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009072};
9073
9074/* Initialize the Unicode implementation */
9075
Thomas Wouters78890102000-07-22 19:25:51 +00009076void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009077{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009078 int i;
9079
Thomas Wouters477c8d52006-05-27 19:21:47 +00009080 /* XXX - move this array to unicodectype.c ? */
9081 Py_UNICODE linebreak[] = {
9082 0x000A, /* LINE FEED */
9083 0x000D, /* CARRIAGE RETURN */
9084 0x001C, /* FILE SEPARATOR */
9085 0x001D, /* GROUP SEPARATOR */
9086 0x001E, /* RECORD SEPARATOR */
9087 0x0085, /* NEXT LINE */
9088 0x2028, /* LINE SEPARATOR */
9089 0x2029, /* PARAGRAPH SEPARATOR */
9090 };
9091
Fred Drakee4315f52000-05-09 19:53:39 +00009092 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009093 free_list = NULL;
9094 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009095 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009096 if (!unicode_empty)
9097 return;
9098
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009099 for (i = 0; i < 256; i++)
9100 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009101 if (PyType_Ready(&PyUnicode_Type) < 0)
9102 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009103
9104 /* initialize the linebreak bloom filter */
9105 bloom_linebreak = make_bloom_mask(
9106 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9107 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009108
9109 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009110}
9111
9112/* Finalize the Unicode implementation */
9113
Christian Heimesa156e092008-02-16 07:38:31 +00009114int
9115PyUnicode_ClearFreeList(void)
9116{
9117 int freelist_size = numfree;
9118 PyUnicodeObject *u;
9119
9120 for (u = free_list; u != NULL;) {
9121 PyUnicodeObject *v = u;
9122 u = *(PyUnicodeObject **)u;
9123 if (v->str)
9124 PyMem_DEL(v->str);
9125 Py_XDECREF(v->defenc);
9126 PyObject_Del(v);
9127 numfree--;
9128 }
9129 free_list = NULL;
9130 assert(numfree == 0);
9131 return freelist_size;
9132}
9133
Guido van Rossumd57fd912000-03-10 22:53:23 +00009134void
Thomas Wouters78890102000-07-22 19:25:51 +00009135_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009136{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009137 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009138
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009139 Py_XDECREF(unicode_empty);
9140 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009141
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009142 for (i = 0; i < 256; i++) {
9143 if (unicode_latin1[i]) {
9144 Py_DECREF(unicode_latin1[i]);
9145 unicode_latin1[i] = NULL;
9146 }
9147 }
Christian Heimesa156e092008-02-16 07:38:31 +00009148 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009149}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009150
Walter Dörwald16807132007-05-25 13:52:07 +00009151void
9152PyUnicode_InternInPlace(PyObject **p)
9153{
9154 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9155 PyObject *t;
9156 if (s == NULL || !PyUnicode_Check(s))
9157 Py_FatalError(
9158 "PyUnicode_InternInPlace: unicode strings only please!");
9159 /* If it's a subclass, we don't really know what putting
9160 it in the interned dict might do. */
9161 if (!PyUnicode_CheckExact(s))
9162 return;
9163 if (PyUnicode_CHECK_INTERNED(s))
9164 return;
9165 if (interned == NULL) {
9166 interned = PyDict_New();
9167 if (interned == NULL) {
9168 PyErr_Clear(); /* Don't leave an exception */
9169 return;
9170 }
9171 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009172 /* It might be that the GetItem call fails even
9173 though the key is present in the dictionary,
9174 namely when this happens during a stack overflow. */
9175 Py_ALLOW_RECURSION
Walter Dörwald16807132007-05-25 13:52:07 +00009176 t = PyDict_GetItem(interned, (PyObject *)s);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009177 Py_END_ALLOW_RECURSION
9178
Walter Dörwald16807132007-05-25 13:52:07 +00009179 if (t) {
9180 Py_INCREF(t);
9181 Py_DECREF(*p);
9182 *p = t;
9183 return;
9184 }
9185
Martin v. Löwis5b222132007-06-10 09:51:05 +00009186 PyThreadState_GET()->recursion_critical = 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009187 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9188 PyErr_Clear();
Martin v. Löwis5b222132007-06-10 09:51:05 +00009189 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009190 return;
9191 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009192 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009193 /* The two references in interned are not counted by refcnt.
9194 The deallocator will take care of this */
Christian Heimes90aa7642007-12-19 02:45:37 +00009195 Py_REFCNT(s) -= 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009196 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
9197}
9198
9199void
9200PyUnicode_InternImmortal(PyObject **p)
9201{
9202 PyUnicode_InternInPlace(p);
9203 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9204 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9205 Py_INCREF(*p);
9206 }
9207}
9208
9209PyObject *
9210PyUnicode_InternFromString(const char *cp)
9211{
9212 PyObject *s = PyUnicode_FromString(cp);
9213 if (s == NULL)
9214 return NULL;
9215 PyUnicode_InternInPlace(&s);
9216 return s;
9217}
9218
9219void _Py_ReleaseInternedUnicodeStrings(void)
9220{
9221 PyObject *keys;
9222 PyUnicodeObject *s;
9223 Py_ssize_t i, n;
9224 Py_ssize_t immortal_size = 0, mortal_size = 0;
9225
9226 if (interned == NULL || !PyDict_Check(interned))
9227 return;
9228 keys = PyDict_Keys(interned);
9229 if (keys == NULL || !PyList_Check(keys)) {
9230 PyErr_Clear();
9231 return;
9232 }
9233
9234 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9235 detector, interned unicode strings are not forcibly deallocated;
9236 rather, we give them their stolen references back, and then clear
9237 and DECREF the interned dict. */
9238
9239 n = PyList_GET_SIZE(keys);
9240 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
9241 n);
9242 for (i = 0; i < n; i++) {
9243 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9244 switch (s->state) {
9245 case SSTATE_NOT_INTERNED:
9246 /* XXX Shouldn't happen */
9247 break;
9248 case SSTATE_INTERNED_IMMORTAL:
Christian Heimes90aa7642007-12-19 02:45:37 +00009249 Py_REFCNT(s) += 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009250 immortal_size += s->length;
9251 break;
9252 case SSTATE_INTERNED_MORTAL:
Christian Heimes90aa7642007-12-19 02:45:37 +00009253 Py_REFCNT(s) += 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009254 mortal_size += s->length;
9255 break;
9256 default:
9257 Py_FatalError("Inconsistent interned string state.");
9258 }
9259 s->state = SSTATE_NOT_INTERNED;
9260 }
9261 fprintf(stderr, "total size of all interned strings: "
9262 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9263 "mortal/immortal\n", mortal_size, immortal_size);
9264 Py_DECREF(keys);
9265 PyDict_Clear(interned);
9266 Py_DECREF(interned);
9267 interned = NULL;
9268}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009269
9270
9271/********************* Unicode Iterator **************************/
9272
9273typedef struct {
9274 PyObject_HEAD
Guido van Rossum49d6b072006-08-17 21:11:47 +00009275 Py_ssize_t it_index;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009276 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
9277} unicodeiterobject;
9278
9279static void
9280unicodeiter_dealloc(unicodeiterobject *it)
9281{
9282 _PyObject_GC_UNTRACK(it);
9283 Py_XDECREF(it->it_seq);
9284 PyObject_GC_Del(it);
9285}
9286
9287static int
9288unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9289{
9290 Py_VISIT(it->it_seq);
9291 return 0;
9292}
9293
9294static PyObject *
9295unicodeiter_next(unicodeiterobject *it)
9296{
9297 PyUnicodeObject *seq;
9298 PyObject *item;
9299
9300 assert(it != NULL);
9301 seq = it->it_seq;
9302 if (seq == NULL)
9303 return NULL;
9304 assert(PyUnicode_Check(seq));
9305
9306 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009307 item = PyUnicode_FromUnicode(
9308 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009309 if (item != NULL)
9310 ++it->it_index;
9311 return item;
9312 }
9313
9314 Py_DECREF(seq);
9315 it->it_seq = NULL;
9316 return NULL;
9317}
9318
9319static PyObject *
9320unicodeiter_len(unicodeiterobject *it)
9321{
9322 Py_ssize_t len = 0;
9323 if (it->it_seq)
9324 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
Christian Heimes217cfd12007-12-02 14:31:20 +00009325 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009326}
9327
9328PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9329
9330static PyMethodDef unicodeiter_methods[] = {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009331 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
9332 length_hint_doc},
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009333 {NULL, NULL} /* sentinel */
9334};
9335
9336PyTypeObject PyUnicodeIter_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009337 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Christian Heimesf83be4e2007-11-28 09:44:38 +00009338 "str_iterator", /* tp_name */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009339 sizeof(unicodeiterobject), /* tp_basicsize */
9340 0, /* tp_itemsize */
9341 /* methods */
Guido van Rossum49d6b072006-08-17 21:11:47 +00009342 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009343 0, /* tp_print */
9344 0, /* tp_getattr */
9345 0, /* tp_setattr */
9346 0, /* tp_compare */
9347 0, /* tp_repr */
9348 0, /* tp_as_number */
9349 0, /* tp_as_sequence */
9350 0, /* tp_as_mapping */
9351 0, /* tp_hash */
9352 0, /* tp_call */
9353 0, /* tp_str */
9354 PyObject_GenericGetAttr, /* tp_getattro */
9355 0, /* tp_setattro */
9356 0, /* tp_as_buffer */
9357 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9358 0, /* tp_doc */
9359 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9360 0, /* tp_clear */
9361 0, /* tp_richcompare */
9362 0, /* tp_weaklistoffset */
9363 PyObject_SelfIter, /* tp_iter */
9364 (iternextfunc)unicodeiter_next, /* tp_iternext */
9365 unicodeiter_methods, /* tp_methods */
9366 0,
9367};
9368
9369static PyObject *
9370unicode_iter(PyObject *seq)
9371{
9372 unicodeiterobject *it;
9373
9374 if (!PyUnicode_Check(seq)) {
9375 PyErr_BadInternalCall();
9376 return NULL;
9377 }
9378 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9379 if (it == NULL)
9380 return NULL;
9381 it->it_index = 0;
9382 Py_INCREF(seq);
9383 it->it_seq = (PyUnicodeObject *)seq;
9384 _PyObject_GC_TRACK(it);
9385 return (PyObject *)it;
9386}
9387
Martin v. Löwis5b222132007-06-10 09:51:05 +00009388size_t
9389Py_UNICODE_strlen(const Py_UNICODE *u)
9390{
9391 int res = 0;
9392 while(*u++)
9393 res++;
9394 return res;
9395}
9396
9397Py_UNICODE*
9398Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9399{
9400 Py_UNICODE *u = s1;
9401 while ((*u++ = *s2++));
9402 return s1;
9403}
9404
9405Py_UNICODE*
9406Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9407{
9408 Py_UNICODE *u = s1;
9409 while ((*u++ = *s2++))
9410 if (n-- == 0)
9411 break;
9412 return s1;
9413}
9414
9415int
9416Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9417{
9418 while (*s1 && *s2 && *s1 == *s2)
9419 s1++, s2++;
9420 if (*s1 && *s2)
9421 return (*s1 < *s2) ? -1 : +1;
9422 if (*s1)
9423 return 1;
9424 if (*s2)
9425 return -1;
9426 return 0;
9427}
9428
9429Py_UNICODE*
9430Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9431{
9432 const Py_UNICODE *p;
9433 for (p = s; *p; p++)
9434 if (*p == c)
9435 return (Py_UNICODE*)p;
9436 return NULL;
9437}
9438
9439
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009440#ifdef __cplusplus
9441}
9442#endif
9443
9444
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009445/*
9446Local variables:
9447c-basic-offset: 4
9448indent-tabs-mode: nil
9449End:
9450*/