blob: 60cbffa59f43cf0d694e91c67d40a064bbca1fb7 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Eric Smith8c663262007-08-25 02:26:07 +000049#include "formatter_unicode.h"
50
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000051#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000052#include <windows.h>
53#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000054
/* Limit for the Unicode object free list: unicode_dealloc() only parks an
   object on the free list while fewer than this many are already there. */

#define PyUnicode_MAXFREELIST 1024

/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit. This reduces malloc() overhead for small Unicode objects.
   The limit is compared against the object's length field, so it is
   a count of Py_UNICODE elements rather than a byte count.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Endianness switches; defaults to little endian unless the build
   defines WORDS_BIGENDIAN. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
85
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000086/* --- Globals ------------------------------------------------------------
87
88 The globals are initialized by the _PyUnicode_Init() API and should
89 not be used before calling that API.
90
91*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000092
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000093
94#ifdef __cplusplus
95extern "C" {
96#endif
97
Walter Dörwald16807132007-05-25 13:52:07 +000098/* This dictionary holds all interned unicode strings. Note that references
99 to strings in this dictionary are *not* counted in the string's ob_refcnt.
100 When the interned string reaches a refcnt of 0 the string deallocation
101 function will delete the reference from this dictionary.
102
103 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000104 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000105*/
106static PyObject *interned;
107
Guido van Rossumd57fd912000-03-10 22:53:23 +0000108/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000109static PyUnicodeObject *free_list;
110static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000111
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000112/* The empty Unicode object is shared to improve performance. */
113static PyUnicodeObject *unicode_empty;
114
115/* Single character Unicode strings in the Latin-1 range are being
116 shared as well. */
117static PyUnicodeObject *unicode_latin1[256];
118
Fred Drakee4315f52000-05-09 19:53:39 +0000119/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000120 parameter; it is fixed to "utf-8". Always use the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000121 PyUnicode_GetDefaultEncoding() API to access this global.
122
Alexandre Vassalotti3d2fd7f2007-10-16 00:26:33 +0000123 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000124 hard coded default!
125*/
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000126static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000127
/* Lookup table for fast detection of the most frequent whitespace
   characters: entry [c] is 1 when the 7-bit character c is whitespace,
   0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F: 0x09 HORIZONTAL TABULATION, 0x0A LINE FEED,
       0x0B VERTICAL TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
158
/* Companion lookup table for line breaks: entry [c] is 1 when the
   7-bit character c is a line break, 0 otherwise. */
static unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F: 0x0A LINE FEED, 0x0D CARRIAGE RETURN */
    0, 0, 1, 0, 0, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
184
185
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000186Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000187PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000188{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000189#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000190 return 0x10FFFF;
191#else
192 /* This is actually an illegal character, so it should
193 not be passed to unichr. */
194 return 0xFFFF;
195#endif
196}
197
Thomas Wouters477c8d52006-05-27 19:21:47 +0000198/* --- Bloom Filters ----------------------------------------------------- */
199
/* Support for simple "bloom filters" over Unicode characters.
   To keep things simple, a single bitmask is used, with the least
   significant 5 bits of each Unicode character as the bit index.
   A set bit means "a character with these low bits may be present";
   a clear bit means "definitely absent". */

/* the linebreak mask is set up by Unicode_Init below */

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Test the filter bit for ch.  The shifted constant is 1UL, not 1:
   the bit index can be 31, and shifting a signed int that far is
   undefined behaviour.  Both arguments are parenthesized so that
   compound expressions can be passed safely. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Line-break test: characters below 128 use the ascii_linebreak table;
   all others are pre-filtered through bloom_linebreak before the
   Py_UNICODE_ISLINEBREAK check. */
#define BLOOM_LINEBREAK(ch) \
    ((ch) < 128U ? ascii_linebreak[(ch)] : \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000215
216Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
217{
218 /* calculate simple bloom-style bitmask for a given unicode string */
219
220 long mask;
221 Py_ssize_t i;
222
223 mask = 0;
224 for (i = 0; i < len; i++)
225 mask |= (1 << (ptr[i] & 0x1F));
226
227 return mask;
228}
229
230Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
231{
232 Py_ssize_t i;
233
234 for (i = 0; i < setlen; i++)
235 if (set[i] == chr)
236 return 1;
237
238 return 0;
239}
240
/* Exact membership test with a bloom pre-filter: unicode_member() is
   only consulted when the filter bit for chr is set in mask.
   The whole expansion is parenthesized so the macro behaves as a single
   operand, e.g. under a leading `!`. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
243
Guido van Rossumd57fd912000-03-10 22:53:23 +0000244/* --- Unicode Object ----------------------------------------------------- */
245
246static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000247int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000248 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000249{
250 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000251
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000252 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000254 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000255
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000256 /* Resizing shared object (unicode_empty or single character
257 objects) in-place is not allowed. Use PyUnicode_Resize()
258 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000259
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000260 if (unicode == unicode_empty ||
261 (unicode->length == 1 &&
262 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000263 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000265 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 return -1;
267 }
268
Thomas Wouters477c8d52006-05-27 19:21:47 +0000269 /* We allocate one more byte to make sure the string is Ux0000 terminated.
270 The overallocation is also used by fastsearch, which assumes that it's
271 safe to look at str[length] (without making any assumptions about what
272 it contains). */
273
Guido van Rossumd57fd912000-03-10 22:53:23 +0000274 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000275 unicode->str = PyObject_REALLOC(unicode->str,
276 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000278 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000279 PyErr_NoMemory();
280 return -1;
281 }
282 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000283 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000285 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000287 if (unicode->defenc) {
288 Py_DECREF(unicode->defenc);
289 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290 }
291 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000292
Guido van Rossumd57fd912000-03-10 22:53:23 +0000293 return 0;
294}
295
296/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000297 Ux0000 terminated; some code (e.g. new_identifier)
298 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299
300 XXX This allocator could further be enhanced by assuring that the
301 free list never reduces its size below 1.
302
303*/
304
305static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000306PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000307{
308 register PyUnicodeObject *unicode;
309
Thomas Wouters477c8d52006-05-27 19:21:47 +0000310 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000311 if (length == 0 && unicode_empty != NULL) {
312 Py_INCREF(unicode_empty);
313 return unicode_empty;
314 }
315
316 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000317 if (free_list) {
318 unicode = free_list;
319 free_list = *(PyUnicodeObject **)unicode;
320 numfree--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000321 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000322 /* Keep-Alive optimization: we only upsize the buffer,
323 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000324 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000325 unicode_resize(unicode, length) < 0) {
Christian Heimesb186d002008-03-18 15:15:01 +0000326 PyObject_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000327 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000328 }
329 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000330 else {
Christian Heimesb186d002008-03-18 15:15:01 +0000331 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
332 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000333 }
334 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000335 }
336 else {
Christian Heimesb186d002008-03-18 15:15:01 +0000337 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000338 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000339 if (unicode == NULL)
340 return NULL;
Christian Heimesb186d002008-03-18 15:15:01 +0000341 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
342 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343 }
344
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000345 if (!unicode->str) {
346 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000347 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000348 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000349 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000350 * the caller fails before initializing str -- unicode_resize()
351 * reads str[0], and the Keep-Alive optimization can keep memory
352 * allocated for str alive across a call to unicode_dealloc(unicode).
353 * We don't want unicode_resize to read uninitialized memory in
354 * that case.
355 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000356 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000357 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000358 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000359 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000360 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000361 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000362 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000363
364 onError:
365 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000366 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000367 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000368}
369
370static
Guido van Rossum9475a232001-10-05 20:51:39 +0000371void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000372{
Walter Dörwald16807132007-05-25 13:52:07 +0000373 switch (PyUnicode_CHECK_INTERNED(unicode)) {
374 case SSTATE_NOT_INTERNED:
375 break;
376
377 case SSTATE_INTERNED_MORTAL:
378 /* revive dead object temporarily for DelItem */
Christian Heimes90aa7642007-12-19 02:45:37 +0000379 Py_REFCNT(unicode) = 3;
Walter Dörwald16807132007-05-25 13:52:07 +0000380 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
381 Py_FatalError(
382 "deletion of interned unicode string failed");
383 break;
384
385 case SSTATE_INTERNED_IMMORTAL:
386 Py_FatalError("Immortal interned unicode string died.");
387
388 default:
389 Py_FatalError("Inconsistent interned unicode string state.");
390 }
391
Guido van Rossum604ddf82001-12-06 20:03:56 +0000392 if (PyUnicode_CheckExact(unicode) &&
Christian Heimes2202f872008-02-06 14:31:34 +0000393 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000394 /* Keep-Alive optimization */
395 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Christian Heimesb186d002008-03-18 15:15:01 +0000396 PyObject_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000397 unicode->str = NULL;
398 unicode->length = 0;
399 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000400 if (unicode->defenc) {
401 Py_DECREF(unicode->defenc);
402 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000403 }
404 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000405 *(PyUnicodeObject **)unicode = free_list;
406 free_list = unicode;
407 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000408 }
409 else {
Christian Heimesb186d002008-03-18 15:15:01 +0000410 PyObject_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000411 Py_XDECREF(unicode->defenc);
Christian Heimes90aa7642007-12-19 02:45:37 +0000412 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000413 }
414}
415
Martin v. Löwis18e16552006-02-15 17:27:45 +0000416int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000417{
418 register PyUnicodeObject *v;
419
420 /* Argument checks */
421 if (unicode == NULL) {
422 PyErr_BadInternalCall();
423 return -1;
424 }
425 v = (PyUnicodeObject *)*unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000426 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000427 PyErr_BadInternalCall();
428 return -1;
429 }
430
431 /* Resizing unicode_empty and single character objects is not
432 possible since these are being shared. We simply return a fresh
433 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000434 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000435 (v == unicode_empty || v->length == 1)) {
436 PyUnicodeObject *w = _PyUnicode_New(length);
437 if (w == NULL)
438 return -1;
439 Py_UNICODE_COPY(w->str, v->str,
440 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000441 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000442 *unicode = (PyObject *)w;
443 return 0;
444 }
445
446 /* Note that we don't have to modify *unicode for unshared Unicode
447 objects, since we can modify them in-place. */
448 return unicode_resize(v, length);
449}
450
/* Internal API for use in unicodeobject.c only !
   Convenience wrapper around PyUnicode_Resize() that casts the address
   of a PyUnicodeObject * variable to PyObject **. */
#define _PyUnicode_Resize(unicodevar, length) \
    PyUnicode_Resize(((PyObject **)(unicodevar)), (length))
454
Guido van Rossumd57fd912000-03-10 22:53:23 +0000455PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000456 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000457{
458 PyUnicodeObject *unicode;
459
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000460 /* If the Unicode data is known at construction time, we can apply
461 some optimizations which share commonly used objects. */
462 if (u != NULL) {
463
464 /* Optimization for empty strings */
465 if (size == 0 && unicode_empty != NULL) {
466 Py_INCREF(unicode_empty);
467 return (PyObject *)unicode_empty;
468 }
469
470 /* Single character Unicode objects in the Latin-1 range are
471 shared when using this constructor */
472 if (size == 1 && *u < 256) {
473 unicode = unicode_latin1[*u];
474 if (!unicode) {
475 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000476 if (!unicode)
477 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000478 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000479 unicode_latin1[*u] = unicode;
480 }
481 Py_INCREF(unicode);
482 return (PyObject *)unicode;
483 }
484 }
Tim Petersced69f82003-09-16 20:30:58 +0000485
Guido van Rossumd57fd912000-03-10 22:53:23 +0000486 unicode = _PyUnicode_New(size);
487 if (!unicode)
488 return NULL;
489
490 /* Copy the Unicode data into the new object */
491 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000492 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000493
494 return (PyObject *)unicode;
495}
496
Walter Dörwaldd2034312007-05-18 16:29:38 +0000497PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000498{
499 PyUnicodeObject *unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000500 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000501 some optimizations which share commonly used objects.
502 Also, this means the input must be UTF-8, so fall back to the
503 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000504 if (u != NULL) {
505
506 /* Optimization for empty strings */
507 if (size == 0 && unicode_empty != NULL) {
508 Py_INCREF(unicode_empty);
509 return (PyObject *)unicode_empty;
510 }
511
Martin v. Löwis9c121062007-08-05 20:26:11 +0000512 /* Single characters are shared when using this constructor.
513 Restrict to ASCII, since the input must be UTF-8. */
514 if (size == 1 && Py_CHARMASK(*u) < 128) {
Christian Heimesbbe741d2008-03-28 10:53:29 +0000515 unicode = unicode_latin1[Py_CHARMASK(*u)];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000516 if (!unicode) {
517 unicode = _PyUnicode_New(1);
518 if (!unicode)
519 return NULL;
Guido van Rossum00058aa2007-07-19 18:21:28 +0000520 unicode->str[0] = Py_CHARMASK(*u);
Christian Heimesbbe741d2008-03-28 10:53:29 +0000521 unicode_latin1[Py_CHARMASK(*u)] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000522 }
523 Py_INCREF(unicode);
524 return (PyObject *)unicode;
525 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000526
527 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000528 }
529
Walter Dörwald55507312007-05-18 13:12:10 +0000530 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000531 if (!unicode)
532 return NULL;
533
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000534 return (PyObject *)unicode;
535}
536
Walter Dörwaldd2034312007-05-18 16:29:38 +0000537PyObject *PyUnicode_FromString(const char *u)
538{
539 size_t size = strlen(u);
540 if (size > PY_SSIZE_T_MAX) {
541 PyErr_SetString(PyExc_OverflowError, "input too long");
542 return NULL;
543 }
544
545 return PyUnicode_FromStringAndSize(u, size);
546}
547
Guido van Rossumd57fd912000-03-10 22:53:23 +0000548#ifdef HAVE_WCHAR_H
549
550PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000551 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000552{
553 PyUnicodeObject *unicode;
554
555 if (w == NULL) {
556 PyErr_BadInternalCall();
557 return NULL;
558 }
559
560 unicode = _PyUnicode_New(size);
561 if (!unicode)
562 return NULL;
563
564 /* Copy the wchar_t data into the new object */
565#ifdef HAVE_USABLE_WCHAR_T
566 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000567#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000568 {
569 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000570 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000571 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000572 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000573 *u++ = *w++;
574 }
575#endif
576
577 return (PyObject *)unicode;
578}
579
/* Assemble a printf-style conversion specification in fmt:
   '%', then (only if width is non-zero) an optional '0' flag and the
   width, then ".precision" if precision is non-zero, then the length
   modifier ('l' if longflag, otherwise PY_FORMAT_SIZE_T if size_tflag),
   and finally the conversion character c.  The result is
   NUL-terminated; fmt must be large enough to hold it. */
static void
makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
{
    char *out = fmt;

    *out++ = '%';
    if (width) {
        /* the zero-pad flag is only emitted together with a width */
        if (zeropad)
            *out++ = '0';
        out += sprintf(out, "%d", width);
    }
    if (precision)
        out += sprintf(out, ".%d", precision);

    if (longflag) {
        *out++ = 'l';
    }
    else if (size_tflag) {
        const char *modifier;

        for (modifier = PY_FORMAT_SIZE_T; *modifier != '\0'; modifier++)
            *out++ = *modifier;
    }

    out[0] = c;
    out[1] = '\0';
}
601
/* Append the characters of the NUL-terminated char string `string` to the
   output position `s`, advancing `s` past them.  This macro is not
   self-contained: it uses the variables `copy` and `s` of the scope in
   which it is expanded (PyUnicode_FromFormatV declares both), and it
   expands to a brace-enclosed block rather than an expression. */
#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
603
604PyObject *
605PyUnicode_FromFormatV(const char *format, va_list vargs)
606{
607 va_list count;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000608 Py_ssize_t callcount = 0;
609 PyObject **callresults = NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +0000610 PyObject **callresult = NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000611 Py_ssize_t n = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000612 int width = 0;
613 int precision = 0;
614 int zeropad;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000615 const char* f;
616 Py_UNICODE *s;
617 PyObject *string;
618 /* used by sprintf */
619 char buffer[21];
Walter Dörwald346737f2007-05-31 10:44:43 +0000620 /* use abuffer instead of buffer, if we need more space
621 * (which can happen if there's a format specifier with width). */
622 char *abuffer = NULL;
623 char *realbuffer;
624 Py_ssize_t abuffersize = 0;
625 char fmt[60]; /* should be enough for %0width.precisionld */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000626 const char *copy;
627
628#ifdef VA_LIST_IS_ARRAY
629 Py_MEMCPY(count, vargs, sizeof(va_list));
630#else
631#ifdef __va_copy
632 __va_copy(count, vargs);
633#else
634 count = vargs;
635#endif
636#endif
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000637 /* step 1: count the number of %S/%R format specifications
Thomas Heller519a0422007-11-15 20:48:54 +0000638 * (we call PyObject_Str()/PyObject_Repr() for these objects
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000639 * once during step 3 and put the result in an array) */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000640 for (f = format; *f; f++) {
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000641 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000642 ++callcount;
643 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000644 /* step 2: allocate memory for the results of
Thomas Heller519a0422007-11-15 20:48:54 +0000645 * PyObject_Str()/PyObject_Repr() calls */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000646 if (callcount) {
Christian Heimesb186d002008-03-18 15:15:01 +0000647 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000648 if (!callresults) {
649 PyErr_NoMemory();
650 return NULL;
651 }
652 callresult = callresults;
653 }
654 /* step 3: figure out how large a buffer we need */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000655 for (f = format; *f; f++) {
656 if (*f == '%') {
657 const char* p = f;
Walter Dörwald346737f2007-05-31 10:44:43 +0000658 width = 0;
Christian Heimesfe337bf2008-03-23 21:54:12 +0000659 while (ISDIGIT((unsigned)*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000660 width = (width*10) + *f++ - '0';
Christian Heimesfe337bf2008-03-23 21:54:12 +0000661 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
Walter Dörwaldd2034312007-05-18 16:29:38 +0000662 ;
663
664 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
665 * they don't affect the amount of space we reserve.
666 */
667 if ((*f == 'l' || *f == 'z') &&
668 (f[1] == 'd' || f[1] == 'u'))
Eric Smithddd25822007-08-27 11:33:42 +0000669 ++f;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000670
671 switch (*f) {
672 case 'c':
673 (void)va_arg(count, int);
674 /* fall through... */
675 case '%':
676 n++;
677 break;
678 case 'd': case 'u': case 'i': case 'x':
679 (void) va_arg(count, int);
680 /* 20 bytes is enough to hold a 64-bit
681 integer. Decimal takes the most space.
Walter Dörwald346737f2007-05-31 10:44:43 +0000682 This isn't enough for octal.
683 If a width is specified we need more
684 (which we allocate later). */
685 if (width < 20)
686 width = 20;
687 n += width;
688 if (abuffersize < width)
689 abuffersize = width;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000690 break;
691 case 's':
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000692 {
693 /* UTF-8 */
694 unsigned char*s;
695 s = va_arg(count, unsigned char*);
696 while (*s) {
697 if (*s < 128) {
698 n++; s++;
699 } else if (*s < 0xc0) {
700 /* invalid UTF-8 */
701 n++; s++;
702 } else if (*s < 0xc0) {
703 n++;
704 s++; if(!*s)break;
705 s++;
706 } else if (*s < 0xe0) {
707 n++;
708 s++; if(!*s)break;
709 s++; if(!*s)break;
710 s++;
711 } else {
712 #ifdef Py_UNICODE_WIDE
713 n++;
714 #else
715 n+=2;
716 #endif
717 s++; if(!*s)break;
718 s++; if(!*s)break;
719 s++; if(!*s)break;
720 s++;
721 }
722 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000723 break;
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000724 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000725 case 'U':
726 {
727 PyObject *obj = va_arg(count, PyObject *);
728 assert(obj && PyUnicode_Check(obj));
729 n += PyUnicode_GET_SIZE(obj);
730 break;
731 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000732 case 'V':
733 {
734 PyObject *obj = va_arg(count, PyObject *);
735 const char *str = va_arg(count, const char *);
736 assert(obj || str);
737 assert(!obj || PyUnicode_Check(obj));
738 if (obj)
739 n += PyUnicode_GET_SIZE(obj);
740 else
741 n += strlen(str);
742 break;
743 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000744 case 'S':
745 {
746 PyObject *obj = va_arg(count, PyObject *);
747 PyObject *str;
748 assert(obj);
Thomas Heller519a0422007-11-15 20:48:54 +0000749 str = PyObject_Str(obj);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000750 if (!str)
751 goto fail;
752 n += PyUnicode_GET_SIZE(str);
753 /* Remember the str and switch to the next slot */
754 *callresult++ = str;
755 break;
756 }
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000757 case 'R':
758 {
759 PyObject *obj = va_arg(count, PyObject *);
760 PyObject *repr;
761 assert(obj);
762 repr = PyObject_Repr(obj);
763 if (!repr)
764 goto fail;
765 n += PyUnicode_GET_SIZE(repr);
766 /* Remember the repr and switch to the next slot */
767 *callresult++ = repr;
768 break;
769 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000770 case 'p':
771 (void) va_arg(count, int);
772 /* maximum 64-bit pointer representation:
773 * 0xffffffffffffffff
774 * so 19 characters is enough.
775 * XXX I count 18 -- what's the extra for?
776 */
777 n += 19;
778 break;
779 default:
780 /* if we stumble upon an unknown
781 formatting code, copy the rest of
782 the format string to the output
783 string. (we cannot just skip the
784 code, since there's no way to know
785 what's in the argument list) */
786 n += strlen(p);
787 goto expand;
788 }
789 } else
790 n++;
791 }
792 expand:
Walter Dörwald346737f2007-05-31 10:44:43 +0000793 if (abuffersize > 20) {
Christian Heimesb186d002008-03-18 15:15:01 +0000794 abuffer = PyObject_Malloc(abuffersize);
Walter Dörwald346737f2007-05-31 10:44:43 +0000795 if (!abuffer) {
796 PyErr_NoMemory();
797 goto fail;
798 }
799 realbuffer = abuffer;
800 }
801 else
802 realbuffer = buffer;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000803 /* step 4: fill the buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000804 /* Since we've analyzed how much space we need for the worst case,
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000805 we don't have to resize the string.
806 There can be no errors beyond this point. */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000807 string = PyUnicode_FromUnicode(NULL, n);
808 if (!string)
Walter Dörwald346737f2007-05-31 10:44:43 +0000809 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000810
811 s = PyUnicode_AS_UNICODE(string);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000812 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000813
814 for (f = format; *f; f++) {
815 if (*f == '%') {
816 const char* p = f++;
817 int longflag = 0;
818 int size_tflag = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000819 zeropad = (*f == '0');
820 /* parse the width.precision part */
821 width = 0;
Christian Heimesfe337bf2008-03-23 21:54:12 +0000822 while (ISDIGIT((unsigned)*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000823 width = (width*10) + *f++ - '0';
824 precision = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000825 if (*f == '.') {
826 f++;
Christian Heimesfe337bf2008-03-23 21:54:12 +0000827 while (ISDIGIT((unsigned)*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000828 precision = (precision*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000829 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000830 /* handle the long flag, but only for %ld and %lu.
831 others can be added when necessary. */
832 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
833 longflag = 1;
834 ++f;
835 }
836 /* handle the size_t flag. */
837 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
838 size_tflag = 1;
839 ++f;
840 }
841
842 switch (*f) {
843 case 'c':
844 *s++ = va_arg(vargs, int);
845 break;
846 case 'd':
Walter Dörwald346737f2007-05-31 10:44:43 +0000847 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000848 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000849 sprintf(realbuffer, fmt, va_arg(vargs, long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000850 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000851 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000852 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000853 sprintf(realbuffer, fmt, va_arg(vargs, int));
854 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000855 break;
856 case 'u':
Walter Dörwald346737f2007-05-31 10:44:43 +0000857 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000858 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000859 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000860 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000861 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000862 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000863 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
864 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000865 break;
866 case 'i':
Walter Dörwald346737f2007-05-31 10:44:43 +0000867 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
868 sprintf(realbuffer, fmt, va_arg(vargs, int));
869 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000870 break;
871 case 'x':
Walter Dörwald346737f2007-05-31 10:44:43 +0000872 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
873 sprintf(realbuffer, fmt, va_arg(vargs, int));
874 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000875 break;
876 case 's':
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000877 {
878 /* Parameter must be UTF-8 encoded.
879 In case of encoding errors, use
880 the replacement character. */
881 PyObject *u;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000882 p = va_arg(vargs, char*);
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000883 u = PyUnicode_DecodeUTF8(p, strlen(p),
884 "replace");
885 if (!u)
886 goto fail;
887 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
888 PyUnicode_GET_SIZE(u));
889 s += PyUnicode_GET_SIZE(u);
890 Py_DECREF(u);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000891 break;
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000892 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000893 case 'U':
894 {
895 PyObject *obj = va_arg(vargs, PyObject *);
Walter Dörwald5c2fab62007-05-24 19:51:02 +0000896 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
897 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
898 s += size;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000899 break;
900 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000901 case 'V':
902 {
903 PyObject *obj = va_arg(vargs, PyObject *);
904 const char *str = va_arg(vargs, const char *);
905 if (obj) {
906 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
907 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
908 s += size;
909 } else {
910 appendstring(str);
911 }
912 break;
913 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000914 case 'S':
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000915 case 'R':
916 {
Guido van Rossum755114a2007-06-13 01:04:27 +0000917 Py_UNICODE *ucopy;
918 Py_ssize_t usize;
919 Py_ssize_t upos;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000920 /* unused, since we already have the result */
921 (void) va_arg(vargs, PyObject *);
Guido van Rossum755114a2007-06-13 01:04:27 +0000922 ucopy = PyUnicode_AS_UNICODE(*callresult);
923 usize = PyUnicode_GET_SIZE(*callresult);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000924 for (upos = 0; upos<usize;)
925 *s++ = ucopy[upos++];
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000926 /* We're done with the unicode()/repr() => forget it */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000927 Py_DECREF(*callresult);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000928 /* switch to next unicode()/repr() result */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000929 ++callresult;
930 break;
931 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000932 case 'p':
933 sprintf(buffer, "%p", va_arg(vargs, void*));
934 /* %p is ill-defined: ensure leading 0x. */
935 if (buffer[1] == 'X')
936 buffer[1] = 'x';
937 else if (buffer[1] != 'x') {
938 memmove(buffer+2, buffer, strlen(buffer)+1);
939 buffer[0] = '0';
940 buffer[1] = 'x';
941 }
942 appendstring(buffer);
943 break;
944 case '%':
945 *s++ = '%';
946 break;
947 default:
948 appendstring(p);
949 goto end;
950 }
951 } else
952 *s++ = *f;
953 }
954
955 end:
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000956 if (callresults)
Christian Heimesb186d002008-03-18 15:15:01 +0000957 PyObject_Free(callresults);
Walter Dörwald346737f2007-05-31 10:44:43 +0000958 if (abuffer)
Christian Heimesb186d002008-03-18 15:15:01 +0000959 PyObject_Free(abuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000960 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
961 return string;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000962 fail:
963 if (callresults) {
964 PyObject **callresult2 = callresults;
Guido van Rossum307fa8c2007-07-16 20:46:27 +0000965 while (callresult2 < callresult) {
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000966 Py_DECREF(*callresult2);
967 ++callresult2;
968 }
Christian Heimesb186d002008-03-18 15:15:01 +0000969 PyObject_Free(callresults);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000970 }
Walter Dörwald346737f2007-05-31 10:44:43 +0000971 if (abuffer)
Christian Heimesb186d002008-03-18 15:15:01 +0000972 PyObject_Free(abuffer);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000973 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000974}
975
976#undef appendstring
977
978PyObject *
979PyUnicode_FromFormat(const char *format, ...)
980{
981 PyObject* ret;
982 va_list vargs;
983
984#ifdef HAVE_STDARG_PROTOTYPES
985 va_start(vargs, format);
986#else
987 va_start(vargs);
988#endif
989 ret = PyUnicode_FromFormatV(format, vargs);
990 va_end(vargs);
991 return ret;
992}
993
Martin v. Löwis18e16552006-02-15 17:27:45 +0000994Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
995 wchar_t *w,
996 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000997{
998 if (unicode == NULL) {
999 PyErr_BadInternalCall();
1000 return -1;
1001 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001002
1003 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001004 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001005 size = PyUnicode_GET_SIZE(unicode) + 1;
1006
Guido van Rossumd57fd912000-03-10 22:53:23 +00001007#ifdef HAVE_USABLE_WCHAR_T
1008 memcpy(w, unicode->str, size * sizeof(wchar_t));
1009#else
1010 {
1011 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001012 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001013 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +00001014 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001015 *w++ = *u++;
1016 }
1017#endif
1018
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001019 if (size > PyUnicode_GET_SIZE(unicode))
1020 return PyUnicode_GET_SIZE(unicode);
1021 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00001022 return size;
1023}
1024
1025#endif
1026
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001027PyObject *PyUnicode_FromOrdinal(int ordinal)
1028{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001029 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001030
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001031 if (ordinal < 0 || ordinal > 0x10ffff) {
1032 PyErr_SetString(PyExc_ValueError,
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001033 "chr() arg not in range(0x110000)");
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001034 return NULL;
1035 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001036
1037#ifndef Py_UNICODE_WIDE
1038 if (ordinal > 0xffff) {
1039 ordinal -= 0x10000;
1040 s[0] = 0xD800 | (ordinal >> 10);
1041 s[1] = 0xDC00 | (ordinal & 0x3FF);
1042 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001043 }
1044#endif
1045
Hye-Shik Chang40574832004-04-06 07:24:51 +00001046 s[0] = (Py_UNICODE)ordinal;
1047 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001048}
1049
Guido van Rossumd57fd912000-03-10 22:53:23 +00001050PyObject *PyUnicode_FromObject(register PyObject *obj)
1051{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001052 /* XXX Perhaps we should make this API an alias of
Thomas Heller519a0422007-11-15 20:48:54 +00001053 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001054 if (PyUnicode_CheckExact(obj)) {
1055 Py_INCREF(obj);
1056 return obj;
1057 }
1058 if (PyUnicode_Check(obj)) {
1059 /* For a Unicode subtype that's not a Unicode object,
1060 return a true Unicode object with the same data. */
1061 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1062 PyUnicode_GET_SIZE(obj));
1063 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001064 PyErr_Format(PyExc_TypeError,
1065 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001066 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001067 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001068}
1069
1070PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1071 const char *encoding,
1072 const char *errors)
1073{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001074 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001075 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001076 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001077
Guido van Rossumd57fd912000-03-10 22:53:23 +00001078 if (obj == NULL) {
1079 PyErr_BadInternalCall();
1080 return NULL;
1081 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001082
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001083 if (PyUnicode_Check(obj)) {
1084 PyErr_SetString(PyExc_TypeError,
1085 "decoding Unicode is not supported");
1086 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001087 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001088
1089 /* Coerce object */
1090 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001091 s = PyString_AS_STRING(obj);
1092 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001093 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001094 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1095 /* Overwrite the error message with something more useful in
1096 case of a TypeError. */
1097 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001098 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001099 "coercing to Unicode: need string or buffer, "
1100 "%.80s found",
Christian Heimes90aa7642007-12-19 02:45:37 +00001101 Py_TYPE(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001102 goto onError;
1103 }
Tim Petersced69f82003-09-16 20:30:58 +00001104
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001105 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001106 if (len == 0) {
1107 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001108 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001109 }
Tim Petersced69f82003-09-16 20:30:58 +00001110 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001111 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001112
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001113 return v;
1114
1115 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001116 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001117}
1118
1119PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001120 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001121 const char *encoding,
1122 const char *errors)
1123{
1124 PyObject *buffer = NULL, *unicode;
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001125 Py_buffer info;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001126 char lower[20]; /* Enough for any encoding name we recognize */
1127 char *l;
1128 const char *e;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001129
1130 if (encoding == NULL)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001131 encoding = PyUnicode_GetDefaultEncoding();
1132
1133 /* Convert encoding to lower case and replace '_' with '-' in order to
1134 catch e.g. UTF_8 */
1135 e = encoding;
1136 l = lower;
1137 while (*e && l < &lower[(sizeof lower) - 2]) {
1138 if (ISUPPER(*e)) {
1139 *l++ = TOLOWER(*e++);
1140 }
1141 else if (*e == '_') {
1142 *l++ = '-';
1143 e++;
1144 }
1145 else {
1146 *l++ = *e++;
1147 }
1148 }
1149 *l = '\0';
Fred Drakee4315f52000-05-09 19:53:39 +00001150
1151 /* Shortcuts for common default encodings */
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001152 if (strcmp(lower, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001153 return PyUnicode_DecodeUTF8(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001154 else if ((strcmp(lower, "latin-1") == 0) ||
1155 (strcmp(lower, "iso-8859-1") == 0))
Fred Drakee4315f52000-05-09 19:53:39 +00001156 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001157#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001158 else if (strcmp(lower, "mbcs") == 0)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001159 return PyUnicode_DecodeMBCS(s, size, errors);
1160#endif
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001161 else if (strcmp(lower, "ascii") == 0)
Fred Drakee4315f52000-05-09 19:53:39 +00001162 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001163 else if (strcmp(lower, "utf-16") == 0)
1164 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1165 else if (strcmp(lower, "utf-32") == 0)
1166 return PyUnicode_DecodeUTF32(s, size, errors, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001167
1168 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001169 buffer = NULL;
1170 if (PyBuffer_FillInfo(&info, (void *)s, size, 1, PyBUF_SIMPLE) < 0)
1171 goto onError;
1172 buffer = PyMemoryView_FromMemory(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001173 if (buffer == NULL)
1174 goto onError;
1175 unicode = PyCodec_Decode(buffer, encoding, errors);
1176 if (unicode == NULL)
1177 goto onError;
1178 if (!PyUnicode_Check(unicode)) {
1179 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001180 "decoder did not return an unicode object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001181 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001182 Py_DECREF(unicode);
1183 goto onError;
1184 }
1185 Py_DECREF(buffer);
1186 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001187
Guido van Rossumd57fd912000-03-10 22:53:23 +00001188 onError:
1189 Py_XDECREF(buffer);
1190 return NULL;
1191}
1192
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001193PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1194 const char *encoding,
1195 const char *errors)
1196{
1197 PyObject *v;
1198
1199 if (!PyUnicode_Check(unicode)) {
1200 PyErr_BadArgument();
1201 goto onError;
1202 }
1203
1204 if (encoding == NULL)
1205 encoding = PyUnicode_GetDefaultEncoding();
1206
1207 /* Decode via the codec registry */
1208 v = PyCodec_Decode(unicode, encoding, errors);
1209 if (v == NULL)
1210 goto onError;
1211 return v;
1212
1213 onError:
1214 return NULL;
1215}
1216
Guido van Rossumd57fd912000-03-10 22:53:23 +00001217PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001218 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001219 const char *encoding,
1220 const char *errors)
1221{
1222 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001223
Guido van Rossumd57fd912000-03-10 22:53:23 +00001224 unicode = PyUnicode_FromUnicode(s, size);
1225 if (unicode == NULL)
1226 return NULL;
1227 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1228 Py_DECREF(unicode);
1229 return v;
1230}
1231
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001232PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1233 const char *encoding,
1234 const char *errors)
1235{
1236 PyObject *v;
1237
1238 if (!PyUnicode_Check(unicode)) {
1239 PyErr_BadArgument();
1240 goto onError;
1241 }
1242
1243 if (encoding == NULL)
1244 encoding = PyUnicode_GetDefaultEncoding();
1245
1246 /* Encode via the codec registry */
1247 v = PyCodec_Encode(unicode, encoding, errors);
1248 if (v == NULL)
1249 goto onError;
1250 return v;
1251
1252 onError:
1253 return NULL;
1254}
1255
Guido van Rossumd57fd912000-03-10 22:53:23 +00001256PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1257 const char *encoding,
1258 const char *errors)
1259{
1260 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001261
Guido van Rossumd57fd912000-03-10 22:53:23 +00001262 if (!PyUnicode_Check(unicode)) {
1263 PyErr_BadArgument();
1264 goto onError;
1265 }
Fred Drakee4315f52000-05-09 19:53:39 +00001266
Tim Petersced69f82003-09-16 20:30:58 +00001267 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001268 encoding = PyUnicode_GetDefaultEncoding();
1269
1270 /* Shortcuts for common default encodings */
1271 if (errors == NULL) {
1272 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001273 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001274 else if (strcmp(encoding, "latin-1") == 0)
1275 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001276#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1277 else if (strcmp(encoding, "mbcs") == 0)
1278 return PyUnicode_AsMBCSString(unicode);
1279#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001280 else if (strcmp(encoding, "ascii") == 0)
1281 return PyUnicode_AsASCIIString(unicode);
1282 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001283
1284 /* Encode via the codec registry */
1285 v = PyCodec_Encode(unicode, encoding, errors);
1286 if (v == NULL)
1287 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00001288 assert(PyString_Check(v));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001289 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001290
Guido van Rossumd57fd912000-03-10 22:53:23 +00001291 onError:
1292 return NULL;
1293}
1294
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001295PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1296 const char *errors)
1297{
1298 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001299 if (v)
1300 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001301 if (errors != NULL)
1302 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001303 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001304 PyUnicode_GET_SIZE(unicode),
1305 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001306 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001307 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001308 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001309 return v;
1310}
1311
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001312PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001313PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001314 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001315 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1316}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001317
Christian Heimes5894ba72007-11-04 11:43:14 +00001318PyObject*
1319PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1320{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001321 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1322 can be undefined. If it is case, decode using UTF-8. The following assumes
1323 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1324 bootstrapping process where the codecs aren't ready yet.
1325 */
1326 if (Py_FileSystemDefaultEncoding) {
1327#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001328 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001329 return PyUnicode_DecodeMBCS(s, size, "replace");
1330 }
1331#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001332 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001333 return PyUnicode_DecodeUTF8(s, size, "replace");
1334 }
1335#endif
1336 return PyUnicode_Decode(s, size,
1337 Py_FileSystemDefaultEncoding,
1338 "replace");
1339 }
1340 else {
1341 return PyUnicode_DecodeUTF8(s, size, "replace");
1342 }
1343}
1344
Martin v. Löwis5b222132007-06-10 09:51:05 +00001345char*
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001346PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001347{
Christian Heimesf3863112007-11-22 07:46:41 +00001348 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001349 if (!PyUnicode_Check(unicode)) {
1350 PyErr_BadArgument();
1351 return NULL;
1352 }
Christian Heimesf3863112007-11-22 07:46:41 +00001353 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1354 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001355 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001356 if (psize != NULL)
Christian Heimesf3863112007-11-22 07:46:41 +00001357 *psize = PyString_GET_SIZE(bytes);
1358 return PyString_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001359}
1360
1361char*
1362PyUnicode_AsString(PyObject *unicode)
1363{
1364 return PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001365}
1366
Guido van Rossumd57fd912000-03-10 22:53:23 +00001367Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1368{
1369 if (!PyUnicode_Check(unicode)) {
1370 PyErr_BadArgument();
1371 goto onError;
1372 }
1373 return PyUnicode_AS_UNICODE(unicode);
1374
1375 onError:
1376 return NULL;
1377}
1378
Martin v. Löwis18e16552006-02-15 17:27:45 +00001379Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001380{
1381 if (!PyUnicode_Check(unicode)) {
1382 PyErr_BadArgument();
1383 goto onError;
1384 }
1385 return PyUnicode_GET_SIZE(unicode);
1386
1387 onError:
1388 return -1;
1389}
1390
Thomas Wouters78890102000-07-22 19:25:51 +00001391const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001392{
1393 return unicode_default_encoding;
1394}
1395
1396int PyUnicode_SetDefaultEncoding(const char *encoding)
1397{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001398 if (strcmp(encoding, unicode_default_encoding) != 0) {
1399 PyErr_Format(PyExc_ValueError,
1400 "Can only set default encoding to %s",
1401 unicode_default_encoding);
1402 return -1;
1403 }
Fred Drakee4315f52000-05-09 19:53:39 +00001404 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001405}
1406
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001407/* error handling callback helper:
1408 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001409 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001410 and adjust various state variables.
1411 return 0 on success, -1 on error
1412*/
1413
1414static
1415int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1416 const char *encoding, const char *reason,
Walter Dörwalda651d3d2007-08-30 15:29:21 +00001417 const char **input, const char **inend, Py_ssize_t *startinpos,
1418 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001419 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001420{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001421 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001422
1423 PyObject *restuple = NULL;
1424 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001425 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001426 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001427 Py_ssize_t requiredsize;
1428 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001429 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001430 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001431 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001432 int res = -1;
1433
1434 if (*errorHandler == NULL) {
1435 *errorHandler = PyCodec_LookupError(errors);
1436 if (*errorHandler == NULL)
1437 goto onError;
1438 }
1439
1440 if (*exceptionObject == NULL) {
1441 *exceptionObject = PyUnicodeDecodeError_Create(
Walter Dörwalde78178e2007-07-30 13:31:40 +00001442 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001443 if (*exceptionObject == NULL)
1444 goto onError;
1445 }
1446 else {
1447 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1448 goto onError;
1449 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1450 goto onError;
1451 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1452 goto onError;
1453 }
1454
1455 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1456 if (restuple == NULL)
1457 goto onError;
1458 if (!PyTuple_Check(restuple)) {
1459 PyErr_Format(PyExc_TypeError, &argparse[4]);
1460 goto onError;
1461 }
1462 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1463 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001464
1465 /* Copy back the bytes variables, which might have been modified by the
1466 callback */
1467 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1468 if (!inputobj)
1469 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00001470 if (!PyString_Check(inputobj)) {
Walter Dörwalde78178e2007-07-30 13:31:40 +00001471 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
1472 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001473 *input = PyString_AS_STRING(inputobj);
1474 insize = PyString_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001475 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001476 /* we can DECREF safely, as the exception has another reference,
1477 so the object won't go away. */
1478 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001479
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001480 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001481 newpos = insize+newpos;
1482 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001483 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001484 goto onError;
1485 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001486
1487 /* need more space? (at least enough for what we
1488 have+the replacement+the rest of the string (starting
1489 at the new input position), so we won't have to check space
1490 when there are no errors in the rest of the string) */
1491 repptr = PyUnicode_AS_UNICODE(repunicode);
1492 repsize = PyUnicode_GET_SIZE(repunicode);
1493 requiredsize = *outpos + repsize + insize-newpos;
1494 if (requiredsize > outsize) {
1495 if (requiredsize<2*outsize)
1496 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001497 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001498 goto onError;
1499 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1500 }
1501 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001502 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001503 Py_UNICODE_COPY(*outptr, repptr, repsize);
1504 *outptr += repsize;
1505 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001506
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001507 /* we made it! */
1508 res = 0;
1509
1510 onError:
1511 Py_XDECREF(restuple);
1512 return res;
1513}
1514
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001515/* --- UTF-7 Codec -------------------------------------------------------- */
1516
1517/* see RFC2152 for details */
1518
/* Classification of the 128 ASCII characters for the UTF-7 codec
   (see RFC 2152).  Each entry says whether the character is "special",
   i.e. cannot be written directly and has to be base64-encoded:

       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional)

   The table is only ever read (through the SPECIAL() macro), so it is
   declared const. */
static
const char utf7_special[128] = {
    /* 0x00 - 0x0f: control characters; TAB, LF and CR count as whitespace */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f: control characters */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f: space and punctuation */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f: digits and punctuation */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f: '@' and 'A'..'O' */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f: 'P'..'Z' and punctuation */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f: '`' and 'a'..'o' */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f: 'p'..'z', punctuation and DEL */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};
1537
/* Helper macros for the UTF-7 codec. */

/* SPECIAL(c, encodeO, encodeWS) is true when character c may not be
   written directly: it is outside the ASCII range, it is classified as 1
   in utf7_special, or it is whitespace (2) / Set O (3) and the matching
   flag asks for those to be encoded as well.

   Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* B64(n): base64 digit for the low six bits of n. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true if c is a base64 digit. */
#define B64CHAR(c) \
    (ISALNUM(c) || (c) == '+' || (c) == '/')

/* UB64(c): six-bit value of base64 digit c; c must satisfy B64CHAR(). */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE(out, ch, bits): while at least six of the `bits` low bits of
   `ch` are pending, write them through `out` as base64 digits and reduce
   `bits` accordingly.  `out` and `bits` are modified in place.

   Wrapped in do { } while (0) so that the macro is a single statement and
   can be used safely in an unbraced if/else. */
#define ENCODE(out, ch, bits)                           \
    do {                                                \
        while ((bits) >= 6) {                           \
            *(out)++ = B64((ch) >> ((bits) - 6));       \
            (bits) -= 6;                                \
        }                                               \
    } while (0)

/* DECODE(out, ch, bits, surrogate): while at least sixteen of the `bits`
   low bits of `ch` are pending, extract one 16-bit code unit and store it
   through `out`.  A code unit in the range 0xDC00..0xDFFF is rejected:
   `surrogate` is set, `errmsg` is assigned and control jumps to the
   `utf7Error` label, both of which the expanding function must provide.
   The code unit that follows a rejected one is dropped silently. */
#define DECODE(out, ch, bits, surrogate)                                    \
    do {                                                                    \
        while ((bits) >= 16) {                                              \
            Py_UNICODE outCh =                                              \
                (Py_UNICODE) (((ch) >> ((bits) - 16)) & 0xffff);            \
            (bits) -= 16;                                                   \
            if (surrogate) {                                                \
                /* We have already generated an error for the high */       \
                /* surrogate so let's not bother seeing if the low */       \
                /* surrogate is correct or not */                           \
                (surrogate) = 0;                                            \
            } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
                /* This is a surrogate pair. Unfortunately we can't */      \
                /* represent it in a 16-bit character */                    \
                (surrogate) = 1;                                            \
                errmsg = "code pairs are not supported";                    \
                goto utf7Error;                                             \
            } else {                                                        \
                *(out)++ = outCh;                                           \
            }                                                               \
        }                                                                   \
    } while (0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001580
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001581PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001582 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001583 const char *errors)
1584{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001585 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1586}
1587
1588PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
1589 Py_ssize_t size,
1590 const char *errors,
1591 Py_ssize_t *consumed)
1592{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001593 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001594 Py_ssize_t startinpos;
1595 Py_ssize_t endinpos;
1596 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001597 const char *e;
1598 PyUnicodeObject *unicode;
1599 Py_UNICODE *p;
1600 const char *errmsg = "";
1601 int inShift = 0;
1602 unsigned int bitsleft = 0;
1603 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001604 int surrogate = 0;
1605 PyObject *errorHandler = NULL;
1606 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001607
1608 unicode = _PyUnicode_New(size);
1609 if (!unicode)
1610 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001611 if (size == 0) {
1612 if (consumed)
1613 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001614 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001615 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001616
1617 p = unicode->str;
1618 e = s + size;
1619
1620 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001621 Py_UNICODE ch;
1622 restart:
1623 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001624
1625 if (inShift) {
1626 if ((ch == '-') || !B64CHAR(ch)) {
1627 inShift = 0;
1628 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001629
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001630 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1631 if (bitsleft >= 6) {
1632 /* The shift sequence has a partial character in it. If
1633 bitsleft < 6 then we could just classify it as padding
1634 but that is not the case here */
1635
1636 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001637 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001638 }
1639 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001640 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001641 here so indicate the potential of a misencoded character. */
1642
1643 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1644 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1645 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001646 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001647 }
1648
1649 if (ch == '-') {
1650 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001651 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001652 inShift = 1;
1653 }
1654 } else if (SPECIAL(ch,0,0)) {
1655 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001656 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001657 } else {
1658 *p++ = ch;
1659 }
1660 } else {
1661 charsleft = (charsleft << 6) | UB64(ch);
1662 bitsleft += 6;
1663 s++;
1664 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1665 }
1666 }
1667 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001668 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001669 s++;
1670 if (s < e && *s == '-') {
1671 s++;
1672 *p++ = '+';
1673 } else
1674 {
1675 inShift = 1;
1676 bitsleft = 0;
1677 }
1678 }
1679 else if (SPECIAL(ch,0,0)) {
Walter Dörwald2b65c752007-08-30 15:35:26 +00001680 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001681 errmsg = "unexpected special character";
1682 s++;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001683 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001684 }
1685 else {
1686 *p++ = ch;
1687 s++;
1688 }
1689 continue;
1690 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001691 outpos = p-PyUnicode_AS_UNICODE(unicode);
1692 endinpos = s-starts;
1693 if (unicode_decode_call_errorhandler(
1694 errors, &errorHandler,
1695 "utf7", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001696 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001697 (PyObject **)&unicode, &outpos, &p))
1698 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001699 }
1700
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001701 if (inShift && !consumed) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001702 outpos = p-PyUnicode_AS_UNICODE(unicode);
1703 endinpos = size;
1704 if (unicode_decode_call_errorhandler(
1705 errors, &errorHandler,
1706 "utf7", "unterminated shift sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00001707 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001708 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001709 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001710 if (s < e)
1711 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001712 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001713 if (consumed) {
1714 if(inShift)
1715 *consumed = startinpos;
1716 else
1717 *consumed = s-starts;
1718 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001719
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001720 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001721 goto onError;
1722
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001723 Py_XDECREF(errorHandler);
1724 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001725 return (PyObject *)unicode;
1726
1727onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001728 Py_XDECREF(errorHandler);
1729 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001730 Py_DECREF(unicode);
1731 return NULL;
1732}
1733
1734
1735PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001736 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001737 int encodeSetO,
1738 int encodeWhiteSpace,
1739 const char *errors)
1740{
Guido van Rossum98297ee2007-11-06 21:34:58 +00001741 PyObject *v, *result;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001742 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001743 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001744 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001745 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001746 unsigned int bitsleft = 0;
1747 unsigned long charsleft = 0;
1748 char * out;
1749 char * start;
1750
1751 if (size == 0)
Christian Heimesf3863112007-11-22 07:46:41 +00001752 return PyString_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001753
Walter Dörwald51ab4142007-05-05 14:43:36 +00001754 v = PyBytes_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001755 if (v == NULL)
1756 return NULL;
1757
Walter Dörwald51ab4142007-05-05 14:43:36 +00001758 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001759 for (;i < size; ++i) {
1760 Py_UNICODE ch = s[i];
1761
1762 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001763 if (ch == '+') {
1764 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001765 *out++ = '-';
1766 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1767 charsleft = ch;
1768 bitsleft = 16;
1769 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001770 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001771 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001772 } else {
1773 *out++ = (char) ch;
1774 }
1775 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001776 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1777 *out++ = B64(charsleft << (6-bitsleft));
1778 charsleft = 0;
1779 bitsleft = 0;
1780 /* Characters not in the BASE64 set implicitly unshift the sequence
1781 so no '-' is required, except if the character is itself a '-' */
1782 if (B64CHAR(ch) || ch == '-') {
1783 *out++ = '-';
1784 }
1785 inShift = 0;
1786 *out++ = (char) ch;
1787 } else {
1788 bitsleft += 16;
1789 charsleft = (charsleft << 16) | ch;
1790 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1791
1792 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001793 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001794 or '-' then the shift sequence will be terminated implicitly and we
1795 don't have to insert a '-'. */
1796
1797 if (bitsleft == 0) {
1798 if (i + 1 < size) {
1799 Py_UNICODE ch2 = s[i+1];
1800
1801 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001802
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001803 } else if (B64CHAR(ch2) || ch2 == '-') {
1804 *out++ = '-';
1805 inShift = 0;
1806 } else {
1807 inShift = 0;
1808 }
1809
1810 }
1811 else {
1812 *out++ = '-';
1813 inShift = 0;
1814 }
1815 }
Tim Petersced69f82003-09-16 20:30:58 +00001816 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001817 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001818 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001819 if (bitsleft) {
1820 *out++= B64(charsleft << (6-bitsleft) );
1821 *out++ = '-';
1822 }
1823
Guido van Rossum98297ee2007-11-06 21:34:58 +00001824 result = PyString_FromStringAndSize(PyBytes_AS_STRING(v), out - start);
1825 Py_DECREF(v);
1826 return result;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001827}
1828
1829#undef SPECIAL
1830#undef B64
1831#undef B64CHAR
1832#undef UB64
1833#undef ENCODE
1834#undef DECODE
1835
Guido van Rossumd57fd912000-03-10 22:53:23 +00001836/* --- UTF-8 Codec -------------------------------------------------------- */
1837
/* Map UTF-8 encoded prefix byte to sequence length.  zero means
   illegal prefix.  see RFC 2279 for details.

   The table is only ever read, so it is declared const. */
static
const char utf8_code_length[256] = {
    /* 0x00 - 0x7f: single-byte (ASCII) characters */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xbf: continuation bytes, illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xc0 - 0xdf: start of a two-byte sequence */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xe0 - 0xef: start of a three-byte sequence */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xf0 - 0xff: four-, five- and six-byte sequences; 0xfe and 0xff
       are illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1859
Guido van Rossumd57fd912000-03-10 22:53:23 +00001860PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001861 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001862 const char *errors)
1863{
Walter Dörwald69652032004-09-07 20:24:22 +00001864 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1865}
1866
1867PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001868 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001869 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001870 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001871{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001872 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001873 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001874 Py_ssize_t startinpos;
1875 Py_ssize_t endinpos;
1876 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001877 const char *e;
1878 PyUnicodeObject *unicode;
1879 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001880 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001881 PyObject *errorHandler = NULL;
1882 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001883
1884 /* Note: size will always be longer than the resulting Unicode
1885 character count */
1886 unicode = _PyUnicode_New(size);
1887 if (!unicode)
1888 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001889 if (size == 0) {
1890 if (consumed)
1891 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001892 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001893 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001894
1895 /* Unpack UTF-8 encoded data */
1896 p = unicode->str;
1897 e = s + size;
1898
1899 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001900 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001901
1902 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001903 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001904 s++;
1905 continue;
1906 }
1907
1908 n = utf8_code_length[ch];
1909
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001910 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001911 if (consumed)
1912 break;
1913 else {
1914 errmsg = "unexpected end of data";
1915 startinpos = s-starts;
1916 endinpos = size;
1917 goto utf8Error;
1918 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001919 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001920
1921 switch (n) {
1922
1923 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001924 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001925 startinpos = s-starts;
1926 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001927 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001928
1929 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001930 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001931 startinpos = s-starts;
1932 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001933 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001934
1935 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001936 if ((s[1] & 0xc0) != 0x80) {
1937 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001938 startinpos = s-starts;
1939 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001940 goto utf8Error;
1941 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001942 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001943 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001944 startinpos = s-starts;
1945 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001946 errmsg = "illegal encoding";
1947 goto utf8Error;
1948 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001949 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001950 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001951 break;
1952
1953 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001954 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001955 (s[2] & 0xc0) != 0x80) {
1956 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001957 startinpos = s-starts;
1958 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001959 goto utf8Error;
1960 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001961 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001962 if (ch < 0x0800) {
1963 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001964 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001965
1966 XXX For wide builds (UCS-4) we should probably try
1967 to recombine the surrogates into a single code
1968 unit.
1969 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001970 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001971 startinpos = s-starts;
1972 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001973 goto utf8Error;
1974 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001975 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001976 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001977 break;
1978
1979 case 4:
1980 if ((s[1] & 0xc0) != 0x80 ||
1981 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001982 (s[3] & 0xc0) != 0x80) {
1983 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001984 startinpos = s-starts;
1985 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001986 goto utf8Error;
1987 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001988 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1989 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1990 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001991 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001992 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001993 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001994 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001995 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001996 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001997 startinpos = s-starts;
1998 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001999 goto utf8Error;
2000 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00002001#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002002 *p++ = (Py_UNICODE)ch;
2003#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002004 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002005
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002006 /* translate from 10000..10FFFF to 0..FFFF */
2007 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002008
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002009 /* high surrogate = top 10 bits added to D800 */
2010 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002011
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002012 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002013 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002014#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002015 break;
2016
2017 default:
2018 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002019 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002020 startinpos = s-starts;
2021 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002022 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002023 }
2024 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002025 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002026
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002027 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002028 outpos = p-PyUnicode_AS_UNICODE(unicode);
2029 if (unicode_decode_call_errorhandler(
2030 errors, &errorHandler,
2031 "utf8", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002032 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002033 (PyObject **)&unicode, &outpos, &p))
2034 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002035 }
Walter Dörwald69652032004-09-07 20:24:22 +00002036 if (consumed)
2037 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002038
2039 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002040 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002041 goto onError;
2042
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002043 Py_XDECREF(errorHandler);
2044 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002045 return (PyObject *)unicode;
2046
2047onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002048 Py_XDECREF(errorHandler);
2049 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002050 Py_DECREF(unicode);
2051 return NULL;
2052}
2053
Tim Peters602f7402002-04-27 18:03:26 +00002054/* Allocation strategy: if the string is short, convert into a stack buffer
2055 and allocate exactly as much space needed at the end. Else allocate the
2056 maximum possible needed (4 result bytes per Unicode character), and return
2057 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002058*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002059PyObject *
2060PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002061 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00002062 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002063{
Tim Peters602f7402002-04-27 18:03:26 +00002064#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002065
Guido van Rossum98297ee2007-11-06 21:34:58 +00002066 Py_ssize_t i; /* index into s of next input byte */
2067 PyObject *result; /* result string object */
2068 char *p; /* next free byte in output buffer */
2069 Py_ssize_t nallocated; /* number of result bytes allocated */
2070 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002071 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002072
Tim Peters602f7402002-04-27 18:03:26 +00002073 assert(s != NULL);
2074 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002075
Tim Peters602f7402002-04-27 18:03:26 +00002076 if (size <= MAX_SHORT_UNICHARS) {
2077 /* Write into the stack buffer; nallocated can't overflow.
2078 * At the end, we'll allocate exactly as much heap space as it
2079 * turns out we need.
2080 */
2081 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002082 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002083 p = stackbuf;
2084 }
2085 else {
2086 /* Overallocate on the heap, and give the excess back at the end. */
2087 nallocated = size * 4;
2088 if (nallocated / 4 != size) /* overflow! */
2089 return PyErr_NoMemory();
Guido van Rossum98297ee2007-11-06 21:34:58 +00002090 result = PyString_FromStringAndSize(NULL, nallocated);
2091 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002092 return NULL;
Guido van Rossum98297ee2007-11-06 21:34:58 +00002093 p = PyString_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002094 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002095
Tim Peters602f7402002-04-27 18:03:26 +00002096 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002097 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002098
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002099 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002100 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002101 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002102
Guido van Rossumd57fd912000-03-10 22:53:23 +00002103 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002104 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002105 *p++ = (char)(0xc0 | (ch >> 6));
2106 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002107 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002108 else {
Tim Peters602f7402002-04-27 18:03:26 +00002109 /* Encode UCS2 Unicode ordinals */
2110 if (ch < 0x10000) {
2111 /* Special case: check for high surrogate */
2112 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2113 Py_UCS4 ch2 = s[i];
2114 /* Check for low surrogate and combine the two to
2115 form a UCS4 value */
2116 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002117 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002118 i++;
2119 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002120 }
Tim Peters602f7402002-04-27 18:03:26 +00002121 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002122 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002123 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002124 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2125 *p++ = (char)(0x80 | (ch & 0x3f));
2126 continue;
2127 }
2128encodeUCS4:
2129 /* Encode UCS4 Unicode ordinals */
2130 *p++ = (char)(0xf0 | (ch >> 18));
2131 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2132 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2133 *p++ = (char)(0x80 | (ch & 0x3f));
2134 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002135 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002136
Guido van Rossum98297ee2007-11-06 21:34:58 +00002137 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002138 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002139 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002140 assert(nneeded <= nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002141 result = PyString_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002142 }
2143 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002144 /* Cut back to size actually needed. */
Guido van Rossum98297ee2007-11-06 21:34:58 +00002145 nneeded = p - PyString_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002146 assert(nneeded <= nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002147 _PyString_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002148 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002149 return result;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002150
Tim Peters602f7402002-04-27 18:03:26 +00002151#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002152}
2153
Guido van Rossumd57fd912000-03-10 22:53:23 +00002154PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2155{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002156 if (!PyUnicode_Check(unicode)) {
2157 PyErr_BadArgument();
2158 return NULL;
2159 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002160 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2161 PyUnicode_GET_SIZE(unicode),
2162 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002163}
2164
Walter Dörwald41980ca2007-08-16 21:55:45 +00002165/* --- UTF-32 Codec ------------------------------------------------------- */
2166
2167PyObject *
2168PyUnicode_DecodeUTF32(const char *s,
2169 Py_ssize_t size,
2170 const char *errors,
2171 int *byteorder)
2172{
2173 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2174}
2175
2176PyObject *
2177PyUnicode_DecodeUTF32Stateful(const char *s,
2178 Py_ssize_t size,
2179 const char *errors,
2180 int *byteorder,
2181 Py_ssize_t *consumed)
2182{
2183 const char *starts = s;
2184 Py_ssize_t startinpos;
2185 Py_ssize_t endinpos;
2186 Py_ssize_t outpos;
2187 PyUnicodeObject *unicode;
2188 Py_UNICODE *p;
2189#ifndef Py_UNICODE_WIDE
2190 int i, pairs;
2191#else
2192 const int pairs = 0;
2193#endif
2194 const unsigned char *q, *e;
2195 int bo = 0; /* assume native ordering by default */
2196 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002197 /* Offsets from q for retrieving bytes in the right order. */
2198#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2199 int iorder[] = {0, 1, 2, 3};
2200#else
2201 int iorder[] = {3, 2, 1, 0};
2202#endif
2203 PyObject *errorHandler = NULL;
2204 PyObject *exc = NULL;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002205 /* On narrow builds we split characters outside the BMP into two
2206 codepoints => count how much extra space we need. */
2207#ifndef Py_UNICODE_WIDE
2208 for (i = pairs = 0; i < size/4; i++)
2209 if (((Py_UCS4 *)s)[i] >= 0x10000)
2210 pairs++;
2211#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00002212
2213 /* This might be one to much, because of a BOM */
2214 unicode = _PyUnicode_New((size+3)/4+pairs);
2215 if (!unicode)
2216 return NULL;
2217 if (size == 0)
2218 return (PyObject *)unicode;
2219
2220 /* Unpack UTF-32 encoded data */
2221 p = unicode->str;
2222 q = (unsigned char *)s;
2223 e = q + size;
2224
2225 if (byteorder)
2226 bo = *byteorder;
2227
2228 /* Check for BOM marks (U+FEFF) in the input and adjust current
2229 byte order setting accordingly. In native mode, the leading BOM
2230 mark is skipped, in all other modes, it is copied to the output
2231 stream as-is (giving a ZWNBSP character). */
2232 if (bo == 0) {
2233 if (size >= 4) {
2234 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2235 (q[iorder[1]] << 8) | q[iorder[0]];
2236#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2237 if (bom == 0x0000FEFF) {
2238 q += 4;
2239 bo = -1;
2240 }
2241 else if (bom == 0xFFFE0000) {
2242 q += 4;
2243 bo = 1;
2244 }
2245#else
2246 if (bom == 0x0000FEFF) {
2247 q += 4;
2248 bo = 1;
2249 }
2250 else if (bom == 0xFFFE0000) {
2251 q += 4;
2252 bo = -1;
2253 }
2254#endif
2255 }
2256 }
2257
2258 if (bo == -1) {
2259 /* force LE */
2260 iorder[0] = 0;
2261 iorder[1] = 1;
2262 iorder[2] = 2;
2263 iorder[3] = 3;
2264 }
2265 else if (bo == 1) {
2266 /* force BE */
2267 iorder[0] = 3;
2268 iorder[1] = 2;
2269 iorder[2] = 1;
2270 iorder[3] = 0;
2271 }
2272
2273 while (q < e) {
2274 Py_UCS4 ch;
2275 /* remaining bytes at the end? (size should be divisible by 4) */
2276 if (e-q<4) {
2277 if (consumed)
2278 break;
2279 errmsg = "truncated data";
2280 startinpos = ((const char *)q)-starts;
2281 endinpos = ((const char *)e)-starts;
2282 goto utf32Error;
2283 /* The remaining input chars are ignored if the callback
2284 chooses to skip the input */
2285 }
2286 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2287 (q[iorder[1]] << 8) | q[iorder[0]];
2288
2289 if (ch >= 0x110000)
2290 {
2291 errmsg = "codepoint not in range(0x110000)";
2292 startinpos = ((const char *)q)-starts;
2293 endinpos = startinpos+4;
2294 goto utf32Error;
2295 }
2296#ifndef Py_UNICODE_WIDE
2297 if (ch >= 0x10000)
2298 {
2299 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2300 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2301 }
2302 else
2303#endif
2304 *p++ = ch;
2305 q += 4;
2306 continue;
2307 utf32Error:
2308 outpos = p-PyUnicode_AS_UNICODE(unicode);
2309 if (unicode_decode_call_errorhandler(
2310 errors, &errorHandler,
2311 "utf32", errmsg,
2312 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2313 (PyObject **)&unicode, &outpos, &p))
2314 goto onError;
2315 }
2316
2317 if (byteorder)
2318 *byteorder = bo;
2319
2320 if (consumed)
2321 *consumed = (const char *)q-starts;
2322
2323 /* Adjust length */
2324 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2325 goto onError;
2326
2327 Py_XDECREF(errorHandler);
2328 Py_XDECREF(exc);
2329 return (PyObject *)unicode;
2330
2331onError:
2332 Py_DECREF(unicode);
2333 Py_XDECREF(errorHandler);
2334 Py_XDECREF(exc);
2335 return NULL;
2336}
2337
2338PyObject *
2339PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2340 Py_ssize_t size,
2341 const char *errors,
2342 int byteorder)
2343{
Guido van Rossum98297ee2007-11-06 21:34:58 +00002344 PyObject *v, *result;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002345 unsigned char *p;
2346#ifndef Py_UNICODE_WIDE
2347 int i, pairs;
2348#else
2349 const int pairs = 0;
2350#endif
2351 /* Offsets from p for storing byte pairs in the right order. */
2352#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2353 int iorder[] = {0, 1, 2, 3};
2354#else
2355 int iorder[] = {3, 2, 1, 0};
2356#endif
2357
2358#define STORECHAR(CH) \
2359 do { \
2360 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2361 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2362 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2363 p[iorder[0]] = (CH) & 0xff; \
2364 p += 4; \
2365 } while(0)
2366
2367 /* In narrow builds we can output surrogate pairs as one codepoint,
2368 so we need less space. */
2369#ifndef Py_UNICODE_WIDE
2370 for (i = pairs = 0; i < size-1; i++)
2371 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2372 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2373 pairs++;
2374#endif
2375 v = PyBytes_FromStringAndSize(NULL,
2376 4 * (size - pairs + (byteorder == 0)));
2377 if (v == NULL)
2378 return NULL;
2379
2380 p = (unsigned char *)PyBytes_AS_STRING(v);
2381 if (byteorder == 0)
2382 STORECHAR(0xFEFF);
2383 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002384 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002385
2386 if (byteorder == -1) {
2387 /* force LE */
2388 iorder[0] = 0;
2389 iorder[1] = 1;
2390 iorder[2] = 2;
2391 iorder[3] = 3;
2392 }
2393 else if (byteorder == 1) {
2394 /* force BE */
2395 iorder[0] = 3;
2396 iorder[1] = 2;
2397 iorder[2] = 1;
2398 iorder[3] = 0;
2399 }
2400
2401 while (size-- > 0) {
2402 Py_UCS4 ch = *s++;
2403#ifndef Py_UNICODE_WIDE
2404 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2405 Py_UCS4 ch2 = *s;
2406 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2407 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2408 s++;
2409 size--;
2410 }
2411 }
2412#endif
2413 STORECHAR(ch);
2414 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002415
2416 done:
Christian Heimes90aa7642007-12-19 02:45:37 +00002417 result = PyString_FromStringAndSize(PyBytes_AS_STRING(v), Py_SIZE(v));
Guido van Rossum98297ee2007-11-06 21:34:58 +00002418 Py_DECREF(v);
2419 return result;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002420#undef STORECHAR
2421}
2422
2423PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2424{
2425 if (!PyUnicode_Check(unicode)) {
2426 PyErr_BadArgument();
2427 return NULL;
2428 }
2429 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2430 PyUnicode_GET_SIZE(unicode),
2431 NULL,
2432 0);
2433}
2434
Guido van Rossumd57fd912000-03-10 22:53:23 +00002435/* --- UTF-16 Codec ------------------------------------------------------- */
2436
Tim Peters772747b2001-08-09 22:21:55 +00002437PyObject *
2438PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002439 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002440 const char *errors,
2441 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002442{
Walter Dörwald69652032004-09-07 20:24:22 +00002443 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2444}
2445
2446PyObject *
2447PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002448 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002449 const char *errors,
2450 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002451 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002452{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002453 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002454 Py_ssize_t startinpos;
2455 Py_ssize_t endinpos;
2456 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002457 PyUnicodeObject *unicode;
2458 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002459 const unsigned char *q, *e;
2460 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002461 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002462 /* Offsets from q for retrieving byte pairs in the right order. */
2463#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2464 int ihi = 1, ilo = 0;
2465#else
2466 int ihi = 0, ilo = 1;
2467#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002468 PyObject *errorHandler = NULL;
2469 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002470
2471 /* Note: size will always be longer than the resulting Unicode
2472 character count */
2473 unicode = _PyUnicode_New(size);
2474 if (!unicode)
2475 return NULL;
2476 if (size == 0)
2477 return (PyObject *)unicode;
2478
2479 /* Unpack UTF-16 encoded data */
2480 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002481 q = (unsigned char *)s;
2482 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002483
2484 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002485 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002486
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002487 /* Check for BOM marks (U+FEFF) in the input and adjust current
2488 byte order setting accordingly. In native mode, the leading BOM
2489 mark is skipped, in all other modes, it is copied to the output
2490 stream as-is (giving a ZWNBSP character). */
2491 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002492 if (size >= 2) {
2493 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002494#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002495 if (bom == 0xFEFF) {
2496 q += 2;
2497 bo = -1;
2498 }
2499 else if (bom == 0xFFFE) {
2500 q += 2;
2501 bo = 1;
2502 }
Tim Petersced69f82003-09-16 20:30:58 +00002503#else
Walter Dörwald69652032004-09-07 20:24:22 +00002504 if (bom == 0xFEFF) {
2505 q += 2;
2506 bo = 1;
2507 }
2508 else if (bom == 0xFFFE) {
2509 q += 2;
2510 bo = -1;
2511 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002512#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002513 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002514 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002515
Tim Peters772747b2001-08-09 22:21:55 +00002516 if (bo == -1) {
2517 /* force LE */
2518 ihi = 1;
2519 ilo = 0;
2520 }
2521 else if (bo == 1) {
2522 /* force BE */
2523 ihi = 0;
2524 ilo = 1;
2525 }
2526
2527 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002528 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002529 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002530 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002531 if (consumed)
2532 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002533 errmsg = "truncated data";
2534 startinpos = ((const char *)q)-starts;
2535 endinpos = ((const char *)e)-starts;
2536 goto utf16Error;
2537 /* The remaining input chars are ignored if the callback
2538 chooses to skip the input */
2539 }
2540 ch = (q[ihi] << 8) | q[ilo];
2541
Tim Peters772747b2001-08-09 22:21:55 +00002542 q += 2;
2543
Guido van Rossumd57fd912000-03-10 22:53:23 +00002544 if (ch < 0xD800 || ch > 0xDFFF) {
2545 *p++ = ch;
2546 continue;
2547 }
2548
2549 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002550 if (q >= e) {
2551 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002552 startinpos = (((const char *)q)-2)-starts;
2553 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002554 goto utf16Error;
2555 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002556 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002557 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2558 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002559 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002560#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002561 *p++ = ch;
2562 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002563#else
2564 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002565#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002566 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002567 }
2568 else {
2569 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002570 startinpos = (((const char *)q)-4)-starts;
2571 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002572 goto utf16Error;
2573 }
2574
Guido van Rossumd57fd912000-03-10 22:53:23 +00002575 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002576 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002577 startinpos = (((const char *)q)-2)-starts;
2578 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002579 /* Fall through to report the error */
2580
2581 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002582 outpos = p-PyUnicode_AS_UNICODE(unicode);
2583 if (unicode_decode_call_errorhandler(
2584 errors, &errorHandler,
2585 "utf16", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002586 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002587 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002588 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002589 }
2590
2591 if (byteorder)
2592 *byteorder = bo;
2593
Walter Dörwald69652032004-09-07 20:24:22 +00002594 if (consumed)
2595 *consumed = (const char *)q-starts;
2596
Guido van Rossumd57fd912000-03-10 22:53:23 +00002597 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002598 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002599 goto onError;
2600
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002601 Py_XDECREF(errorHandler);
2602 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002603 return (PyObject *)unicode;
2604
2605onError:
2606 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002607 Py_XDECREF(errorHandler);
2608 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002609 return NULL;
2610}
2611
Tim Peters772747b2001-08-09 22:21:55 +00002612PyObject *
2613PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002614 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002615 const char *errors,
2616 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002617{
Guido van Rossum98297ee2007-11-06 21:34:58 +00002618 PyObject *v, *result;
Tim Peters772747b2001-08-09 22:21:55 +00002619 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002620#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00002621 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002622#else
2623 const int pairs = 0;
2624#endif
Tim Peters772747b2001-08-09 22:21:55 +00002625 /* Offsets from p for storing byte pairs in the right order. */
2626#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2627 int ihi = 1, ilo = 0;
2628#else
2629 int ihi = 0, ilo = 1;
2630#endif
2631
2632#define STORECHAR(CH) \
2633 do { \
2634 p[ihi] = ((CH) >> 8) & 0xff; \
2635 p[ilo] = (CH) & 0xff; \
2636 p += 2; \
2637 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002638
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002639#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002640 for (i = pairs = 0; i < size; i++)
2641 if (s[i] >= 0x10000)
2642 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002643#endif
Walter Dörwald3cc34522007-05-04 10:48:27 +00002644 v = PyBytes_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002645 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002646 if (v == NULL)
2647 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002648
Walter Dörwald3cc34522007-05-04 10:48:27 +00002649 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002650 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002651 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002652 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002653 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00002654
2655 if (byteorder == -1) {
2656 /* force LE */
2657 ihi = 1;
2658 ilo = 0;
2659 }
2660 else if (byteorder == 1) {
2661 /* force BE */
2662 ihi = 0;
2663 ilo = 1;
2664 }
2665
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002666 while (size-- > 0) {
2667 Py_UNICODE ch = *s++;
2668 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002669#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002670 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002671 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2672 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002673 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002674#endif
Tim Peters772747b2001-08-09 22:21:55 +00002675 STORECHAR(ch);
2676 if (ch2)
2677 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002678 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002679
2680 done:
Christian Heimes90aa7642007-12-19 02:45:37 +00002681 result = PyString_FromStringAndSize(PyBytes_AS_STRING(v), Py_SIZE(v));
Guido van Rossum98297ee2007-11-06 21:34:58 +00002682 Py_DECREF(v);
2683 return result;
Tim Peters772747b2001-08-09 22:21:55 +00002684#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002685}
2686
2687PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2688{
2689 if (!PyUnicode_Check(unicode)) {
2690 PyErr_BadArgument();
2691 return NULL;
2692 }
2693 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2694 PyUnicode_GET_SIZE(unicode),
2695 NULL,
2696 0);
2697}
2698
2699/* --- Unicode Escape Codec ----------------------------------------------- */
2700
/* Cached pointer to the character-name lookup API exported by the
   unicodedata module.  It stays NULL until PyUnicode_DecodeUnicodeEscape()
   meets its first \N escape and fetches the module's "ucnhash_CAPI"
   attribute. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00002701static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002702
Guido van Rossumd57fd912000-03-10 22:53:23 +00002703PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002704 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002705 const char *errors)
2706{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002707 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002708 Py_ssize_t startinpos;
2709 Py_ssize_t endinpos;
2710 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002711 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002712 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002713 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002714 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002715 char* message;
2716 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002717 PyObject *errorHandler = NULL;
2718 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002719
Guido van Rossumd57fd912000-03-10 22:53:23 +00002720 /* Escaped strings will always be longer than the resulting
2721 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002722 length after conversion to the true value.
2723 (but if the error callback returns a long replacement string
2724 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002725 v = _PyUnicode_New(size);
2726 if (v == NULL)
2727 goto onError;
2728 if (size == 0)
2729 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002730
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002731 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002732 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002733
Guido van Rossumd57fd912000-03-10 22:53:23 +00002734 while (s < end) {
2735 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002736 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002737 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002738
2739 /* Non-escape characters are interpreted as Unicode ordinals */
2740 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002741 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002742 continue;
2743 }
2744
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002745 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002746 /* \ - Escapes */
2747 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002748 c = *s++;
2749 if (s > end)
2750 c = '\0'; /* Invalid after \ */
2751 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002752
2753 /* \x escapes */
2754 case '\n': break;
2755 case '\\': *p++ = '\\'; break;
2756 case '\'': *p++ = '\''; break;
2757 case '\"': *p++ = '\"'; break;
2758 case 'b': *p++ = '\b'; break;
2759 case 'f': *p++ = '\014'; break; /* FF */
2760 case 't': *p++ = '\t'; break;
2761 case 'n': *p++ = '\n'; break;
2762 case 'r': *p++ = '\r'; break;
2763 case 'v': *p++ = '\013'; break; /* VT */
2764 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2765
2766 /* \OOO (octal) escapes */
2767 case '0': case '1': case '2': case '3':
2768 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002769 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002770 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002771 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002772 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002773 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002774 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002775 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002776 break;
2777
Fredrik Lundhccc74732001-02-18 22:13:49 +00002778 /* hex escapes */
2779 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002780 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002781 digits = 2;
2782 message = "truncated \\xXX escape";
2783 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002784
Fredrik Lundhccc74732001-02-18 22:13:49 +00002785 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002786 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002787 digits = 4;
2788 message = "truncated \\uXXXX escape";
2789 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002790
Fredrik Lundhccc74732001-02-18 22:13:49 +00002791 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002792 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002793 digits = 8;
2794 message = "truncated \\UXXXXXXXX escape";
2795 hexescape:
2796 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002797 outpos = p-PyUnicode_AS_UNICODE(v);
2798 if (s+digits>end) {
2799 endinpos = size;
2800 if (unicode_decode_call_errorhandler(
2801 errors, &errorHandler,
2802 "unicodeescape", "end of string in escape sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002803 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002804 (PyObject **)&v, &outpos, &p))
2805 goto onError;
2806 goto nextByte;
2807 }
2808 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002809 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00002810 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002811 endinpos = (s+i+1)-starts;
2812 if (unicode_decode_call_errorhandler(
2813 errors, &errorHandler,
2814 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002815 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002816 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002817 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002818 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002819 }
2820 chr = (chr<<4) & ~0xF;
2821 if (c >= '0' && c <= '9')
2822 chr += c - '0';
2823 else if (c >= 'a' && c <= 'f')
2824 chr += 10 + c - 'a';
2825 else
2826 chr += 10 + c - 'A';
2827 }
2828 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002829 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002830 /* _decoding_error will have already written into the
2831 target buffer. */
2832 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002833 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002834 /* when we get here, chr is a 32-bit unicode character */
2835 if (chr <= 0xffff)
2836 /* UCS-2 character */
2837 *p++ = (Py_UNICODE) chr;
2838 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002839 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002840 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002841#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002842 *p++ = chr;
2843#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002844 chr -= 0x10000L;
2845 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002846 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002847#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002848 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002849 endinpos = s-starts;
2850 outpos = p-PyUnicode_AS_UNICODE(v);
2851 if (unicode_decode_call_errorhandler(
2852 errors, &errorHandler,
2853 "unicodeescape", "illegal Unicode character",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002854 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002855 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002856 goto onError;
2857 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002858 break;
2859
2860 /* \N{name} */
2861 case 'N':
2862 message = "malformed \\N character escape";
2863 if (ucnhash_CAPI == NULL) {
2864 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002865 PyObject *m, *api;
Christian Heimes072c0f12008-01-03 23:01:04 +00002866 m = PyImport_ImportModuleNoBlock("unicodedata");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002867 if (m == NULL)
2868 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002869 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002870 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002871 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002872 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002873 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002874 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002875 if (ucnhash_CAPI == NULL)
2876 goto ucnhashError;
2877 }
2878 if (*s == '{') {
2879 const char *start = s+1;
2880 /* look for the closing brace */
2881 while (*s != '}' && s < end)
2882 s++;
2883 if (s > start && s < end && *s == '}') {
2884 /* found a name. look it up in the unicode database */
2885 message = "unknown Unicode character name";
2886 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002887 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002888 goto store;
2889 }
2890 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002891 endinpos = s-starts;
2892 outpos = p-PyUnicode_AS_UNICODE(v);
2893 if (unicode_decode_call_errorhandler(
2894 errors, &errorHandler,
2895 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002896 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002897 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002898 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002899 break;
2900
2901 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002902 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002903 message = "\\ at end of string";
2904 s--;
2905 endinpos = s-starts;
2906 outpos = p-PyUnicode_AS_UNICODE(v);
2907 if (unicode_decode_call_errorhandler(
2908 errors, &errorHandler,
2909 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002910 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002911 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002912 goto onError;
2913 }
2914 else {
2915 *p++ = '\\';
2916 *p++ = (unsigned char)s[-1];
2917 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002918 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002919 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002920 nextByte:
2921 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002922 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002923 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002924 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002925 Py_XDECREF(errorHandler);
2926 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002927 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002928
Fredrik Lundhccc74732001-02-18 22:13:49 +00002929ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002930 PyErr_SetString(
2931 PyExc_UnicodeError,
2932 "\\N escapes not supported (can't load unicodedata module)"
2933 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002934 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002935 Py_XDECREF(errorHandler);
2936 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002937 return NULL;
2938
Fredrik Lundhccc74732001-02-18 22:13:49 +00002939onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002940 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002941 Py_XDECREF(errorHandler);
2942 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002943 return NULL;
2944}
2945
2946/* Return a Unicode-Escape string version of the Unicode object.
2947
2948 If quotes is true, the string is enclosed in u"" or u'' quotes as
2949 appropriate.
2950
2951*/
2952
Thomas Wouters477c8d52006-05-27 19:21:47 +00002953Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2954 Py_ssize_t size,
2955 Py_UNICODE ch)
2956{
2957 /* like wcschr, but doesn't stop at NULL characters */
2958
2959 while (size-- > 0) {
2960 if (*s == ch)
2961 return s;
2962 s++;
2963 }
2964
2965 return NULL;
2966}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002967
/* Lowercase hexadecimal digits, indexed by a 4-bit value (0-15); used
   below to spell out the digits of \U escapes. */
Walter Dörwald79e913e2007-05-12 11:08:06 +00002968static const char *hexdigits = "0123456789abcdef";
2969
2970PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2971 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002972{
Guido van Rossum98297ee2007-11-06 21:34:58 +00002973 PyObject *repr, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002974 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002975
Thomas Wouters89f507f2006-12-13 04:49:30 +00002976 /* XXX(nnorwitz): rather than over-allocating, it would be
2977 better to choose a different scheme. Perhaps scan the
2978 first N-chars of the string and allocate based on that size.
2979 */
2980 /* Initial allocation is based on the longest-possible unichr
2981 escape.
2982
2983 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2984 unichr, so in this case it's the longest unichr escape. In
2985 narrow (UTF-16) builds this is five chars per source unichr
2986 since there are two unichrs in the surrogate pair, so in narrow
2987 (UTF-16) builds it's not the longest unichr escape.
2988
2989 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2990 so in the narrow (UTF-16) build case it's the longest unichr
2991 escape.
2992 */
2993
Walter Dörwald79e913e2007-05-12 11:08:06 +00002994 repr = PyBytes_FromStringAndSize(NULL,
Thomas Wouters89f507f2006-12-13 04:49:30 +00002995#ifdef Py_UNICODE_WIDE
2996 + 10*size
2997#else
2998 + 6*size
2999#endif
3000 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003001 if (repr == NULL)
3002 return NULL;
3003
Walter Dörwald79e913e2007-05-12 11:08:06 +00003004 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003005
Guido van Rossumd57fd912000-03-10 22:53:23 +00003006 while (size-- > 0) {
3007 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003008
Walter Dörwald79e913e2007-05-12 11:08:06 +00003009 /* Escape backslashes */
3010 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003011 *p++ = '\\';
3012 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003013 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003014 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003015
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003016#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003017 /* Map 21-bit characters to '\U00xxxxxx' */
3018 else if (ch >= 0x10000) {
3019 *p++ = '\\';
3020 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003021 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3022 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3023 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3024 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3025 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3026 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3027 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3028 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003029 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003030 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003031#else
3032 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003033 else if (ch >= 0xD800 && ch < 0xDC00) {
3034 Py_UNICODE ch2;
3035 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003036
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003037 ch2 = *s++;
3038 size--;
3039 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3040 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3041 *p++ = '\\';
3042 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003043 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3044 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3045 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3046 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3047 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3048 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3049 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3050 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003051 continue;
3052 }
3053 /* Fall through: isolated surrogates are copied as-is */
3054 s--;
3055 size++;
3056 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003057#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003058
Guido van Rossumd57fd912000-03-10 22:53:23 +00003059 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003060 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003061 *p++ = '\\';
3062 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003063 *p++ = hexdigits[(ch >> 12) & 0x000F];
3064 *p++ = hexdigits[(ch >> 8) & 0x000F];
3065 *p++ = hexdigits[(ch >> 4) & 0x000F];
3066 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003067 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003068
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003069 /* Map special whitespace to '\t', \n', '\r' */
3070 else if (ch == '\t') {
3071 *p++ = '\\';
3072 *p++ = 't';
3073 }
3074 else if (ch == '\n') {
3075 *p++ = '\\';
3076 *p++ = 'n';
3077 }
3078 else if (ch == '\r') {
3079 *p++ = '\\';
3080 *p++ = 'r';
3081 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003082
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003083 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003084 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003085 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003086 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003087 *p++ = hexdigits[(ch >> 4) & 0x000F];
3088 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003089 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003090
Guido van Rossumd57fd912000-03-10 22:53:23 +00003091 /* Copy everything else as-is */
3092 else
3093 *p++ = (char) ch;
3094 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003095
Guido van Rossum98297ee2007-11-06 21:34:58 +00003096 result = PyString_FromStringAndSize(PyBytes_AS_STRING(repr),
3097 p - PyBytes_AS_STRING(repr));
3098 Py_DECREF(repr);
3099 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003100}
3101
Guido van Rossumd57fd912000-03-10 22:53:23 +00003102PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3103{
Walter Dörwald79e913e2007-05-12 11:08:06 +00003104 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003105 if (!PyUnicode_Check(unicode)) {
3106 PyErr_BadArgument();
3107 return NULL;
3108 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003109 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3110 PyUnicode_GET_SIZE(unicode));
3111
3112 if (!s)
3113 return NULL;
3114 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
3115 PyBytes_GET_SIZE(s));
3116 Py_DECREF(s);
3117 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003118}
3119
3120/* --- Raw Unicode Escape Codec ------------------------------------------- */
3121
3122PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003123 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003124 const char *errors)
3125{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003126 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003127 Py_ssize_t startinpos;
3128 Py_ssize_t endinpos;
3129 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003130 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003131 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003132 const char *end;
3133 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003134 PyObject *errorHandler = NULL;
3135 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003136
Guido van Rossumd57fd912000-03-10 22:53:23 +00003137 /* Escaped strings will always be longer than the resulting
3138 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003139 length after conversion to the true value. (But decoding error
3140 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003141 v = _PyUnicode_New(size);
3142 if (v == NULL)
3143 goto onError;
3144 if (size == 0)
3145 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003146 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003147 end = s + size;
3148 while (s < end) {
3149 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00003150 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003151 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003152 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003153
3154 /* Non-escape characters are interpreted as Unicode ordinals */
3155 if (*s != '\\') {
3156 *p++ = (unsigned char)*s++;
3157 continue;
3158 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003159 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003160
3161 /* \u-escapes are only interpreted iff the number of leading
3162 backslashes if odd */
3163 bs = s;
3164 for (;s < end;) {
3165 if (*s != '\\')
3166 break;
3167 *p++ = (unsigned char)*s++;
3168 }
3169 if (((s - bs) & 1) == 0 ||
3170 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003171 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003172 continue;
3173 }
3174 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003175 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003176 s++;
3177
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003178 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003179 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003180 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003181 c = (unsigned char)*s;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003182 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003183 endinpos = s-starts;
3184 if (unicode_decode_call_errorhandler(
3185 errors, &errorHandler,
3186 "rawunicodeescape", "truncated \\uXXXX",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003187 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003188 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003189 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003190 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003191 }
3192 x = (x<<4) & ~0xF;
3193 if (c >= '0' && c <= '9')
3194 x += c - '0';
3195 else if (c >= 'a' && c <= 'f')
3196 x += 10 + c - 'a';
3197 else
3198 x += 10 + c - 'A';
3199 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00003200 if (x <= 0xffff)
3201 /* UCS-2 character */
3202 *p++ = (Py_UNICODE) x;
3203 else if (x <= 0x10ffff) {
3204 /* UCS-4 character. Either store directly, or as
3205 surrogate pair. */
3206#ifdef Py_UNICODE_WIDE
Christian Heimescc47b052008-03-25 14:56:36 +00003207 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003208#else
3209 x -= 0x10000L;
3210 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3211 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3212#endif
3213 } else {
3214 endinpos = s-starts;
3215 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003216 if (unicode_decode_call_errorhandler(
3217 errors, &errorHandler,
3218 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003219 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003220 (PyObject **)&v, &outpos, &p))
3221 goto onError;
3222 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003223 nextByte:
3224 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003225 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003226 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003227 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003228 Py_XDECREF(errorHandler);
3229 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003230 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003231
Guido van Rossumd57fd912000-03-10 22:53:23 +00003232 onError:
3233 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003234 Py_XDECREF(errorHandler);
3235 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003236 return NULL;
3237}
3238
3239PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003240 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003241{
Guido van Rossum98297ee2007-11-06 21:34:58 +00003242 PyObject *repr, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003243 char *p;
3244 char *q;
3245
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003246#ifdef Py_UNICODE_WIDE
Walter Dörwald711005d2007-05-12 12:03:26 +00003247 repr = PyBytes_FromStringAndSize(NULL, 10 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003248#else
Walter Dörwald711005d2007-05-12 12:03:26 +00003249 repr = PyBytes_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003250#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003251 if (repr == NULL)
3252 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003253 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003254 goto done;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003255
Walter Dörwald711005d2007-05-12 12:03:26 +00003256 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003257 while (size-- > 0) {
3258 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003259#ifdef Py_UNICODE_WIDE
3260 /* Map 32-bit characters to '\Uxxxxxxxx' */
3261 if (ch >= 0x10000) {
3262 *p++ = '\\';
3263 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003264 *p++ = hexdigits[(ch >> 28) & 0xf];
3265 *p++ = hexdigits[(ch >> 24) & 0xf];
3266 *p++ = hexdigits[(ch >> 20) & 0xf];
3267 *p++ = hexdigits[(ch >> 16) & 0xf];
3268 *p++ = hexdigits[(ch >> 12) & 0xf];
3269 *p++ = hexdigits[(ch >> 8) & 0xf];
3270 *p++ = hexdigits[(ch >> 4) & 0xf];
3271 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003272 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003273 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00003274#else
3275 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3276 if (ch >= 0xD800 && ch < 0xDC00) {
3277 Py_UNICODE ch2;
3278 Py_UCS4 ucs;
3279
3280 ch2 = *s++;
3281 size--;
3282 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3283 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3284 *p++ = '\\';
3285 *p++ = 'U';
3286 *p++ = hexdigits[(ucs >> 28) & 0xf];
3287 *p++ = hexdigits[(ucs >> 24) & 0xf];
3288 *p++ = hexdigits[(ucs >> 20) & 0xf];
3289 *p++ = hexdigits[(ucs >> 16) & 0xf];
3290 *p++ = hexdigits[(ucs >> 12) & 0xf];
3291 *p++ = hexdigits[(ucs >> 8) & 0xf];
3292 *p++ = hexdigits[(ucs >> 4) & 0xf];
3293 *p++ = hexdigits[ucs & 0xf];
3294 continue;
3295 }
3296 /* Fall through: isolated surrogates are copied as-is */
3297 s--;
3298 size++;
3299 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003300#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003301 /* Map 16-bit characters to '\uxxxx' */
3302 if (ch >= 256) {
3303 *p++ = '\\';
3304 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003305 *p++ = hexdigits[(ch >> 12) & 0xf];
3306 *p++ = hexdigits[(ch >> 8) & 0xf];
3307 *p++ = hexdigits[(ch >> 4) & 0xf];
3308 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003309 }
3310 /* Copy everything else as-is */
3311 else
3312 *p++ = (char) ch;
3313 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003314 size = p - q;
3315
3316 done:
3317 result = PyString_FromStringAndSize(PyBytes_AS_STRING(repr), size);
3318 Py_DECREF(repr);
3319 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003320}
3321
3322PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3323{
Walter Dörwald711005d2007-05-12 12:03:26 +00003324 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003325 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003326 PyErr_BadArgument();
3327 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003328 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003329 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3330 PyUnicode_GET_SIZE(unicode));
3331
3332 if (!s)
3333 return NULL;
3334 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
3335 PyBytes_GET_SIZE(s));
3336 Py_DECREF(s);
3337 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003338}
3339
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003340/* --- Unicode Internal Codec ------------------------------------------- */
3341
3342PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003343 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003344 const char *errors)
3345{
3346 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003347 Py_ssize_t startinpos;
3348 Py_ssize_t endinpos;
3349 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003350 PyUnicodeObject *v;
3351 Py_UNICODE *p;
3352 const char *end;
3353 const char *reason;
3354 PyObject *errorHandler = NULL;
3355 PyObject *exc = NULL;
3356
Neal Norwitzd43069c2006-01-08 01:12:10 +00003357#ifdef Py_UNICODE_WIDE
3358 Py_UNICODE unimax = PyUnicode_GetMax();
3359#endif
3360
Thomas Wouters89f507f2006-12-13 04:49:30 +00003361 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003362 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3363 if (v == NULL)
3364 goto onError;
3365 if (PyUnicode_GetSize((PyObject *)v) == 0)
3366 return (PyObject *)v;
3367 p = PyUnicode_AS_UNICODE(v);
3368 end = s + size;
3369
3370 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00003371 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003372 /* We have to sanity check the raw data, otherwise doom looms for
3373 some malformed UCS-4 data. */
3374 if (
3375 #ifdef Py_UNICODE_WIDE
3376 *p > unimax || *p < 0 ||
3377 #endif
3378 end-s < Py_UNICODE_SIZE
3379 )
3380 {
3381 startinpos = s - starts;
3382 if (end-s < Py_UNICODE_SIZE) {
3383 endinpos = end-starts;
3384 reason = "truncated input";
3385 }
3386 else {
3387 endinpos = s - starts + Py_UNICODE_SIZE;
3388 reason = "illegal code point (> 0x10FFFF)";
3389 }
3390 outpos = p - PyUnicode_AS_UNICODE(v);
3391 if (unicode_decode_call_errorhandler(
3392 errors, &errorHandler,
3393 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003394 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003395 (PyObject **)&v, &outpos, &p)) {
3396 goto onError;
3397 }
3398 }
3399 else {
3400 p++;
3401 s += Py_UNICODE_SIZE;
3402 }
3403 }
3404
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003405 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003406 goto onError;
3407 Py_XDECREF(errorHandler);
3408 Py_XDECREF(exc);
3409 return (PyObject *)v;
3410
3411 onError:
3412 Py_XDECREF(v);
3413 Py_XDECREF(errorHandler);
3414 Py_XDECREF(exc);
3415 return NULL;
3416}
3417
Guido van Rossumd57fd912000-03-10 22:53:23 +00003418/* --- Latin-1 Codec ------------------------------------------------------ */
3419
3420PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003421 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003422 const char *errors)
3423{
3424 PyUnicodeObject *v;
3425 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003426
Guido van Rossumd57fd912000-03-10 22:53:23 +00003427 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003428 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003429 Py_UNICODE r = *(unsigned char*)s;
3430 return PyUnicode_FromUnicode(&r, 1);
3431 }
3432
Guido van Rossumd57fd912000-03-10 22:53:23 +00003433 v = _PyUnicode_New(size);
3434 if (v == NULL)
3435 goto onError;
3436 if (size == 0)
3437 return (PyObject *)v;
3438 p = PyUnicode_AS_UNICODE(v);
3439 while (size-- > 0)
3440 *p++ = (unsigned char)*s++;
3441 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003442
Guido van Rossumd57fd912000-03-10 22:53:23 +00003443 onError:
3444 Py_XDECREF(v);
3445 return NULL;
3446}
3447
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003448/* create or adjust a UnicodeEncodeError */
3449static void make_encode_exception(PyObject **exceptionObject,
3450 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003451 const Py_UNICODE *unicode, Py_ssize_t size,
3452 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003453 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003454{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003455 if (*exceptionObject == NULL) {
3456 *exceptionObject = PyUnicodeEncodeError_Create(
3457 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003458 }
3459 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003460 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3461 goto onError;
3462 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3463 goto onError;
3464 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3465 goto onError;
3466 return;
3467 onError:
3468 Py_DECREF(*exceptionObject);
3469 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003470 }
3471}
3472
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003473/* raises a UnicodeEncodeError */
3474static void raise_encode_exception(PyObject **exceptionObject,
3475 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003476 const Py_UNICODE *unicode, Py_ssize_t size,
3477 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003478 const char *reason)
3479{
3480 make_encode_exception(exceptionObject,
3481 encoding, unicode, size, startpos, endpos, reason);
3482 if (*exceptionObject != NULL)
3483 PyCodec_StrictErrors(*exceptionObject);
3484}
3485
3486/* error handling callback helper:
3487 build arguments, call the callback and check the arguments,
3488 put the result into newpos and return the replacement string, which
3489 has to be freed by the caller */
3490static PyObject *unicode_encode_call_errorhandler(const char *errors,
3491 PyObject **errorHandler,
3492 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003493 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3494 Py_ssize_t startpos, Py_ssize_t endpos,
3495 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003496{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003497 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003498
3499 PyObject *restuple;
3500 PyObject *resunicode;
3501
3502 if (*errorHandler == NULL) {
3503 *errorHandler = PyCodec_LookupError(errors);
3504 if (*errorHandler == NULL)
3505 return NULL;
3506 }
3507
3508 make_encode_exception(exceptionObject,
3509 encoding, unicode, size, startpos, endpos, reason);
3510 if (*exceptionObject == NULL)
3511 return NULL;
3512
3513 restuple = PyObject_CallFunctionObjArgs(
3514 *errorHandler, *exceptionObject, NULL);
3515 if (restuple == NULL)
3516 return NULL;
3517 if (!PyTuple_Check(restuple)) {
3518 PyErr_Format(PyExc_TypeError, &argparse[4]);
3519 Py_DECREF(restuple);
3520 return NULL;
3521 }
3522 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3523 &resunicode, newpos)) {
3524 Py_DECREF(restuple);
3525 return NULL;
3526 }
3527 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003528 *newpos = size+*newpos;
3529 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003530 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003531 Py_DECREF(restuple);
3532 return NULL;
3533 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003534 Py_INCREF(resunicode);
3535 Py_DECREF(restuple);
3536 return resunicode;
3537}
3538
3539static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003540 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003541 const char *errors,
3542 int limit)
3543{
3544 /* output object */
3545 PyObject *res;
3546 /* pointers to the beginning and end+1 of input */
3547 const Py_UNICODE *startp = p;
3548 const Py_UNICODE *endp = p + size;
3549 /* pointer to the beginning of the unencodable characters */
3550 /* const Py_UNICODE *badp = NULL; */
3551 /* pointer into the output */
3552 char *str;
3553 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003554 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003555 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3556 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003557 PyObject *errorHandler = NULL;
3558 PyObject *exc = NULL;
Guido van Rossum98297ee2007-11-06 21:34:58 +00003559 PyObject *result = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003560 /* the following variable is used for caching string comparisons
3561 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3562 int known_errorHandler = -1;
3563
3564 /* allocate enough for a simple encoding without
3565 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00003566 if (size == 0)
3567 return PyString_FromStringAndSize(NULL, 0);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003568 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003569 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003570 return NULL;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003571 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003572 ressize = size;
3573
3574 while (p<endp) {
3575 Py_UNICODE c = *p;
3576
3577 /* can we encode this? */
3578 if (c<limit) {
3579 /* no overflow check, because we know that the space is enough */
3580 *str++ = (char)c;
3581 ++p;
3582 }
3583 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003584 Py_ssize_t unicodepos = p-startp;
3585 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003586 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003587 Py_ssize_t repsize;
3588 Py_ssize_t newpos;
3589 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003590 Py_UNICODE *uni2;
3591 /* startpos for collecting unencodable chars */
3592 const Py_UNICODE *collstart = p;
3593 const Py_UNICODE *collend = p;
3594 /* find all unecodable characters */
3595 while ((collend < endp) && ((*collend)>=limit))
3596 ++collend;
3597 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3598 if (known_errorHandler==-1) {
3599 if ((errors==NULL) || (!strcmp(errors, "strict")))
3600 known_errorHandler = 1;
3601 else if (!strcmp(errors, "replace"))
3602 known_errorHandler = 2;
3603 else if (!strcmp(errors, "ignore"))
3604 known_errorHandler = 3;
3605 else if (!strcmp(errors, "xmlcharrefreplace"))
3606 known_errorHandler = 4;
3607 else
3608 known_errorHandler = 0;
3609 }
3610 switch (known_errorHandler) {
3611 case 1: /* strict */
3612 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3613 goto onError;
3614 case 2: /* replace */
3615 while (collstart++<collend)
3616 *str++ = '?'; /* fall through */
3617 case 3: /* ignore */
3618 p = collend;
3619 break;
3620 case 4: /* xmlcharrefreplace */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003621 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003622 /* determine replacement size (temporarily (mis)uses p) */
3623 for (p = collstart, repsize = 0; p < collend; ++p) {
3624 if (*p<10)
3625 repsize += 2+1+1;
3626 else if (*p<100)
3627 repsize += 2+2+1;
3628 else if (*p<1000)
3629 repsize += 2+3+1;
3630 else if (*p<10000)
3631 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003632#ifndef Py_UNICODE_WIDE
3633 else
3634 repsize += 2+5+1;
3635#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003636 else if (*p<100000)
3637 repsize += 2+5+1;
3638 else if (*p<1000000)
3639 repsize += 2+6+1;
3640 else
3641 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003642#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003643 }
3644 requiredsize = respos+repsize+(endp-collend);
3645 if (requiredsize > ressize) {
3646 if (requiredsize<2*ressize)
3647 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003648 if (PyBytes_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003649 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003650 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003651 ressize = requiredsize;
3652 }
3653 /* generate replacement (temporarily (mis)uses p) */
3654 for (p = collstart; p < collend; ++p) {
3655 str += sprintf(str, "&#%d;", (int)*p);
3656 }
3657 p = collend;
3658 break;
3659 default:
3660 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3661 encoding, reason, startp, size, &exc,
3662 collstart-startp, collend-startp, &newpos);
3663 if (repunicode == NULL)
3664 goto onError;
3665 /* need more space? (at least enough for what we
3666 have+the replacement+the rest of the string, so
3667 we won't have to check space for encodable characters) */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003668 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003669 repsize = PyUnicode_GET_SIZE(repunicode);
3670 requiredsize = respos+repsize+(endp-collend);
3671 if (requiredsize > ressize) {
3672 if (requiredsize<2*ressize)
3673 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003674 if (PyBytes_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003675 Py_DECREF(repunicode);
3676 goto onError;
3677 }
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003678 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003679 ressize = requiredsize;
3680 }
3681 /* check if there is anything unencodable in the replacement
3682 and copy it to the output */
3683 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3684 c = *uni2;
3685 if (c >= limit) {
3686 raise_encode_exception(&exc, encoding, startp, size,
3687 unicodepos, unicodepos+1, reason);
3688 Py_DECREF(repunicode);
3689 goto onError;
3690 }
3691 *str = (char)c;
3692 }
3693 p = startp + newpos;
3694 Py_DECREF(repunicode);
3695 }
3696 }
3697 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003698 result = PyString_FromStringAndSize(PyBytes_AS_STRING(res),
3699 str - PyBytes_AS_STRING(res));
3700 onError:
3701 Py_DECREF(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003702 Py_XDECREF(errorHandler);
3703 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003704 return result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003705}
3706
Guido van Rossumd57fd912000-03-10 22:53:23 +00003707PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003708 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003709 const char *errors)
3710{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003711 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003712}
3713
3714PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3715{
3716 if (!PyUnicode_Check(unicode)) {
3717 PyErr_BadArgument();
3718 return NULL;
3719 }
3720 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3721 PyUnicode_GET_SIZE(unicode),
3722 NULL);
3723}
3724
3725/* --- 7-bit ASCII Codec -------------------------------------------------- */
3726
Guido van Rossumd57fd912000-03-10 22:53:23 +00003727PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003728 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003729 const char *errors)
3730{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003731 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003732 PyUnicodeObject *v;
3733 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003734 Py_ssize_t startinpos;
3735 Py_ssize_t endinpos;
3736 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003737 const char *e;
3738 PyObject *errorHandler = NULL;
3739 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003740
Guido van Rossumd57fd912000-03-10 22:53:23 +00003741 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003742 if (size == 1 && *(unsigned char*)s < 128) {
3743 Py_UNICODE r = *(unsigned char*)s;
3744 return PyUnicode_FromUnicode(&r, 1);
3745 }
Tim Petersced69f82003-09-16 20:30:58 +00003746
Guido van Rossumd57fd912000-03-10 22:53:23 +00003747 v = _PyUnicode_New(size);
3748 if (v == NULL)
3749 goto onError;
3750 if (size == 0)
3751 return (PyObject *)v;
3752 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003753 e = s + size;
3754 while (s < e) {
3755 register unsigned char c = (unsigned char)*s;
3756 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003757 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003758 ++s;
3759 }
3760 else {
3761 startinpos = s-starts;
3762 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003763 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003764 if (unicode_decode_call_errorhandler(
3765 errors, &errorHandler,
3766 "ascii", "ordinal not in range(128)",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003767 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003768 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003769 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003770 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003771 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00003772 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003773 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003774 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003775 Py_XDECREF(errorHandler);
3776 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003777 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003778
Guido van Rossumd57fd912000-03-10 22:53:23 +00003779 onError:
3780 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003781 Py_XDECREF(errorHandler);
3782 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003783 return NULL;
3784}
3785
Guido van Rossumd57fd912000-03-10 22:53:23 +00003786PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003787 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003788 const char *errors)
3789{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003790 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003791}
3792
3793PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3794{
3795 if (!PyUnicode_Check(unicode)) {
3796 PyErr_BadArgument();
3797 return NULL;
3798 }
3799 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3800 PyUnicode_GET_SIZE(unicode),
3801 NULL);
3802}
3803
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003804#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003805
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003806/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003807
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003808#if SIZEOF_INT < SIZEOF_SSIZE_T
3809#define NEED_RETRY
3810#endif
3811
3812/* XXX This code is limited to "true" double-byte encodings, as
3813 a) it assumes an incomplete character consists of a single byte, and
3814 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3815 encodings, see IsDBCSLeadByteEx documentation. */
3816
/* Return 1 if the byte at s[offset] is treated as a DBCS lead byte,
   0 otherwise.  A byte that passes IsDBCSLeadByte() is accepted when the
   previous character (per CharPrev) is the same position, is not itself
   a lead byte, or is exactly two bytes before the current position. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
3827
3828/*
3829 * Decode MBCS string into unicode object. If 'final' is set, converts
3830 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3831 */
3832static int decode_mbcs(PyUnicodeObject **v,
3833 const char *s, /* MBCS string */
3834 int size, /* sizeof MBCS string */
3835 int final)
3836{
3837 Py_UNICODE *p;
3838 Py_ssize_t n = 0;
3839 int usize = 0;
3840
3841 assert(size >= 0);
3842
3843 /* Skip trailing lead-byte unless 'final' is set */
3844 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3845 --size;
3846
3847 /* First get the size of the result */
3848 if (size > 0) {
3849 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3850 if (usize == 0) {
3851 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3852 return -1;
3853 }
3854 }
3855
3856 if (*v == NULL) {
3857 /* Create unicode object */
3858 *v = _PyUnicode_New(usize);
3859 if (*v == NULL)
3860 return -1;
3861 }
3862 else {
3863 /* Extend unicode object */
3864 n = PyUnicode_GET_SIZE(*v);
3865 if (_PyUnicode_Resize(v, n + usize) < 0)
3866 return -1;
3867 }
3868
3869 /* Do the conversion */
3870 if (size > 0) {
3871 p = PyUnicode_AS_UNICODE(*v) + n;
3872 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3873 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3874 return -1;
3875 }
3876 }
3877
3878 return size;
3879}
3880
3881PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3882 Py_ssize_t size,
3883 const char *errors,
3884 Py_ssize_t *consumed)
3885{
3886 PyUnicodeObject *v = NULL;
3887 int done;
3888
3889 if (consumed)
3890 *consumed = 0;
3891
3892#ifdef NEED_RETRY
3893 retry:
3894 if (size > INT_MAX)
3895 done = decode_mbcs(&v, s, INT_MAX, 0);
3896 else
3897#endif
3898 done = decode_mbcs(&v, s, (int)size, !consumed);
3899
3900 if (done < 0) {
3901 Py_XDECREF(v);
3902 return NULL;
3903 }
3904
3905 if (consumed)
3906 *consumed += done;
3907
3908#ifdef NEED_RETRY
3909 if (size > INT_MAX) {
3910 s += done;
3911 size -= done;
3912 goto retry;
3913 }
3914#endif
3915
3916 return (PyObject *)v;
3917}
3918
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003919PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003920 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003921 const char *errors)
3922{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003923 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3924}
3925
3926/*
3927 * Convert unicode into string object (MBCS).
3928 * Returns 0 if succeed, -1 otherwise.
3929 */
3930static int encode_mbcs(PyObject **repr,
3931 const Py_UNICODE *p, /* unicode */
3932 int size) /* size of unicode */
3933{
3934 int mbcssize = 0;
3935 Py_ssize_t n = 0;
3936
3937 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003938
3939 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003940 if (size > 0) {
3941 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3942 if (mbcssize == 0) {
3943 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3944 return -1;
3945 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003946 }
3947
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003948 if (*repr == NULL) {
3949 /* Create string object */
Guido van Rossum98297ee2007-11-06 21:34:58 +00003950 *repr = PyString_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003951 if (*repr == NULL)
3952 return -1;
3953 }
3954 else {
3955 /* Extend string object */
Guido van Rossum98297ee2007-11-06 21:34:58 +00003956 n = PyString_Size(*repr);
3957 if (_PyString_Resize(repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003958 return -1;
3959 }
3960
3961 /* Do the conversion */
3962 if (size > 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00003963 char *s = PyString_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003964 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3965 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3966 return -1;
3967 }
3968 }
3969
3970 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003971}
3972
3973PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003974 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003975 const char *errors)
3976{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003977 PyObject *repr = NULL;
3978 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003979
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003980#ifdef NEED_RETRY
3981 retry:
3982 if (size > INT_MAX)
3983 ret = encode_mbcs(&repr, p, INT_MAX);
3984 else
3985#endif
3986 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003987
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003988 if (ret < 0) {
3989 Py_XDECREF(repr);
3990 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003991 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003992
3993#ifdef NEED_RETRY
3994 if (size > INT_MAX) {
3995 p += INT_MAX;
3996 size -= INT_MAX;
3997 goto retry;
3998 }
3999#endif
4000
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004001 return repr;
4002}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004003
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004004PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4005{
4006 if (!PyUnicode_Check(unicode)) {
4007 PyErr_BadArgument();
4008 return NULL;
4009 }
4010 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
4011 PyUnicode_GET_SIZE(unicode),
4012 NULL);
4013}
4014
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004015#undef NEED_RETRY
4016
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004017#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004018
Guido van Rossumd57fd912000-03-10 22:53:23 +00004019/* --- Character Mapping Codec -------------------------------------------- */
4020
Guido van Rossumd57fd912000-03-10 22:53:23 +00004021PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004022 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004023 PyObject *mapping,
4024 const char *errors)
4025{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004026 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004027 Py_ssize_t startinpos;
4028 Py_ssize_t endinpos;
4029 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004030 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004031 PyUnicodeObject *v;
4032 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004033 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004034 PyObject *errorHandler = NULL;
4035 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004036 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004037 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004038
Guido van Rossumd57fd912000-03-10 22:53:23 +00004039 /* Default to Latin-1 */
4040 if (mapping == NULL)
4041 return PyUnicode_DecodeLatin1(s, size, errors);
4042
4043 v = _PyUnicode_New(size);
4044 if (v == NULL)
4045 goto onError;
4046 if (size == 0)
4047 return (PyObject *)v;
4048 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004049 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004050 if (PyUnicode_CheckExact(mapping)) {
4051 mapstring = PyUnicode_AS_UNICODE(mapping);
4052 maplen = PyUnicode_GET_SIZE(mapping);
4053 while (s < e) {
4054 unsigned char ch = *s;
4055 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004056
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004057 if (ch < maplen)
4058 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004059
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004060 if (x == 0xfffe) {
4061 /* undefined mapping */
4062 outpos = p-PyUnicode_AS_UNICODE(v);
4063 startinpos = s-starts;
4064 endinpos = startinpos+1;
4065 if (unicode_decode_call_errorhandler(
4066 errors, &errorHandler,
4067 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00004068 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004069 (PyObject **)&v, &outpos, &p)) {
4070 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00004071 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004072 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00004073 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004074 *p++ = x;
4075 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004076 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004077 }
4078 else {
4079 while (s < e) {
4080 unsigned char ch = *s;
4081 PyObject *w, *x;
4082
4083 /* Get mapping (char ordinal -> integer, Unicode char or None) */
Christian Heimes217cfd12007-12-02 14:31:20 +00004084 w = PyLong_FromLong((long)ch);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004085 if (w == NULL)
4086 goto onError;
4087 x = PyObject_GetItem(mapping, w);
4088 Py_DECREF(w);
4089 if (x == NULL) {
4090 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4091 /* No mapping found means: mapping is undefined. */
4092 PyErr_Clear();
4093 x = Py_None;
4094 Py_INCREF(x);
4095 } else
4096 goto onError;
4097 }
4098
4099 /* Apply mapping */
Christian Heimes217cfd12007-12-02 14:31:20 +00004100 if (PyLong_Check(x)) {
4101 long value = PyLong_AS_LONG(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004102 if (value < 0 || value > 65535) {
4103 PyErr_SetString(PyExc_TypeError,
4104 "character mapping must be in range(65536)");
4105 Py_DECREF(x);
4106 goto onError;
4107 }
4108 *p++ = (Py_UNICODE)value;
4109 }
4110 else if (x == Py_None) {
4111 /* undefined mapping */
4112 outpos = p-PyUnicode_AS_UNICODE(v);
4113 startinpos = s-starts;
4114 endinpos = startinpos+1;
4115 if (unicode_decode_call_errorhandler(
4116 errors, &errorHandler,
4117 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00004118 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004119 (PyObject **)&v, &outpos, &p)) {
4120 Py_DECREF(x);
4121 goto onError;
4122 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00004123 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004124 continue;
4125 }
4126 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004127 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004128
4129 if (targetsize == 1)
4130 /* 1-1 mapping */
4131 *p++ = *PyUnicode_AS_UNICODE(x);
4132
4133 else if (targetsize > 1) {
4134 /* 1-n mapping */
4135 if (targetsize > extrachars) {
4136 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004137 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4138 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004139 (targetsize << 2);
4140 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00004141 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004142 if (_PyUnicode_Resize(&v,
4143 PyUnicode_GET_SIZE(v) + needed) < 0) {
4144 Py_DECREF(x);
4145 goto onError;
4146 }
4147 p = PyUnicode_AS_UNICODE(v) + oldpos;
4148 }
4149 Py_UNICODE_COPY(p,
4150 PyUnicode_AS_UNICODE(x),
4151 targetsize);
4152 p += targetsize;
4153 extrachars -= targetsize;
4154 }
4155 /* 1-0 mapping: skip the character */
4156 }
4157 else {
4158 /* wrong return value */
4159 PyErr_SetString(PyExc_TypeError,
4160 "character mapping must return integer, None or unicode");
4161 Py_DECREF(x);
4162 goto onError;
4163 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004164 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004165 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004166 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004167 }
4168 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004169 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004170 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004171 Py_XDECREF(errorHandler);
4172 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004173 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004174
Guido van Rossumd57fd912000-03-10 22:53:23 +00004175 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004176 Py_XDECREF(errorHandler);
4177 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004178 Py_XDECREF(v);
4179 return NULL;
4180}
4181
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004182/* Charmap encoding: the lookup table */
4183
4184struct encoding_map{
4185 PyObject_HEAD
4186 unsigned char level1[32];
4187 int count2, count3;
4188 unsigned char level23[1];
4189};
4190
4191static PyObject*
4192encoding_map_size(PyObject *obj, PyObject* args)
4193{
4194 struct encoding_map *map = (struct encoding_map*)obj;
Christian Heimes217cfd12007-12-02 14:31:20 +00004195 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004196 128*map->count3);
4197}
4198
4199static PyMethodDef encoding_map_methods[] = {
4200 {"size", encoding_map_size, METH_NOARGS,
4201 PyDoc_STR("Return the size (in bytes) of this object") },
4202 { 0 }
4203};
4204
4205static void
4206encoding_map_dealloc(PyObject* o)
4207{
4208 PyObject_FREE(o);
4209}
4210
4211static PyTypeObject EncodingMapType = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004212 PyVarObject_HEAD_INIT(NULL, 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004213 "EncodingMap", /*tp_name*/
4214 sizeof(struct encoding_map), /*tp_basicsize*/
4215 0, /*tp_itemsize*/
4216 /* methods */
4217 encoding_map_dealloc, /*tp_dealloc*/
4218 0, /*tp_print*/
4219 0, /*tp_getattr*/
4220 0, /*tp_setattr*/
4221 0, /*tp_compare*/
4222 0, /*tp_repr*/
4223 0, /*tp_as_number*/
4224 0, /*tp_as_sequence*/
4225 0, /*tp_as_mapping*/
4226 0, /*tp_hash*/
4227 0, /*tp_call*/
4228 0, /*tp_str*/
4229 0, /*tp_getattro*/
4230 0, /*tp_setattro*/
4231 0, /*tp_as_buffer*/
4232 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4233 0, /*tp_doc*/
4234 0, /*tp_traverse*/
4235 0, /*tp_clear*/
4236 0, /*tp_richcompare*/
4237 0, /*tp_weaklistoffset*/
4238 0, /*tp_iter*/
4239 0, /*tp_iternext*/
4240 encoding_map_methods, /*tp_methods*/
4241 0, /*tp_members*/
4242 0, /*tp_getset*/
4243 0, /*tp_base*/
4244 0, /*tp_dict*/
4245 0, /*tp_descr_get*/
4246 0, /*tp_descr_set*/
4247 0, /*tp_dictoffset*/
4248 0, /*tp_init*/
4249 0, /*tp_alloc*/
4250 0, /*tp_new*/
4251 0, /*tp_free*/
4252 0, /*tp_is_gc*/
4253};
4254
4255PyObject*
4256PyUnicode_BuildEncodingMap(PyObject* string)
4257{
4258 Py_UNICODE *decode;
4259 PyObject *result;
4260 struct encoding_map *mresult;
4261 int i;
4262 int need_dict = 0;
4263 unsigned char level1[32];
4264 unsigned char level2[512];
4265 unsigned char *mlevel1, *mlevel2, *mlevel3;
4266 int count2 = 0, count3 = 0;
4267
4268 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4269 PyErr_BadArgument();
4270 return NULL;
4271 }
4272 decode = PyUnicode_AS_UNICODE(string);
4273 memset(level1, 0xFF, sizeof level1);
4274 memset(level2, 0xFF, sizeof level2);
4275
4276 /* If there isn't a one-to-one mapping of NULL to \0,
4277 or if there are non-BMP characters, we need to use
4278 a mapping dictionary. */
4279 if (decode[0] != 0)
4280 need_dict = 1;
4281 for (i = 1; i < 256; i++) {
4282 int l1, l2;
4283 if (decode[i] == 0
4284 #ifdef Py_UNICODE_WIDE
4285 || decode[i] > 0xFFFF
4286 #endif
4287 ) {
4288 need_dict = 1;
4289 break;
4290 }
4291 if (decode[i] == 0xFFFE)
4292 /* unmapped character */
4293 continue;
4294 l1 = decode[i] >> 11;
4295 l2 = decode[i] >> 7;
4296 if (level1[l1] == 0xFF)
4297 level1[l1] = count2++;
4298 if (level2[l2] == 0xFF)
4299 level2[l2] = count3++;
4300 }
4301
4302 if (count2 >= 0xFF || count3 >= 0xFF)
4303 need_dict = 1;
4304
4305 if (need_dict) {
4306 PyObject *result = PyDict_New();
4307 PyObject *key, *value;
4308 if (!result)
4309 return NULL;
4310 for (i = 0; i < 256; i++) {
4311 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00004312 key = PyLong_FromLong(decode[i]);
4313 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004314 if (!key || !value)
4315 goto failed1;
4316 if (PyDict_SetItem(result, key, value) == -1)
4317 goto failed1;
4318 Py_DECREF(key);
4319 Py_DECREF(value);
4320 }
4321 return result;
4322 failed1:
4323 Py_XDECREF(key);
4324 Py_XDECREF(value);
4325 Py_DECREF(result);
4326 return NULL;
4327 }
4328
4329 /* Create a three-level trie */
4330 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4331 16*count2 + 128*count3 - 1);
4332 if (!result)
4333 return PyErr_NoMemory();
4334 PyObject_Init(result, &EncodingMapType);
4335 mresult = (struct encoding_map*)result;
4336 mresult->count2 = count2;
4337 mresult->count3 = count3;
4338 mlevel1 = mresult->level1;
4339 mlevel2 = mresult->level23;
4340 mlevel3 = mresult->level23 + 16*count2;
4341 memcpy(mlevel1, level1, 32);
4342 memset(mlevel2, 0xFF, 16*count2);
4343 memset(mlevel3, 0, 128*count3);
4344 count3 = 0;
4345 for (i = 1; i < 256; i++) {
4346 int o1, o2, o3, i2, i3;
4347 if (decode[i] == 0xFFFE)
4348 /* unmapped character */
4349 continue;
4350 o1 = decode[i]>>11;
4351 o2 = (decode[i]>>7) & 0xF;
4352 i2 = 16*mlevel1[o1] + o2;
4353 if (mlevel2[i2] == 0xFF)
4354 mlevel2[i2] = count3++;
4355 o3 = decode[i] & 0x7F;
4356 i3 = 128*mlevel2[i2] + o3;
4357 mlevel3[i3] = i;
4358 }
4359 return result;
4360}
4361
4362static int
4363encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4364{
4365 struct encoding_map *map = (struct encoding_map*)mapping;
4366 int l1 = c>>11;
4367 int l2 = (c>>7) & 0xF;
4368 int l3 = c & 0x7F;
4369 int i;
4370
4371#ifdef Py_UNICODE_WIDE
4372 if (c > 0xFFFF) {
4373 return -1;
4374 }
4375#endif
4376 if (c == 0)
4377 return 0;
4378 /* level 1*/
4379 i = map->level1[l1];
4380 if (i == 0xFF) {
4381 return -1;
4382 }
4383 /* level 2*/
4384 i = map->level23[16*i+l2];
4385 if (i == 0xFF) {
4386 return -1;
4387 }
4388 /* level 3 */
4389 i = map->level23[16*map->count2 + 128*i + l3];
4390 if (i == 0) {
4391 return -1;
4392 }
4393 return i;
4394}
4395
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004396/* Lookup the character ch in the mapping. If the character
4397 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004398 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004399static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004400{
Christian Heimes217cfd12007-12-02 14:31:20 +00004401 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004402 PyObject *x;
4403
4404 if (w == NULL)
4405 return NULL;
4406 x = PyObject_GetItem(mapping, w);
4407 Py_DECREF(w);
4408 if (x == NULL) {
4409 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4410 /* No mapping found means: mapping is undefined. */
4411 PyErr_Clear();
4412 x = Py_None;
4413 Py_INCREF(x);
4414 return x;
4415 } else
4416 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004417 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004418 else if (x == Py_None)
4419 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00004420 else if (PyLong_Check(x)) {
4421 long value = PyLong_AS_LONG(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004422 if (value < 0 || value > 255) {
4423 PyErr_SetString(PyExc_TypeError,
4424 "character mapping must be in range(256)");
4425 Py_DECREF(x);
4426 return NULL;
4427 }
4428 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004429 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004430 else if (PyString_Check(x))
4431 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004432 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004433 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00004434 PyErr_Format(PyExc_TypeError,
Christian Heimesf3863112007-11-22 07:46:41 +00004435 "character mapping must return integer, bytes or None, not %.400s",
Walter Dörwald580ceed2007-05-09 10:39:19 +00004436 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004437 Py_DECREF(x);
4438 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004439 }
4440}
4441
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004442static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00004443charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004444{
Guido van Rossum98297ee2007-11-06 21:34:58 +00004445 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004446 /* exponentially overallocate to minimize reallocations */
4447 if (requiredsize < 2*outsize)
4448 requiredsize = 2*outsize;
Guido van Rossum98297ee2007-11-06 21:34:58 +00004449 if (_PyString_Resize(outobj, requiredsize))
Walter Dörwald827b0552007-05-12 13:23:53 +00004450 return -1;
Walter Dörwald827b0552007-05-12 13:23:53 +00004451 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004452}
4453
/* Outcome of encoding a single character through a character map. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004457/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00004458 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004459 space is available. Return a new reference to the object that
4460 was put in the output buffer, or Py_None, if the mapping was undefined
4461 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004462 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004463static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004464charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004465 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004466{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004467 PyObject *rep;
4468 char *outstart;
Guido van Rossum98297ee2007-11-06 21:34:58 +00004469 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004470
Christian Heimes90aa7642007-12-19 02:45:37 +00004471 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004472 int res = encoding_map_lookup(c, mapping);
4473 Py_ssize_t requiredsize = *outpos+1;
4474 if (res == -1)
4475 return enc_FAILED;
Guido van Rossum98297ee2007-11-06 21:34:58 +00004476 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004477 if (charmapencode_resize(outobj, outpos, requiredsize))
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004478 return enc_EXCEPTION;
Guido van Rossum98297ee2007-11-06 21:34:58 +00004479 outstart = PyString_AS_STRING(*outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004480 outstart[(*outpos)++] = (char)res;
4481 return enc_SUCCESS;
4482 }
4483
4484 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004485 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004486 return enc_EXCEPTION;
4487 else if (rep==Py_None) {
4488 Py_DECREF(rep);
4489 return enc_FAILED;
4490 } else {
Christian Heimes217cfd12007-12-02 14:31:20 +00004491 if (PyLong_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004492 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004493 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004494 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004495 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004496 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004497 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004498 outstart = PyString_AS_STRING(*outobj);
Christian Heimes217cfd12007-12-02 14:31:20 +00004499 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004500 }
4501 else {
4502 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004503 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4504 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004505 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004506 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004507 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004508 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004509 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004510 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004511 memcpy(outstart + *outpos, repchars, repsize);
4512 *outpos += repsize;
4513 }
4514 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004515 Py_DECREF(rep);
4516 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004517}
4518
4519/* handle an error in PyUnicode_EncodeCharmap
4520 Return 0 on success, -1 on error */
4521static
4522int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004523 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004524 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004525 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004526 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004527{
4528 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004529 Py_ssize_t repsize;
4530 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004531 Py_UNICODE *uni2;
4532 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004533 Py_ssize_t collstartpos = *inpos;
4534 Py_ssize_t collendpos = *inpos+1;
4535 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004536 char *encoding = "charmap";
4537 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004538 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004539
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004540 /* find all unencodable characters */
4541 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004542 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00004543 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004544 int res = encoding_map_lookup(p[collendpos], mapping);
4545 if (res != -1)
4546 break;
4547 ++collendpos;
4548 continue;
4549 }
4550
4551 rep = charmapencode_lookup(p[collendpos], mapping);
4552 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004553 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004554 else if (rep!=Py_None) {
4555 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004556 break;
4557 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004558 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004559 ++collendpos;
4560 }
4561 /* cache callback name lookup
4562 * (if not done yet, i.e. it's the first error) */
4563 if (*known_errorHandler==-1) {
4564 if ((errors==NULL) || (!strcmp(errors, "strict")))
4565 *known_errorHandler = 1;
4566 else if (!strcmp(errors, "replace"))
4567 *known_errorHandler = 2;
4568 else if (!strcmp(errors, "ignore"))
4569 *known_errorHandler = 3;
4570 else if (!strcmp(errors, "xmlcharrefreplace"))
4571 *known_errorHandler = 4;
4572 else
4573 *known_errorHandler = 0;
4574 }
4575 switch (*known_errorHandler) {
4576 case 1: /* strict */
4577 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4578 return -1;
4579 case 2: /* replace */
4580 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4581 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004582 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004583 return -1;
4584 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004585 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004586 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4587 return -1;
4588 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004589 }
4590 /* fall through */
4591 case 3: /* ignore */
4592 *inpos = collendpos;
4593 break;
4594 case 4: /* xmlcharrefreplace */
4595 /* generate replacement (temporarily (mis)uses p) */
4596 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4597 char buffer[2+29+1+1];
4598 char *cp;
4599 sprintf(buffer, "&#%d;", (int)p[collpos]);
4600 for (cp = buffer; *cp; ++cp) {
4601 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004602 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004603 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004604 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004605 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4606 return -1;
4607 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004608 }
4609 }
4610 *inpos = collendpos;
4611 break;
4612 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004613 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004614 encoding, reason, p, size, exceptionObject,
4615 collstartpos, collendpos, &newpos);
4616 if (repunicode == NULL)
4617 return -1;
4618 /* generate replacement */
4619 repsize = PyUnicode_GET_SIZE(repunicode);
4620 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4621 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004622 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004623 return -1;
4624 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004625 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004626 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004627 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4628 return -1;
4629 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004630 }
4631 *inpos = newpos;
4632 Py_DECREF(repunicode);
4633 }
4634 return 0;
4635}
4636
Guido van Rossumd57fd912000-03-10 22:53:23 +00004637PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004638 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004639 PyObject *mapping,
4640 const char *errors)
4641{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004642 /* output object */
4643 PyObject *res = NULL;
4644 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004645 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004646 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004647 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004648 PyObject *errorHandler = NULL;
4649 PyObject *exc = NULL;
4650 /* the following variable is used for caching string comparisons
4651 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4652 * 3=ignore, 4=xmlcharrefreplace */
4653 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004654
4655 /* Default to Latin-1 */
4656 if (mapping == NULL)
4657 return PyUnicode_EncodeLatin1(p, size, errors);
4658
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004659 /* allocate enough for a simple encoding without
4660 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004661 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004662 if (res == NULL)
4663 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004664 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004665 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004666
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004667 while (inpos<size) {
4668 /* try to encode it */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004669 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004670 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004671 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004672 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004673 if (charmap_encoding_error(p, size, &inpos, mapping,
4674 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004675 &known_errorHandler, &errorHandler, errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004676 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004677 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004678 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004679 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004680 else
4681 /* done with this character => adjust input position */
4682 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004683 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004684
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004685 /* Resize if we allocated to much */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004686 if (respos<PyString_GET_SIZE(res))
4687 _PyString_Resize(&res, respos);
4688
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004689 Py_XDECREF(exc);
4690 Py_XDECREF(errorHandler);
4691 return res;
4692
4693 onError:
4694 Py_XDECREF(res);
4695 Py_XDECREF(exc);
4696 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004697 return NULL;
4698}
4699
4700PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4701 PyObject *mapping)
4702{
4703 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4704 PyErr_BadArgument();
4705 return NULL;
4706 }
4707 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4708 PyUnicode_GET_SIZE(unicode),
4709 mapping,
4710 NULL);
4711}
4712
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004713/* create or adjust a UnicodeTranslateError */
4714static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004715 const Py_UNICODE *unicode, Py_ssize_t size,
4716 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004717 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004718{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004719 if (*exceptionObject == NULL) {
4720 *exceptionObject = PyUnicodeTranslateError_Create(
4721 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004722 }
4723 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004724 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4725 goto onError;
4726 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4727 goto onError;
4728 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4729 goto onError;
4730 return;
4731 onError:
4732 Py_DECREF(*exceptionObject);
4733 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004734 }
4735}
4736
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004737/* raises a UnicodeTranslateError */
4738static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004739 const Py_UNICODE *unicode, Py_ssize_t size,
4740 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004741 const char *reason)
4742{
4743 make_translate_exception(exceptionObject,
4744 unicode, size, startpos, endpos, reason);
4745 if (*exceptionObject != NULL)
4746 PyCodec_StrictErrors(*exceptionObject);
4747}
4748
4749/* error handling callback helper:
4750 build arguments, call the callback and check the arguments,
4751 put the result into newpos and return the replacement string, which
4752 has to be freed by the caller */
4753static PyObject *unicode_translate_call_errorhandler(const char *errors,
4754 PyObject **errorHandler,
4755 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004756 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4757 Py_ssize_t startpos, Py_ssize_t endpos,
4758 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004759{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004760 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004761
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004762 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004763 PyObject *restuple;
4764 PyObject *resunicode;
4765
4766 if (*errorHandler == NULL) {
4767 *errorHandler = PyCodec_LookupError(errors);
4768 if (*errorHandler == NULL)
4769 return NULL;
4770 }
4771
4772 make_translate_exception(exceptionObject,
4773 unicode, size, startpos, endpos, reason);
4774 if (*exceptionObject == NULL)
4775 return NULL;
4776
4777 restuple = PyObject_CallFunctionObjArgs(
4778 *errorHandler, *exceptionObject, NULL);
4779 if (restuple == NULL)
4780 return NULL;
4781 if (!PyTuple_Check(restuple)) {
4782 PyErr_Format(PyExc_TypeError, &argparse[4]);
4783 Py_DECREF(restuple);
4784 return NULL;
4785 }
4786 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004787 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004788 Py_DECREF(restuple);
4789 return NULL;
4790 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004791 if (i_newpos<0)
4792 *newpos = size+i_newpos;
4793 else
4794 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004795 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004796 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004797 Py_DECREF(restuple);
4798 return NULL;
4799 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004800 Py_INCREF(resunicode);
4801 Py_DECREF(restuple);
4802 return resunicode;
4803}
4804
4805/* Lookup the character ch in the mapping and put the result in result,
4806 which must be decrefed by the caller.
4807 Return 0 on success, -1 on error */
4808static
4809int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4810{
Christian Heimes217cfd12007-12-02 14:31:20 +00004811 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004812 PyObject *x;
4813
4814 if (w == NULL)
4815 return -1;
4816 x = PyObject_GetItem(mapping, w);
4817 Py_DECREF(w);
4818 if (x == NULL) {
4819 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4820 /* No mapping found means: use 1:1 mapping. */
4821 PyErr_Clear();
4822 *result = NULL;
4823 return 0;
4824 } else
4825 return -1;
4826 }
4827 else if (x == Py_None) {
4828 *result = x;
4829 return 0;
4830 }
Christian Heimes217cfd12007-12-02 14:31:20 +00004831 else if (PyLong_Check(x)) {
4832 long value = PyLong_AS_LONG(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004833 long max = PyUnicode_GetMax();
4834 if (value < 0 || value > max) {
4835 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00004836 "character mapping must be in range(0x%x)", max+1);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004837 Py_DECREF(x);
4838 return -1;
4839 }
4840 *result = x;
4841 return 0;
4842 }
4843 else if (PyUnicode_Check(x)) {
4844 *result = x;
4845 return 0;
4846 }
4847 else {
4848 /* wrong return value */
4849 PyErr_SetString(PyExc_TypeError,
4850 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004851 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004852 return -1;
4853 }
4854}
4855/* ensure that *outobj is at least requiredsize characters long,
4856if not reallocate and adjust various state variables.
4857Return 0 on success, -1 on error */
4858static
Walter Dörwald4894c302003-10-24 14:25:28 +00004859int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004860 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004861{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004862 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004863 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004864 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004865 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004866 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004867 if (requiredsize < 2 * oldsize)
4868 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004869 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004870 return -1;
4871 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004872 }
4873 return 0;
4874}
4875/* lookup the character, put the result in the output string and adjust
4876 various state variables. Return a new reference to the object that
4877 was put in the output buffer in *result, or Py_None, if the mapping was
4878 undefined (in which case no character was written).
4879 The called must decref result.
4880 Return 0 on success, -1 on error. */
4881static
Walter Dörwald4894c302003-10-24 14:25:28 +00004882int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004883 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004884 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004885{
Walter Dörwald4894c302003-10-24 14:25:28 +00004886 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004887 return -1;
4888 if (*res==NULL) {
4889 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004890 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004891 }
4892 else if (*res==Py_None)
4893 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00004894 else if (PyLong_Check(*res)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004895 /* no overflow check, because we know that the space is enough */
Christian Heimes217cfd12007-12-02 14:31:20 +00004896 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004897 }
4898 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004899 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004900 if (repsize==1) {
4901 /* no overflow check, because we know that the space is enough */
4902 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4903 }
4904 else if (repsize!=0) {
4905 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004906 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004907 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004908 repsize - 1;
4909 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004910 return -1;
4911 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4912 *outp += repsize;
4913 }
4914 }
4915 else
4916 return -1;
4917 return 0;
4918}
4919
4920PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004921 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004922 PyObject *mapping,
4923 const char *errors)
4924{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004925 /* output object */
4926 PyObject *res = NULL;
4927 /* pointers to the beginning and end+1 of input */
4928 const Py_UNICODE *startp = p;
4929 const Py_UNICODE *endp = p + size;
4930 /* pointer into the output */
4931 Py_UNICODE *str;
4932 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004933 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004934 char *reason = "character maps to <undefined>";
4935 PyObject *errorHandler = NULL;
4936 PyObject *exc = NULL;
4937 /* the following variable is used for caching string comparisons
4938 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4939 * 3=ignore, 4=xmlcharrefreplace */
4940 int known_errorHandler = -1;
4941
Guido van Rossumd57fd912000-03-10 22:53:23 +00004942 if (mapping == NULL) {
4943 PyErr_BadArgument();
4944 return NULL;
4945 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004946
4947 /* allocate enough for a simple 1:1 translation without
4948 replacements, if we need more, we'll resize */
4949 res = PyUnicode_FromUnicode(NULL, size);
4950 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004951 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004952 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004953 return res;
4954 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004955
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004956 while (p<endp) {
4957 /* try to encode it */
4958 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004959 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004960 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004961 goto onError;
4962 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004963 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004964 if (x!=Py_None) /* it worked => adjust input pointer */
4965 ++p;
4966 else { /* untranslatable character */
4967 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004968 Py_ssize_t repsize;
4969 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004970 Py_UNICODE *uni2;
4971 /* startpos for collecting untranslatable chars */
4972 const Py_UNICODE *collstart = p;
4973 const Py_UNICODE *collend = p+1;
4974 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004975
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004976 /* find all untranslatable characters */
4977 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004978 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004979 goto onError;
4980 Py_XDECREF(x);
4981 if (x!=Py_None)
4982 break;
4983 ++collend;
4984 }
4985 /* cache callback name lookup
4986 * (if not done yet, i.e. it's the first error) */
4987 if (known_errorHandler==-1) {
4988 if ((errors==NULL) || (!strcmp(errors, "strict")))
4989 known_errorHandler = 1;
4990 else if (!strcmp(errors, "replace"))
4991 known_errorHandler = 2;
4992 else if (!strcmp(errors, "ignore"))
4993 known_errorHandler = 3;
4994 else if (!strcmp(errors, "xmlcharrefreplace"))
4995 known_errorHandler = 4;
4996 else
4997 known_errorHandler = 0;
4998 }
4999 switch (known_errorHandler) {
5000 case 1: /* strict */
5001 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
5002 goto onError;
5003 case 2: /* replace */
5004 /* No need to check for space, this is a 1:1 replacement */
5005 for (coll = collstart; coll<collend; ++coll)
5006 *str++ = '?';
5007 /* fall through */
5008 case 3: /* ignore */
5009 p = collend;
5010 break;
5011 case 4: /* xmlcharrefreplace */
5012 /* generate replacement (temporarily (mis)uses p) */
5013 for (p = collstart; p < collend; ++p) {
5014 char buffer[2+29+1+1];
5015 char *cp;
5016 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00005017 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005018 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5019 goto onError;
5020 for (cp = buffer; *cp; ++cp)
5021 *str++ = *cp;
5022 }
5023 p = collend;
5024 break;
5025 default:
5026 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5027 reason, startp, size, &exc,
5028 collstart-startp, collend-startp, &newpos);
5029 if (repunicode == NULL)
5030 goto onError;
5031 /* generate replacement */
5032 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00005033 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005034 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5035 Py_DECREF(repunicode);
5036 goto onError;
5037 }
5038 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5039 *str++ = *uni2;
5040 p = startp + newpos;
5041 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005042 }
5043 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005044 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005045 /* Resize if we allocated to much */
5046 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005047 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00005048 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005049 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005050 }
5051 Py_XDECREF(exc);
5052 Py_XDECREF(errorHandler);
5053 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005054
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005055 onError:
5056 Py_XDECREF(res);
5057 Py_XDECREF(exc);
5058 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005059 return NULL;
5060}
5061
5062PyObject *PyUnicode_Translate(PyObject *str,
5063 PyObject *mapping,
5064 const char *errors)
5065{
5066 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005067
Guido van Rossumd57fd912000-03-10 22:53:23 +00005068 str = PyUnicode_FromObject(str);
5069 if (str == NULL)
5070 goto onError;
5071 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
5072 PyUnicode_GET_SIZE(str),
5073 mapping,
5074 errors);
5075 Py_DECREF(str);
5076 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005077
Guido van Rossumd57fd912000-03-10 22:53:23 +00005078 onError:
5079 Py_XDECREF(str);
5080 return NULL;
5081}
Tim Petersced69f82003-09-16 20:30:58 +00005082
Guido van Rossum9e896b32000-04-05 20:11:21 +00005083/* --- Decimal Encoder ---------------------------------------------------- */
5084
5085int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005086 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00005087 char *output,
5088 const char *errors)
5089{
5090 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005091 PyObject *errorHandler = NULL;
5092 PyObject *exc = NULL;
5093 const char *encoding = "decimal";
5094 const char *reason = "invalid decimal Unicode string";
5095 /* the following variable is used for caching string comparisons
5096 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5097 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005098
5099 if (output == NULL) {
5100 PyErr_BadArgument();
5101 return -1;
5102 }
5103
5104 p = s;
5105 end = s + length;
5106 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005107 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005108 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005109 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005110 Py_ssize_t repsize;
5111 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005112 Py_UNICODE *uni2;
5113 Py_UNICODE *collstart;
5114 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005115
Guido van Rossum9e896b32000-04-05 20:11:21 +00005116 if (Py_UNICODE_ISSPACE(ch)) {
5117 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005118 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005119 continue;
5120 }
5121 decimal = Py_UNICODE_TODECIMAL(ch);
5122 if (decimal >= 0) {
5123 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005124 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005125 continue;
5126 }
Guido van Rossumba477042000-04-06 18:18:10 +00005127 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00005128 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005129 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005130 continue;
5131 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005132 /* All other characters are considered unencodable */
5133 collstart = p;
5134 collend = p+1;
5135 while (collend < end) {
5136 if ((0 < *collend && *collend < 256) ||
5137 !Py_UNICODE_ISSPACE(*collend) ||
5138 Py_UNICODE_TODECIMAL(*collend))
5139 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005140 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005141 /* cache callback name lookup
5142 * (if not done yet, i.e. it's the first error) */
5143 if (known_errorHandler==-1) {
5144 if ((errors==NULL) || (!strcmp(errors, "strict")))
5145 known_errorHandler = 1;
5146 else if (!strcmp(errors, "replace"))
5147 known_errorHandler = 2;
5148 else if (!strcmp(errors, "ignore"))
5149 known_errorHandler = 3;
5150 else if (!strcmp(errors, "xmlcharrefreplace"))
5151 known_errorHandler = 4;
5152 else
5153 known_errorHandler = 0;
5154 }
5155 switch (known_errorHandler) {
5156 case 1: /* strict */
5157 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5158 goto onError;
5159 case 2: /* replace */
5160 for (p = collstart; p < collend; ++p)
5161 *output++ = '?';
5162 /* fall through */
5163 case 3: /* ignore */
5164 p = collend;
5165 break;
5166 case 4: /* xmlcharrefreplace */
5167 /* generate replacement (temporarily (mis)uses p) */
5168 for (p = collstart; p < collend; ++p)
5169 output += sprintf(output, "&#%d;", (int)*p);
5170 p = collend;
5171 break;
5172 default:
5173 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5174 encoding, reason, s, length, &exc,
5175 collstart-s, collend-s, &newpos);
5176 if (repunicode == NULL)
5177 goto onError;
5178 /* generate replacement */
5179 repsize = PyUnicode_GET_SIZE(repunicode);
5180 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5181 Py_UNICODE ch = *uni2;
5182 if (Py_UNICODE_ISSPACE(ch))
5183 *output++ = ' ';
5184 else {
5185 decimal = Py_UNICODE_TODECIMAL(ch);
5186 if (decimal >= 0)
5187 *output++ = '0' + decimal;
5188 else if (0 < ch && ch < 256)
5189 *output++ = (char)ch;
5190 else {
5191 Py_DECREF(repunicode);
5192 raise_encode_exception(&exc, encoding,
5193 s, length, collstart-s, collend-s, reason);
5194 goto onError;
5195 }
5196 }
5197 }
5198 p = s + newpos;
5199 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005200 }
5201 }
5202 /* 0-terminate the output string */
5203 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005204 Py_XDECREF(exc);
5205 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005206 return 0;
5207
5208 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005209 Py_XDECREF(exc);
5210 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005211 return -1;
5212}
5213
Guido van Rossumd57fd912000-03-10 22:53:23 +00005214/* --- Helpers ------------------------------------------------------------ */
5215
Eric Smith8c663262007-08-25 02:26:07 +00005216#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005217#include "stringlib/fastsearch.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005218#include "stringlib/count.h"
Christian Heimes9cd17752007-11-18 19:35:23 +00005219/* Include _ParseTupleFinds from find.h */
5220#define FROM_UNICODE
Thomas Wouters477c8d52006-05-27 19:21:47 +00005221#include "stringlib/find.h"
5222#include "stringlib/partition.h"
5223
/* Helper macro to fix up start/end slice values.

   Operates on the variables `start` and `end` of the enclosing scope:
   a negative start is taken relative to (obj)->length and then clamped
   to 0; an end beyond (obj)->length is clamped to it, and a negative
   end is taken relative to (obj)->length and then clamped to 0.

   Wrapped in do { } while (0) so that it expands to exactly one
   statement and can be used safely in an unbraced if/else.  Invoke it
   with a trailing semicolon. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
5236
Martin v. Löwis18e16552006-02-15 17:27:45 +00005237Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005238 PyObject *substr,
5239 Py_ssize_t start,
5240 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005241{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005242 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005243 PyUnicodeObject* str_obj;
5244 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005245
Thomas Wouters477c8d52006-05-27 19:21:47 +00005246 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5247 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005248 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005249 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5250 if (!sub_obj) {
5251 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005252 return -1;
5253 }
Tim Petersced69f82003-09-16 20:30:58 +00005254
Thomas Wouters477c8d52006-05-27 19:21:47 +00005255 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005256
Thomas Wouters477c8d52006-05-27 19:21:47 +00005257 result = stringlib_count(
5258 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5259 );
5260
5261 Py_DECREF(sub_obj);
5262 Py_DECREF(str_obj);
5263
Guido van Rossumd57fd912000-03-10 22:53:23 +00005264 return result;
5265}
5266
Martin v. Löwis18e16552006-02-15 17:27:45 +00005267Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005268 PyObject *sub,
5269 Py_ssize_t start,
5270 Py_ssize_t end,
5271 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005272{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005273 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005274
Guido van Rossumd57fd912000-03-10 22:53:23 +00005275 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005276 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005277 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005278 sub = PyUnicode_FromObject(sub);
5279 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00005280 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005281 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005282 }
Tim Petersced69f82003-09-16 20:30:58 +00005283
Thomas Wouters477c8d52006-05-27 19:21:47 +00005284 if (direction > 0)
5285 result = stringlib_find_slice(
5286 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5287 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5288 start, end
5289 );
5290 else
5291 result = stringlib_rfind_slice(
5292 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5293 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5294 start, end
5295 );
5296
Guido van Rossumd57fd912000-03-10 22:53:23 +00005297 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005298 Py_DECREF(sub);
5299
Guido van Rossumd57fd912000-03-10 22:53:23 +00005300 return result;
5301}
5302
Tim Petersced69f82003-09-16 20:30:58 +00005303static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005304int tailmatch(PyUnicodeObject *self,
5305 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005306 Py_ssize_t start,
5307 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005308 int direction)
5309{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005310 if (substring->length == 0)
5311 return 1;
5312
Thomas Wouters477c8d52006-05-27 19:21:47 +00005313 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005314
5315 end -= substring->length;
5316 if (end < start)
5317 return 0;
5318
5319 if (direction > 0) {
5320 if (Py_UNICODE_MATCH(self, end, substring))
5321 return 1;
5322 } else {
5323 if (Py_UNICODE_MATCH(self, start, substring))
5324 return 1;
5325 }
5326
5327 return 0;
5328}
5329
Martin v. Löwis18e16552006-02-15 17:27:45 +00005330Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005331 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005332 Py_ssize_t start,
5333 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005334 int direction)
5335{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005336 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005337
Guido van Rossumd57fd912000-03-10 22:53:23 +00005338 str = PyUnicode_FromObject(str);
5339 if (str == NULL)
5340 return -1;
5341 substr = PyUnicode_FromObject(substr);
5342 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005343 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005344 return -1;
5345 }
Tim Petersced69f82003-09-16 20:30:58 +00005346
Guido van Rossumd57fd912000-03-10 22:53:23 +00005347 result = tailmatch((PyUnicodeObject *)str,
5348 (PyUnicodeObject *)substr,
5349 start, end, direction);
5350 Py_DECREF(str);
5351 Py_DECREF(substr);
5352 return result;
5353}
5354
Guido van Rossumd57fd912000-03-10 22:53:23 +00005355/* Apply fixfct filter to the Unicode object self and return a
5356 reference to the modified object */
5357
Tim Petersced69f82003-09-16 20:30:58 +00005358static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005359PyObject *fixup(PyUnicodeObject *self,
5360 int (*fixfct)(PyUnicodeObject *s))
5361{
5362
5363 PyUnicodeObject *u;
5364
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005365 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005366 if (u == NULL)
5367 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005368
5369 Py_UNICODE_COPY(u->str, self->str, self->length);
5370
Tim Peters7a29bd52001-09-12 03:03:31 +00005371 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005372 /* fixfct should return TRUE if it modified the buffer. If
5373 FALSE, return a reference to the original buffer instead
5374 (to save space, not time) */
5375 Py_INCREF(self);
5376 Py_DECREF(u);
5377 return (PyObject*) self;
5378 }
5379 return (PyObject*) u;
5380}
5381
Tim Petersced69f82003-09-16 20:30:58 +00005382static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005383int fixupper(PyUnicodeObject *self)
5384{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005385 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005386 Py_UNICODE *s = self->str;
5387 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005388
Guido van Rossumd57fd912000-03-10 22:53:23 +00005389 while (len-- > 0) {
5390 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005391
Guido van Rossumd57fd912000-03-10 22:53:23 +00005392 ch = Py_UNICODE_TOUPPER(*s);
5393 if (ch != *s) {
5394 status = 1;
5395 *s = ch;
5396 }
5397 s++;
5398 }
5399
5400 return status;
5401}
5402
Tim Petersced69f82003-09-16 20:30:58 +00005403static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005404int fixlower(PyUnicodeObject *self)
5405{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005406 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005407 Py_UNICODE *s = self->str;
5408 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005409
Guido van Rossumd57fd912000-03-10 22:53:23 +00005410 while (len-- > 0) {
5411 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005412
Guido van Rossumd57fd912000-03-10 22:53:23 +00005413 ch = Py_UNICODE_TOLOWER(*s);
5414 if (ch != *s) {
5415 status = 1;
5416 *s = ch;
5417 }
5418 s++;
5419 }
5420
5421 return status;
5422}
5423
Tim Petersced69f82003-09-16 20:30:58 +00005424static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005425int fixswapcase(PyUnicodeObject *self)
5426{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005427 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005428 Py_UNICODE *s = self->str;
5429 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005430
Guido van Rossumd57fd912000-03-10 22:53:23 +00005431 while (len-- > 0) {
5432 if (Py_UNICODE_ISUPPER(*s)) {
5433 *s = Py_UNICODE_TOLOWER(*s);
5434 status = 1;
5435 } else if (Py_UNICODE_ISLOWER(*s)) {
5436 *s = Py_UNICODE_TOUPPER(*s);
5437 status = 1;
5438 }
5439 s++;
5440 }
5441
5442 return status;
5443}
5444
Tim Petersced69f82003-09-16 20:30:58 +00005445static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005446int fixcapitalize(PyUnicodeObject *self)
5447{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005448 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005449 Py_UNICODE *s = self->str;
5450 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005451
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005452 if (len == 0)
5453 return 0;
5454 if (Py_UNICODE_ISLOWER(*s)) {
5455 *s = Py_UNICODE_TOUPPER(*s);
5456 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005457 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005458 s++;
5459 while (--len > 0) {
5460 if (Py_UNICODE_ISUPPER(*s)) {
5461 *s = Py_UNICODE_TOLOWER(*s);
5462 status = 1;
5463 }
5464 s++;
5465 }
5466 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005467}
5468
5469static
5470int fixtitle(PyUnicodeObject *self)
5471{
5472 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5473 register Py_UNICODE *e;
5474 int previous_is_cased;
5475
5476 /* Shortcut for single character strings */
5477 if (PyUnicode_GET_SIZE(self) == 1) {
5478 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5479 if (*p != ch) {
5480 *p = ch;
5481 return 1;
5482 }
5483 else
5484 return 0;
5485 }
Tim Petersced69f82003-09-16 20:30:58 +00005486
Guido van Rossumd57fd912000-03-10 22:53:23 +00005487 e = p + PyUnicode_GET_SIZE(self);
5488 previous_is_cased = 0;
5489 for (; p < e; p++) {
5490 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005491
Guido van Rossumd57fd912000-03-10 22:53:23 +00005492 if (previous_is_cased)
5493 *p = Py_UNICODE_TOLOWER(ch);
5494 else
5495 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005496
5497 if (Py_UNICODE_ISLOWER(ch) ||
5498 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005499 Py_UNICODE_ISTITLE(ch))
5500 previous_is_cased = 1;
5501 else
5502 previous_is_cased = 0;
5503 }
5504 return 1;
5505}
5506
Tim Peters8ce9f162004-08-27 01:49:32 +00005507PyObject *
5508PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005509{
Tim Peters8ce9f162004-08-27 01:49:32 +00005510 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005511 const Py_UNICODE blank = ' ';
5512 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005513 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005514 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005515 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5516 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005517 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5518 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005519 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005520 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005521 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005522
Tim Peters05eba1f2004-08-27 21:32:02 +00005523 fseq = PySequence_Fast(seq, "");
5524 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005525 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005526 }
5527
Tim Peters91879ab2004-08-27 22:35:44 +00005528 /* Grrrr. A codec may be invoked to convert str objects to
5529 * Unicode, and so it's possible to call back into Python code
5530 * during PyUnicode_FromObject(), and so it's possible for a sick
5531 * codec to change the size of fseq (if seq is a list). Therefore
5532 * we have to keep refetching the size -- can't assume seqlen
5533 * is invariant.
5534 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005535 seqlen = PySequence_Fast_GET_SIZE(fseq);
5536 /* If empty sequence, return u"". */
5537 if (seqlen == 0) {
5538 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5539 goto Done;
5540 }
5541 /* If singleton sequence with an exact Unicode, return that. */
5542 if (seqlen == 1) {
5543 item = PySequence_Fast_GET_ITEM(fseq, 0);
5544 if (PyUnicode_CheckExact(item)) {
5545 Py_INCREF(item);
5546 res = (PyUnicodeObject *)item;
5547 goto Done;
5548 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005549 }
5550
Tim Peters05eba1f2004-08-27 21:32:02 +00005551 /* At least two items to join, or one that isn't exact Unicode. */
5552 if (seqlen > 1) {
5553 /* Set up sep and seplen -- they're needed. */
5554 if (separator == NULL) {
5555 sep = &blank;
5556 seplen = 1;
5557 }
5558 else {
5559 internal_separator = PyUnicode_FromObject(separator);
5560 if (internal_separator == NULL)
5561 goto onError;
5562 sep = PyUnicode_AS_UNICODE(internal_separator);
5563 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00005564 /* In case PyUnicode_FromObject() mutated seq. */
5565 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005566 }
5567 }
5568
5569 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005570 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005571 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005572 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005573 res_p = PyUnicode_AS_UNICODE(res);
5574 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005575
Tim Peters05eba1f2004-08-27 21:32:02 +00005576 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005577 Py_ssize_t itemlen;
5578 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005579
5580 item = PySequence_Fast_GET_ITEM(fseq, i);
5581 /* Convert item to Unicode. */
Guido van Rossum98297ee2007-11-06 21:34:58 +00005582 if (!PyUnicode_Check(item)) {
5583 PyErr_Format(PyExc_TypeError,
5584 "sequence item %zd: expected str instance,"
5585 " %.80s found",
Christian Heimes90aa7642007-12-19 02:45:37 +00005586 i, Py_TYPE(item)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005587 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005588 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005589 item = PyUnicode_FromObject(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005590 if (item == NULL)
5591 goto onError;
5592 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005593
Tim Peters91879ab2004-08-27 22:35:44 +00005594 /* In case PyUnicode_FromObject() mutated seq. */
5595 seqlen = PySequence_Fast_GET_SIZE(fseq);
5596
Tim Peters8ce9f162004-08-27 01:49:32 +00005597 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005598 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005599 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005600 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005601 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005602 if (i < seqlen - 1) {
5603 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005604 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00005605 goto Overflow;
5606 }
5607 if (new_res_used > res_alloc) {
5608 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00005609 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00005610 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005611 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005612 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005613 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005614 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005615 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005616 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005617 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005618 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005619 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005620
5621 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005622 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005623 res_p += itemlen;
5624 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005625 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005626 res_p += seplen;
5627 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005628 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005629 res_used = new_res_used;
5630 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005631
Tim Peters05eba1f2004-08-27 21:32:02 +00005632 /* Shrink res to match the used area; this probably can't fail,
5633 * but it's cheap to check.
5634 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005635 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005636 goto onError;
5637
5638 Done:
5639 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005640 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005641 return (PyObject *)res;
5642
Tim Peters8ce9f162004-08-27 01:49:32 +00005643 Overflow:
5644 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005645 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005646 Py_DECREF(item);
5647 /* fall through */
5648
Guido van Rossumd57fd912000-03-10 22:53:23 +00005649 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005650 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005651 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005652 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005653 return NULL;
5654}
5655
Tim Petersced69f82003-09-16 20:30:58 +00005656static
5657PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005658 Py_ssize_t left,
5659 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005660 Py_UNICODE fill)
5661{
5662 PyUnicodeObject *u;
5663
5664 if (left < 0)
5665 left = 0;
5666 if (right < 0)
5667 right = 0;
5668
Tim Peters7a29bd52001-09-12 03:03:31 +00005669 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005670 Py_INCREF(self);
5671 return self;
5672 }
5673
5674 u = _PyUnicode_New(left + self->length + right);
5675 if (u) {
5676 if (left)
5677 Py_UNICODE_FILL(u->str, fill, left);
5678 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5679 if (right)
5680 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5681 }
5682
5683 return u;
5684}
5685
/* Append the slice data[left:right] to `list` as a new unicode object.

   Relies on names supplied by the expanding function: a `PyObject *str`
   scratch variable, the target `list`, and an `onError` label that is
   jumped to when creating the slice or appending it fails.  Wrapped in
   do { } while (0) so that an invocation followed by ';' is exactly one
   statement, also under an unbraced if/else. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
5696
5697static
5698PyObject *split_whitespace(PyUnicodeObject *self,
5699 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005700 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005701{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005702 register Py_ssize_t i;
5703 register Py_ssize_t j;
5704 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005705 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005706 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005707
5708 for (i = j = 0; i < len; ) {
5709 /* find a token */
Christian Heimes190d79e2008-01-30 11:58:22 +00005710 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005711 i++;
5712 j = i;
Christian Heimes190d79e2008-01-30 11:58:22 +00005713 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005714 i++;
5715 if (j < i) {
5716 if (maxcount-- <= 0)
5717 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005718 SPLIT_APPEND(buf, j, i);
5719 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005720 i++;
5721 j = i;
5722 }
5723 }
5724 if (j < len) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005725 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005726 }
5727 return list;
5728
5729 onError:
5730 Py_DECREF(list);
5731 return NULL;
5732}
5733
5734PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005735 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005736{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005737 register Py_ssize_t i;
5738 register Py_ssize_t j;
5739 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005740 PyObject *list;
5741 PyObject *str;
5742 Py_UNICODE *data;
5743
5744 string = PyUnicode_FromObject(string);
5745 if (string == NULL)
5746 return NULL;
5747 data = PyUnicode_AS_UNICODE(string);
5748 len = PyUnicode_GET_SIZE(string);
5749
Guido van Rossumd57fd912000-03-10 22:53:23 +00005750 list = PyList_New(0);
5751 if (!list)
5752 goto onError;
5753
5754 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005755 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005756
Guido van Rossumd57fd912000-03-10 22:53:23 +00005757 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005758 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005759 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005760
5761 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005762 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005763 if (i < len) {
5764 if (data[i] == '\r' && i + 1 < len &&
5765 data[i+1] == '\n')
5766 i += 2;
5767 else
5768 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005769 if (keepends)
5770 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005771 }
Guido van Rossum86662912000-04-11 15:38:46 +00005772 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005773 j = i;
5774 }
5775 if (j < len) {
5776 SPLIT_APPEND(data, j, len);
5777 }
5778
5779 Py_DECREF(string);
5780 return list;
5781
5782 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005783 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005784 Py_DECREF(string);
5785 return NULL;
5786}
5787
Tim Petersced69f82003-09-16 20:30:58 +00005788static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005789PyObject *split_char(PyUnicodeObject *self,
5790 PyObject *list,
5791 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005792 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005793{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005794 register Py_ssize_t i;
5795 register Py_ssize_t j;
5796 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005797 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005798 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005799
5800 for (i = j = 0; i < len; ) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005801 if (buf[i] == ch) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005802 if (maxcount-- <= 0)
5803 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005804 SPLIT_APPEND(buf, j, i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005805 i = j = i + 1;
5806 } else
5807 i++;
5808 }
5809 if (j <= len) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005810 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005811 }
5812 return list;
5813
5814 onError:
5815 Py_DECREF(list);
5816 return NULL;
5817}
5818
Tim Petersced69f82003-09-16 20:30:58 +00005819static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005820PyObject *split_substring(PyUnicodeObject *self,
5821 PyObject *list,
5822 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005823 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005824{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005825 register Py_ssize_t i;
5826 register Py_ssize_t j;
5827 Py_ssize_t len = self->length;
5828 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005829 PyObject *str;
5830
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005831 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005832 if (Py_UNICODE_MATCH(self, i, substring)) {
5833 if (maxcount-- <= 0)
5834 break;
5835 SPLIT_APPEND(self->str, j, i);
5836 i = j = i + sublen;
5837 } else
5838 i++;
5839 }
5840 if (j <= len) {
5841 SPLIT_APPEND(self->str, j, len);
5842 }
5843 return list;
5844
5845 onError:
5846 Py_DECREF(list);
5847 return NULL;
5848}
5849
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005850static
5851PyObject *rsplit_whitespace(PyUnicodeObject *self,
5852 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005853 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005854{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005855 register Py_ssize_t i;
5856 register Py_ssize_t j;
5857 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005858 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005859 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005860
5861 for (i = j = len - 1; i >= 0; ) {
5862 /* find a token */
Christian Heimes190d79e2008-01-30 11:58:22 +00005863 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005864 i--;
5865 j = i;
Christian Heimes190d79e2008-01-30 11:58:22 +00005866 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005867 i--;
5868 if (j > i) {
5869 if (maxcount-- <= 0)
5870 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005871 SPLIT_APPEND(buf, i + 1, j + 1);
5872 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005873 i--;
5874 j = i;
5875 }
5876 }
5877 if (j >= 0) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005878 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005879 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005880 if (PyList_Reverse(list) < 0)
5881 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005882 return list;
5883
5884 onError:
5885 Py_DECREF(list);
5886 return NULL;
5887}
5888
5889static
5890PyObject *rsplit_char(PyUnicodeObject *self,
5891 PyObject *list,
5892 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005893 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005894{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005895 register Py_ssize_t i;
5896 register Py_ssize_t j;
5897 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005898 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005899 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005900
5901 for (i = j = len - 1; i >= 0; ) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005902 if (buf[i] == ch) {
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005903 if (maxcount-- <= 0)
5904 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005905 SPLIT_APPEND(buf, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005906 j = i = i - 1;
5907 } else
5908 i--;
5909 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005910 if (j >= -1) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005911 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005912 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005913 if (PyList_Reverse(list) < 0)
5914 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005915 return list;
5916
5917 onError:
5918 Py_DECREF(list);
5919 return NULL;
5920}
5921
5922static
5923PyObject *rsplit_substring(PyUnicodeObject *self,
5924 PyObject *list,
5925 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005926 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005927{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005928 register Py_ssize_t i;
5929 register Py_ssize_t j;
5930 Py_ssize_t len = self->length;
5931 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005932 PyObject *str;
5933
5934 for (i = len - sublen, j = len; i >= 0; ) {
5935 if (Py_UNICODE_MATCH(self, i, substring)) {
5936 if (maxcount-- <= 0)
5937 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005938 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005939 j = i;
5940 i -= sublen;
5941 } else
5942 i--;
5943 }
5944 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005945 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005946 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005947 if (PyList_Reverse(list) < 0)
5948 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005949 return list;
5950
5951 onError:
5952 Py_DECREF(list);
5953 return NULL;
5954}
5955
Guido van Rossumd57fd912000-03-10 22:53:23 +00005956#undef SPLIT_APPEND
5957
5958static
5959PyObject *split(PyUnicodeObject *self,
5960 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005961 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005962{
5963 PyObject *list;
5964
5965 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005966 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005967
5968 list = PyList_New(0);
5969 if (!list)
5970 return NULL;
5971
5972 if (substring == NULL)
5973 return split_whitespace(self,list,maxcount);
5974
5975 else if (substring->length == 1)
5976 return split_char(self,list,substring->str[0],maxcount);
5977
5978 else if (substring->length == 0) {
5979 Py_DECREF(list);
5980 PyErr_SetString(PyExc_ValueError, "empty separator");
5981 return NULL;
5982 }
5983 else
5984 return split_substring(self,list,substring,maxcount);
5985}
5986
Tim Petersced69f82003-09-16 20:30:58 +00005987static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005988PyObject *rsplit(PyUnicodeObject *self,
5989 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005990 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005991{
5992 PyObject *list;
5993
5994 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005995 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005996
5997 list = PyList_New(0);
5998 if (!list)
5999 return NULL;
6000
6001 if (substring == NULL)
6002 return rsplit_whitespace(self,list,maxcount);
6003
6004 else if (substring->length == 1)
6005 return rsplit_char(self,list,substring->str[0],maxcount);
6006
6007 else if (substring->length == 0) {
6008 Py_DECREF(list);
6009 PyErr_SetString(PyExc_ValueError, "empty separator");
6010 return NULL;
6011 }
6012 else
6013 return rsplit_substring(self,list,substring,maxcount);
6014}
6015
6016static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006017PyObject *replace(PyUnicodeObject *self,
6018 PyUnicodeObject *str1,
6019 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006020 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006021{
6022 PyUnicodeObject *u;
6023
6024 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006025 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006026
Thomas Wouters477c8d52006-05-27 19:21:47 +00006027 if (str1->length == str2->length) {
6028 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006029 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006030 if (str1->length == 1) {
6031 /* replace characters */
6032 Py_UNICODE u1, u2;
6033 if (!findchar(self->str, self->length, str1->str[0]))
6034 goto nothing;
6035 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6036 if (!u)
6037 return NULL;
6038 Py_UNICODE_COPY(u->str, self->str, self->length);
6039 u1 = str1->str[0];
6040 u2 = str2->str[0];
6041 for (i = 0; i < u->length; i++)
6042 if (u->str[i] == u1) {
6043 if (--maxcount < 0)
6044 break;
6045 u->str[i] = u2;
6046 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006047 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006048 i = fastsearch(
6049 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00006050 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006051 if (i < 0)
6052 goto nothing;
6053 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6054 if (!u)
6055 return NULL;
6056 Py_UNICODE_COPY(u->str, self->str, self->length);
6057 while (i <= self->length - str1->length)
6058 if (Py_UNICODE_MATCH(self, i, str1)) {
6059 if (--maxcount < 0)
6060 break;
6061 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6062 i += str1->length;
6063 } else
6064 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006065 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006066 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006067
6068 Py_ssize_t n, i, j, e;
6069 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006070 Py_UNICODE *p;
6071
6072 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006073 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006074 if (n > maxcount)
6075 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006076 if (n == 0)
6077 goto nothing;
6078 /* new_size = self->length + n * (str2->length - str1->length)); */
6079 delta = (str2->length - str1->length);
6080 if (delta == 0) {
6081 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006082 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006083 product = n * (str2->length - str1->length);
6084 if ((product / (str2->length - str1->length)) != n) {
6085 PyErr_SetString(PyExc_OverflowError,
6086 "replace string is too long");
6087 return NULL;
6088 }
6089 new_size = self->length + product;
6090 if (new_size < 0) {
6091 PyErr_SetString(PyExc_OverflowError,
6092 "replace string is too long");
6093 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006094 }
6095 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006096 u = _PyUnicode_New(new_size);
6097 if (!u)
6098 return NULL;
6099 i = 0;
6100 p = u->str;
6101 e = self->length - str1->length;
6102 if (str1->length > 0) {
6103 while (n-- > 0) {
6104 /* look for next match */
6105 j = i;
6106 while (j <= e) {
6107 if (Py_UNICODE_MATCH(self, j, str1))
6108 break;
6109 j++;
6110 }
6111 if (j > i) {
6112 if (j > e)
6113 break;
6114 /* copy unchanged part [i:j] */
6115 Py_UNICODE_COPY(p, self->str+i, j-i);
6116 p += j - i;
6117 }
6118 /* copy substitution string */
6119 if (str2->length > 0) {
6120 Py_UNICODE_COPY(p, str2->str, str2->length);
6121 p += str2->length;
6122 }
6123 i = j + str1->length;
6124 }
6125 if (i < self->length)
6126 /* copy tail [i:] */
6127 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6128 } else {
6129 /* interleave */
6130 while (n > 0) {
6131 Py_UNICODE_COPY(p, str2->str, str2->length);
6132 p += str2->length;
6133 if (--n <= 0)
6134 break;
6135 *p++ = self->str[i++];
6136 }
6137 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6138 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006139 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006140 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006141
6142nothing:
6143 /* nothing to replace; return original string (when possible) */
6144 if (PyUnicode_CheckExact(self)) {
6145 Py_INCREF(self);
6146 return (PyObject *) self;
6147 }
6148 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149}
6150
6151/* --- Unicode Object Methods --------------------------------------------- */
6152
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006153PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006154"S.title() -> unicode\n\
6155\n\
6156Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006157characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006158
6159static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006160unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006161{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006162 return fixup(self, fixtitle);
6163}
6164
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006165PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006166"S.capitalize() -> unicode\n\
6167\n\
6168Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006169have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006170
6171static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006172unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006173{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006174 return fixup(self, fixcapitalize);
6175}
6176
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split self into words,
   replace every word in the list by its fixcapitalize'd version and
   join the list again.  Returns NULL if any step fails. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t pos = 0;

    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    while (pos < PyList_GET_SIZE(words)) {
        PyObject *capitalized = fixup(
            (PyUnicodeObject *)PyList_GET_ITEM(words, pos),
            fixcapitalize);

        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* Drop the list's reference to the old word before storing the
           new one; PyList_SET_ITEM does not release the old item. */
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, capitalized);
        pos++;
    }

    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
6214
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006215/* Argument converter. Coerces to a single unicode character */
6216
6217static int
6218convert_uc(PyObject *obj, void *addr)
6219{
6220 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6221 PyObject *uniobj;
6222 Py_UNICODE *unistr;
6223
6224 uniobj = PyUnicode_FromObject(obj);
6225 if (uniobj == NULL) {
6226 PyErr_SetString(PyExc_TypeError,
6227 "The fill character cannot be converted to Unicode");
6228 return 0;
6229 }
6230 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6231 PyErr_SetString(PyExc_TypeError,
6232 "The fill character must be exactly one character long");
6233 Py_DECREF(uniobj);
6234 return 0;
6235 }
6236 unistr = PyUnicode_AS_UNICODE(uniobj);
6237 *fillcharloc = unistr[0];
6238 Py_DECREF(uniobj);
6239 return 1;
6240}
6241
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006242PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006243"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006244\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006245Return S centered in a Unicode string of length width. Padding is\n\
6246done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006247
6248static PyObject *
6249unicode_center(PyUnicodeObject *self, PyObject *args)
6250{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006251 Py_ssize_t marg, left;
6252 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006253 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006254
Thomas Woutersde017742006-02-16 19:34:37 +00006255 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006256 return NULL;
6257
Tim Peters7a29bd52001-09-12 03:03:31 +00006258 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006259 Py_INCREF(self);
6260 return (PyObject*) self;
6261 }
6262
6263 marg = width - self->length;
6264 left = marg / 2 + (marg & width & 1);
6265
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006266 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006267}
6268
Marc-André Lemburge5034372000-08-08 08:04:29 +00006269#if 0
6270
6271/* This code should go into some future Unicode collation support
6272 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006273 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006274
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006275/* speedy UTF-16 code point order comparison */
6276/* gleaned from: */
6277/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6278
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006279static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006280{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006281 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006282 0, 0, 0, 0, 0, 0, 0, 0,
6283 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006284 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006285};
6286
Guido van Rossumd57fd912000-03-10 22:53:23 +00006287static int
6288unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6289{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006290 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006291
Guido van Rossumd57fd912000-03-10 22:53:23 +00006292 Py_UNICODE *s1 = str1->str;
6293 Py_UNICODE *s2 = str2->str;
6294
6295 len1 = str1->length;
6296 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006297
Guido van Rossumd57fd912000-03-10 22:53:23 +00006298 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006299 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006300
6301 c1 = *s1++;
6302 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006303
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006304 if (c1 > (1<<11) * 26)
6305 c1 += utf16Fixup[c1>>11];
6306 if (c2 > (1<<11) * 26)
6307 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006308 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006309
6310 if (c1 != c2)
6311 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006312
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006313 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006314 }
6315
6316 return (len1 < len2) ? -1 : (len1 != len2);
6317}
6318
Marc-André Lemburge5034372000-08-08 08:04:29 +00006319#else
6320
6321static int
6322unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6323{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006324 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006325
6326 Py_UNICODE *s1 = str1->str;
6327 Py_UNICODE *s2 = str2->str;
6328
6329 len1 = str1->length;
6330 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006331
Marc-André Lemburge5034372000-08-08 08:04:29 +00006332 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006333 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006334
Fredrik Lundh45714e92001-06-26 16:39:36 +00006335 c1 = *s1++;
6336 c2 = *s2++;
6337
6338 if (c1 != c2)
6339 return (c1 < c2) ? -1 : 1;
6340
Marc-André Lemburge5034372000-08-08 08:04:29 +00006341 len1--; len2--;
6342 }
6343
6344 return (len1 < len2) ? -1 : (len1 != len2);
6345}
6346
6347#endif
6348
Guido van Rossumd57fd912000-03-10 22:53:23 +00006349int PyUnicode_Compare(PyObject *left,
6350 PyObject *right)
6351{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006352 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6353 return unicode_compare((PyUnicodeObject *)left,
6354 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006355 PyErr_Format(PyExc_TypeError,
6356 "Can't compare %.100s and %.100s",
6357 left->ob_type->tp_name,
6358 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006359 return -1;
6360}
6361
Martin v. Löwis5b222132007-06-10 09:51:05 +00006362int
6363PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6364{
6365 int i;
6366 Py_UNICODE *id;
6367 assert(PyUnicode_Check(uni));
6368 id = PyUnicode_AS_UNICODE(uni);
6369 /* Compare Unicode string and source character set string */
6370 for (i = 0; id[i] && str[i]; i++)
6371 if (id[i] != str[i])
6372 return ((int)id[i] < (int)str[i]) ? -1 : 1;
6373 if (id[i])
6374 return 1; /* uni is longer */
6375 if (str[i])
6376 return -1; /* str is longer */
6377 return 0;
6378}
6379
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006380PyObject *PyUnicode_RichCompare(PyObject *left,
6381 PyObject *right,
6382 int op)
6383{
6384 int result;
6385
6386 result = PyUnicode_Compare(left, right);
6387 if (result == -1 && PyErr_Occurred())
6388 goto onError;
6389
6390 /* Convert the return value to a Boolean */
6391 switch (op) {
6392 case Py_EQ:
6393 result = (result == 0);
6394 break;
6395 case Py_NE:
6396 result = (result != 0);
6397 break;
6398 case Py_LE:
6399 result = (result <= 0);
6400 break;
6401 case Py_GE:
6402 result = (result >= 0);
6403 break;
6404 case Py_LT:
6405 result = (result == -1);
6406 break;
6407 case Py_GT:
6408 result = (result == 1);
6409 break;
6410 }
6411 return PyBool_FromLong(result);
6412
6413 onError:
6414
6415 /* Standard case
6416
6417 Type errors mean that PyUnicode_FromObject() could not convert
6418 one of the arguments (usually the right hand side) to Unicode,
6419 ie. we can't handle the comparison request. However, it is
6420 possible that the other object knows a comparison method, which
6421 is why we return Py_NotImplemented to give the other object a
6422 chance.
6423
6424 */
6425 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6426 PyErr_Clear();
6427 Py_INCREF(Py_NotImplemented);
6428 return Py_NotImplemented;
6429 }
6430 if (op != Py_EQ && op != Py_NE)
6431 return NULL;
6432
6433 /* Equality comparison.
6434
6435 This is a special case: we silence any PyExc_UnicodeDecodeError
6436 and instead turn it into a PyErr_UnicodeWarning.
6437
6438 */
6439 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6440 return NULL;
6441 PyErr_Clear();
Skip Montanaro46fc3372007-08-12 11:44:53 +00006442 if (PyErr_WarnEx(PyExc_UnicodeWarning,
6443 (op == Py_EQ) ?
6444 "Unicode equal comparison "
6445 "failed to convert both arguments to Unicode - "
6446 "interpreting them as being unequal"
6447 :
6448 "Unicode unequal comparison "
6449 "failed to convert both arguments to Unicode - "
6450 "interpreting them as being unequal",
6451 1) < 0)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006452 return NULL;
6453 result = (op == Py_NE);
6454 return PyBool_FromLong(result);
6455}
6456
Guido van Rossum403d68b2000-03-13 15:55:09 +00006457int PyUnicode_Contains(PyObject *container,
6458 PyObject *element)
6459{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006460 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006461 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006462
6463 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006464 sub = PyUnicode_FromObject(element);
6465 if (!sub) {
Walter Dörwald26e0f512007-06-12 16:51:31 +00006466 PyErr_Format(PyExc_TypeError,
6467 "'in <string>' requires string as left operand, not %s",
6468 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006469 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006470 }
6471
Thomas Wouters477c8d52006-05-27 19:21:47 +00006472 str = PyUnicode_FromObject(container);
6473 if (!str) {
6474 Py_DECREF(sub);
6475 return -1;
6476 }
6477
6478 result = stringlib_contains_obj(str, sub);
6479
6480 Py_DECREF(str);
6481 Py_DECREF(sub);
6482
Guido van Rossum403d68b2000-03-13 15:55:09 +00006483 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006484}
6485
Guido van Rossumd57fd912000-03-10 22:53:23 +00006486/* Concat to string or Unicode object giving a new Unicode object. */
6487
6488PyObject *PyUnicode_Concat(PyObject *left,
6489 PyObject *right)
6490{
6491 PyUnicodeObject *u = NULL, *v = NULL, *w;
6492
6493 /* Coerce the two arguments */
6494 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6495 if (u == NULL)
6496 goto onError;
6497 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6498 if (v == NULL)
6499 goto onError;
6500
6501 /* Shortcuts */
6502 if (v == unicode_empty) {
6503 Py_DECREF(v);
6504 return (PyObject *)u;
6505 }
6506 if (u == unicode_empty) {
6507 Py_DECREF(u);
6508 return (PyObject *)v;
6509 }
6510
6511 /* Concat the two Unicode strings */
6512 w = _PyUnicode_New(u->length + v->length);
6513 if (w == NULL)
6514 goto onError;
6515 Py_UNICODE_COPY(w->str, u->str, u->length);
6516 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6517
6518 Py_DECREF(u);
6519 Py_DECREF(v);
6520 return (PyObject *)w;
6521
6522onError:
6523 Py_XDECREF(u);
6524 Py_XDECREF(v);
6525 return NULL;
6526}
6527
Walter Dörwald1ab83302007-05-18 17:15:44 +00006528void
6529PyUnicode_Append(PyObject **pleft, PyObject *right)
6530{
6531 PyObject *new;
6532 if (*pleft == NULL)
6533 return;
6534 if (right == NULL || !PyUnicode_Check(*pleft)) {
6535 Py_DECREF(*pleft);
6536 *pleft = NULL;
6537 return;
6538 }
6539 new = PyUnicode_Concat(*pleft, right);
6540 Py_DECREF(*pleft);
6541 *pleft = new;
6542}
6543
6544void
6545PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6546{
6547 PyUnicode_Append(pleft, right);
6548 Py_XDECREF(right);
6549}
6550
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006551PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006552"S.count(sub[, start[, end]]) -> int\n\
6553\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006554Return the number of non-overlapping occurrences of substring sub in\n\
6555Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006556interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006557
6558static PyObject *
6559unicode_count(PyUnicodeObject *self, PyObject *args)
6560{
6561 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006562 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006563 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006564 PyObject *result;
6565
Guido van Rossumb8872e62000-05-09 14:14:27 +00006566 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6567 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006568 return NULL;
6569
6570 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006571 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006572 if (substring == NULL)
6573 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006574
Thomas Wouters477c8d52006-05-27 19:21:47 +00006575 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006576
Christian Heimes217cfd12007-12-02 14:31:20 +00006577 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006578 stringlib_count(self->str + start, end - start,
6579 substring->str, substring->length)
6580 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006581
6582 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006583
Guido van Rossumd57fd912000-03-10 22:53:23 +00006584 return result;
6585}
6586
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006587PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006588"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006589\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006590Encodes S using the codec registered for encoding. encoding defaults\n\
6591to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006592handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006593a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6594'xmlcharrefreplace' as well as any other name registered with\n\
6595codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006596
6597static PyObject *
6598unicode_encode(PyUnicodeObject *self, PyObject *args)
6599{
6600 char *encoding = NULL;
6601 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006602 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00006603
Guido van Rossumd57fd912000-03-10 22:53:23 +00006604 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6605 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006606 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006607 if (v == NULL)
6608 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00006609 if (!PyString_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006610 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006611 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006612 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00006613 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006614 Py_DECREF(v);
6615 return NULL;
6616 }
6617 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006618
6619 onError:
6620 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006621}
6622
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006623PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006624"S.expandtabs([tabsize]) -> unicode\n\
6625\n\
6626Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006627If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006628
6629static PyObject*
6630unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6631{
6632 Py_UNICODE *e;
6633 Py_UNICODE *p;
6634 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006635 Py_UNICODE *qe;
6636 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006637 PyUnicodeObject *u;
6638 int tabsize = 8;
6639
6640 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6641 return NULL;
6642
Thomas Wouters7e474022000-07-16 12:04:32 +00006643 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006644 i = 0; /* chars up to and including most recent \n or \r */
6645 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6646 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006647 for (p = self->str; p < e; p++)
6648 if (*p == '\t') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006649 if (tabsize > 0) {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006650 incr = tabsize - (j % tabsize); /* cannot overflow */
6651 if (j > PY_SSIZE_T_MAX - incr)
6652 goto overflow1;
6653 j += incr;
6654 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006655 }
6656 else {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006657 if (j > PY_SSIZE_T_MAX - 1)
6658 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006659 j++;
6660 if (*p == '\n' || *p == '\r') {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006661 if (i > PY_SSIZE_T_MAX - j)
6662 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006663 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006664 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006665 }
6666 }
6667
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006668 if (i > PY_SSIZE_T_MAX - j)
6669 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006670
Guido van Rossumd57fd912000-03-10 22:53:23 +00006671 /* Second pass: create output string and fill it */
6672 u = _PyUnicode_New(i + j);
6673 if (!u)
6674 return NULL;
6675
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006676 j = 0; /* same as in first pass */
6677 q = u->str; /* next output char */
6678 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006679
6680 for (p = self->str; p < e; p++)
6681 if (*p == '\t') {
6682 if (tabsize > 0) {
6683 i = tabsize - (j % tabsize);
6684 j += i;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006685 while (i--) {
6686 if (q >= qe)
6687 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006688 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006689 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006690 }
6691 }
6692 else {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006693 if (q >= qe)
6694 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006695 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006696 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006697 if (*p == '\n' || *p == '\r')
6698 j = 0;
6699 }
6700
6701 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006702
6703 overflow2:
6704 Py_DECREF(u);
6705 overflow1:
6706 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6707 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006708}
6709
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006710PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006711"S.find(sub [,start [,end]]) -> int\n\
6712\n\
6713Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00006714such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006715arguments start and end are interpreted as in slice notation.\n\
6716\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006717Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006718
6719static PyObject *
6720unicode_find(PyUnicodeObject *self, PyObject *args)
6721{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006722 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00006723 Py_ssize_t start;
6724 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006725 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006726
Christian Heimes9cd17752007-11-18 19:35:23 +00006727 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006728 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006729
Thomas Wouters477c8d52006-05-27 19:21:47 +00006730 result = stringlib_find_slice(
6731 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6732 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6733 start, end
6734 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006735
6736 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006737
Christian Heimes217cfd12007-12-02 14:31:20 +00006738 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006739}
6740
6741static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006742unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006743{
6744 if (index < 0 || index >= self->length) {
6745 PyErr_SetString(PyExc_IndexError, "string index out of range");
6746 return NULL;
6747 }
6748
6749 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6750}
6751
Guido van Rossumc2504932007-09-18 19:42:40 +00006752/* Believe it or not, this produces the same value for ASCII strings
6753 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006754static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00006755unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006756{
Guido van Rossumc2504932007-09-18 19:42:40 +00006757 Py_ssize_t len;
6758 Py_UNICODE *p;
6759 long x;
6760
6761 if (self->hash != -1)
6762 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00006763 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00006764 p = self->str;
6765 x = *p << 7;
6766 while (--len >= 0)
6767 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00006768 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00006769 if (x == -1)
6770 x = -2;
6771 self->hash = x;
6772 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006773}
6774
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006775PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006776"S.index(sub [,start [,end]]) -> int\n\
6777\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006778Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006779
6780static PyObject *
6781unicode_index(PyUnicodeObject *self, PyObject *args)
6782{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006783 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006784 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00006785 Py_ssize_t start;
6786 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006787
Christian Heimes9cd17752007-11-18 19:35:23 +00006788 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006789 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006790
Thomas Wouters477c8d52006-05-27 19:21:47 +00006791 result = stringlib_find_slice(
6792 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6793 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6794 start, end
6795 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006796
6797 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006798
Guido van Rossumd57fd912000-03-10 22:53:23 +00006799 if (result < 0) {
6800 PyErr_SetString(PyExc_ValueError, "substring not found");
6801 return NULL;
6802 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006803
Christian Heimes217cfd12007-12-02 14:31:20 +00006804 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006805}
6806
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006807PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006808"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006809\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006810Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006811at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006812
6813static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006814unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006815{
6816 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6817 register const Py_UNICODE *e;
6818 int cased;
6819
Guido van Rossumd57fd912000-03-10 22:53:23 +00006820 /* Shortcut for single character strings */
6821 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006822 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006823
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006824 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006825 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006826 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006827
Guido van Rossumd57fd912000-03-10 22:53:23 +00006828 e = p + PyUnicode_GET_SIZE(self);
6829 cased = 0;
6830 for (; p < e; p++) {
6831 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006832
Guido van Rossumd57fd912000-03-10 22:53:23 +00006833 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006834 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006835 else if (!cased && Py_UNICODE_ISLOWER(ch))
6836 cased = 1;
6837 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006838 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006839}
6840
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006841PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006842"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006843\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006844Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006845at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006846
6847static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006848unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006849{
6850 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6851 register const Py_UNICODE *e;
6852 int cased;
6853
Guido van Rossumd57fd912000-03-10 22:53:23 +00006854 /* Shortcut for single character strings */
6855 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006856 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006857
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006858 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006859 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006860 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006861
Guido van Rossumd57fd912000-03-10 22:53:23 +00006862 e = p + PyUnicode_GET_SIZE(self);
6863 cased = 0;
6864 for (; p < e; p++) {
6865 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006866
Guido van Rossumd57fd912000-03-10 22:53:23 +00006867 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006868 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006869 else if (!cased && Py_UNICODE_ISUPPER(ch))
6870 cased = 1;
6871 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006872 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006873}
6874
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006875PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006876"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006877\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006878Return True if S is a titlecased string and there is at least one\n\
6879character in S, i.e. upper- and titlecase characters may only\n\
6880follow uncased characters and lowercase characters only cased ones.\n\
6881Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006882
6883static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006884unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006885{
6886 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6887 register const Py_UNICODE *e;
6888 int cased, previous_is_cased;
6889
Guido van Rossumd57fd912000-03-10 22:53:23 +00006890 /* Shortcut for single character strings */
6891 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006892 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6893 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006894
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006895 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006896 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006897 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006898
Guido van Rossumd57fd912000-03-10 22:53:23 +00006899 e = p + PyUnicode_GET_SIZE(self);
6900 cased = 0;
6901 previous_is_cased = 0;
6902 for (; p < e; p++) {
6903 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006904
Guido van Rossumd57fd912000-03-10 22:53:23 +00006905 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6906 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006907 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006908 previous_is_cased = 1;
6909 cased = 1;
6910 }
6911 else if (Py_UNICODE_ISLOWER(ch)) {
6912 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006913 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006914 previous_is_cased = 1;
6915 cased = 1;
6916 }
6917 else
6918 previous_is_cased = 0;
6919 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006920 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006921}
6922
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006923PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006924"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006925\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006926Return True if all characters in S are whitespace\n\
6927and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006928
6929static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006930unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006931{
6932 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6933 register const Py_UNICODE *e;
6934
Guido van Rossumd57fd912000-03-10 22:53:23 +00006935 /* Shortcut for single character strings */
6936 if (PyUnicode_GET_SIZE(self) == 1 &&
6937 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006938 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006939
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006940 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006941 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006942 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006943
Guido van Rossumd57fd912000-03-10 22:53:23 +00006944 e = p + PyUnicode_GET_SIZE(self);
6945 for (; p < e; p++) {
6946 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006947 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006948 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006949 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006950}
6951
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006952PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006953"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006954\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006955Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006956and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006957
6958static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006959unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006960{
6961 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6962 register const Py_UNICODE *e;
6963
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006964 /* Shortcut for single character strings */
6965 if (PyUnicode_GET_SIZE(self) == 1 &&
6966 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006967 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006968
6969 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006970 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006971 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006972
6973 e = p + PyUnicode_GET_SIZE(self);
6974 for (; p < e; p++) {
6975 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006976 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006977 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006978 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006979}
6980
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006981PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006982"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006983\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006984Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006985and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006986
6987static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006988unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006989{
6990 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6991 register const Py_UNICODE *e;
6992
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006993 /* Shortcut for single character strings */
6994 if (PyUnicode_GET_SIZE(self) == 1 &&
6995 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006996 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006997
6998 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006999 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007000 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007001
7002 e = p + PyUnicode_GET_SIZE(self);
7003 for (; p < e; p++) {
7004 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007005 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007006 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007007 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007008}
7009
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007010PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007011"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007012\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007013Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007014False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007015
7016static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007017unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007018{
7019 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7020 register const Py_UNICODE *e;
7021
Guido van Rossumd57fd912000-03-10 22:53:23 +00007022 /* Shortcut for single character strings */
7023 if (PyUnicode_GET_SIZE(self) == 1 &&
7024 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007025 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007026
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007027 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007028 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007029 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007030
Guido van Rossumd57fd912000-03-10 22:53:23 +00007031 e = p + PyUnicode_GET_SIZE(self);
7032 for (; p < e; p++) {
7033 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007034 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007035 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007036 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007037}
7038
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007039PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007040"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007041\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007042Return True if all characters in S are digits\n\
7043and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007044
7045static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007046unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007047{
7048 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7049 register const Py_UNICODE *e;
7050
Guido van Rossumd57fd912000-03-10 22:53:23 +00007051 /* Shortcut for single character strings */
7052 if (PyUnicode_GET_SIZE(self) == 1 &&
7053 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007054 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007055
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007056 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007057 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007058 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007059
Guido van Rossumd57fd912000-03-10 22:53:23 +00007060 e = p + PyUnicode_GET_SIZE(self);
7061 for (; p < e; p++) {
7062 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007063 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007064 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007065 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007066}
7067
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007068PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007069"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007070\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007071Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007072False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007073
7074static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007075unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007076{
7077 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7078 register const Py_UNICODE *e;
7079
Guido van Rossumd57fd912000-03-10 22:53:23 +00007080 /* Shortcut for single character strings */
7081 if (PyUnicode_GET_SIZE(self) == 1 &&
7082 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007083 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007084
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007085 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007086 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007087 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007088
Guido van Rossumd57fd912000-03-10 22:53:23 +00007089 e = p + PyUnicode_GET_SIZE(self);
7090 for (; p < e; p++) {
7091 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007092 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007093 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007094 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007095}
7096
Martin v. Löwis47383402007-08-15 07:32:56 +00007097int
7098PyUnicode_IsIdentifier(PyObject *self)
7099{
7100 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7101 register const Py_UNICODE *e;
7102
7103 /* Special case for empty strings */
7104 if (PyUnicode_GET_SIZE(self) == 0)
7105 return 0;
7106
7107 /* PEP 3131 says that the first character must be in
7108 XID_Start and subsequent characters in XID_Continue,
7109 and for the ASCII range, the 2.x rules apply (i.e
7110 start with letters and underscore, continue with
7111 letters, digits, underscore). However, given the current
7112 definition of XID_Start and XID_Continue, it is sufficient
7113 to check just for these, except that _ must be allowed
7114 as starting an identifier. */
7115 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7116 return 0;
7117
7118 e = p + PyUnicode_GET_SIZE(self);
7119 for (p++; p < e; p++) {
7120 if (!_PyUnicode_IsXidContinue(*p))
7121 return 0;
7122 }
7123 return 1;
7124}
7125
7126PyDoc_STRVAR(isidentifier__doc__,
7127"S.isidentifier() -> bool\n\
7128\n\
7129Return True if S is a valid identifier according\n\
7130to the language definition.");
7131
7132static PyObject*
7133unicode_isidentifier(PyObject *self)
7134{
7135 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7136}
7137
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007138PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007139"S.join(sequence) -> unicode\n\
7140\n\
7141Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007142sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007143
7144static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007145unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007146{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007147 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007148}
7149
Martin v. Löwis18e16552006-02-15 17:27:45 +00007150static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007151unicode_length(PyUnicodeObject *self)
7152{
7153 return self->length;
7154}
7155
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007156PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00007157"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007158\n\
7159Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007160done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007161
7162static PyObject *
7163unicode_ljust(PyUnicodeObject *self, PyObject *args)
7164{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007165 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007166 Py_UNICODE fillchar = ' ';
7167
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007168 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007169 return NULL;
7170
Tim Peters7a29bd52001-09-12 03:03:31 +00007171 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007172 Py_INCREF(self);
7173 return (PyObject*) self;
7174 }
7175
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007176 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007177}
7178
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007179PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007180"S.lower() -> unicode\n\
7181\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007182Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007183
7184static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007185unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007186{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007187 return fixup(self, fixlower);
7188}
7189
/* Selectors for which end(s) of the string a strip operation trims */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the selectors above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for selector i: the format string without its
   three-character "|O:" prefix */
#define STRIPNAME(i) (&stripformat[i][3])
7198
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007199/* externally visible for str.strip(unicode) */
7200PyObject *
7201_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7202{
7203 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007204 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007205 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007206 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7207 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007208
Thomas Wouters477c8d52006-05-27 19:21:47 +00007209 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7210
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007211 i = 0;
7212 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007213 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7214 i++;
7215 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007216 }
7217
7218 j = len;
7219 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007220 do {
7221 j--;
7222 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7223 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007224 }
7225
7226 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007227 Py_INCREF(self);
7228 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007229 }
7230 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00007231 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007232}
7233
Guido van Rossumd57fd912000-03-10 22:53:23 +00007234
7235static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007236do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007237{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007238 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007239 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007240
7241 i = 0;
7242 if (striptype != RIGHTSTRIP) {
7243 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7244 i++;
7245 }
7246 }
7247
7248 j = len;
7249 if (striptype != LEFTSTRIP) {
7250 do {
7251 j--;
7252 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7253 j++;
7254 }
7255
7256 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7257 Py_INCREF(self);
7258 return (PyObject*)self;
7259 }
7260 else
7261 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007262}
7263
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007264
7265static PyObject *
7266do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7267{
7268 PyObject *sep = NULL;
7269
7270 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7271 return NULL;
7272
7273 if (sep != NULL && sep != Py_None) {
7274 if (PyUnicode_Check(sep))
7275 return _PyUnicode_XStrip(self, striptype, sep);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007276 else {
7277 PyErr_Format(PyExc_TypeError,
7278 "%s arg must be None, unicode or str",
7279 STRIPNAME(striptype));
7280 return NULL;
7281 }
7282 }
7283
7284 return do_strip(self, striptype);
7285}
7286
7287
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007288PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007289"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007290\n\
7291Return a copy of the string S with leading and trailing\n\
7292whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007293If chars is given and not None, remove characters in chars instead.\n\
7294If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007295
7296static PyObject *
7297unicode_strip(PyUnicodeObject *self, PyObject *args)
7298{
7299 if (PyTuple_GET_SIZE(args) == 0)
7300 return do_strip(self, BOTHSTRIP); /* Common case */
7301 else
7302 return do_argstrip(self, BOTHSTRIP, args);
7303}
7304
7305
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007306PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007307"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007308\n\
7309Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007310If chars is given and not None, remove characters in chars instead.\n\
7311If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007312
7313static PyObject *
7314unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7315{
7316 if (PyTuple_GET_SIZE(args) == 0)
7317 return do_strip(self, LEFTSTRIP); /* Common case */
7318 else
7319 return do_argstrip(self, LEFTSTRIP, args);
7320}
7321
7322
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007323PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007324"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007325\n\
7326Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007327If chars is given and not None, remove characters in chars instead.\n\
7328If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007329
7330static PyObject *
7331unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7332{
7333 if (PyTuple_GET_SIZE(args) == 0)
7334 return do_strip(self, RIGHTSTRIP); /* Common case */
7335 else
7336 return do_argstrip(self, RIGHTSTRIP, args);
7337}
7338
7339
Guido van Rossumd57fd912000-03-10 22:53:23 +00007340static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007341unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007342{
7343 PyUnicodeObject *u;
7344 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007345 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007346 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007347
7348 if (len < 0)
7349 len = 0;
7350
Tim Peters7a29bd52001-09-12 03:03:31 +00007351 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007352 /* no repeat, return original string */
7353 Py_INCREF(str);
7354 return (PyObject*) str;
7355 }
Tim Peters8f422462000-09-09 06:13:41 +00007356
7357 /* ensure # of chars needed doesn't overflow int and # of bytes
7358 * needed doesn't overflow size_t
7359 */
7360 nchars = len * str->length;
7361 if (len && nchars / len != str->length) {
7362 PyErr_SetString(PyExc_OverflowError,
7363 "repeated string is too long");
7364 return NULL;
7365 }
7366 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7367 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7368 PyErr_SetString(PyExc_OverflowError,
7369 "repeated string is too long");
7370 return NULL;
7371 }
7372 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007373 if (!u)
7374 return NULL;
7375
7376 p = u->str;
7377
Thomas Wouters477c8d52006-05-27 19:21:47 +00007378 if (str->length == 1 && len > 0) {
7379 Py_UNICODE_FILL(p, str->str[0], len);
7380 } else {
7381 Py_ssize_t done = 0; /* number of characters copied this far */
7382 if (done < nchars) {
7383 Py_UNICODE_COPY(p, str->str, str->length);
7384 done = str->length;
7385 }
7386 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00007387 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007388 Py_UNICODE_COPY(p+done, p, n);
7389 done += n;
7390 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007391 }
7392
7393 return (PyObject*) u;
7394}
7395
7396PyObject *PyUnicode_Replace(PyObject *obj,
7397 PyObject *subobj,
7398 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007399 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007400{
7401 PyObject *self;
7402 PyObject *str1;
7403 PyObject *str2;
7404 PyObject *result;
7405
7406 self = PyUnicode_FromObject(obj);
7407 if (self == NULL)
7408 return NULL;
7409 str1 = PyUnicode_FromObject(subobj);
7410 if (str1 == NULL) {
7411 Py_DECREF(self);
7412 return NULL;
7413 }
7414 str2 = PyUnicode_FromObject(replobj);
7415 if (str2 == NULL) {
7416 Py_DECREF(self);
7417 Py_DECREF(str1);
7418 return NULL;
7419 }
Tim Petersced69f82003-09-16 20:30:58 +00007420 result = replace((PyUnicodeObject *)self,
7421 (PyUnicodeObject *)str1,
7422 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007423 maxcount);
7424 Py_DECREF(self);
7425 Py_DECREF(str1);
7426 Py_DECREF(str2);
7427 return result;
7428}
7429
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007430PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007431"S.replace (old, new[, maxsplit]) -> unicode\n\
7432\n\
7433Return a copy of S with all occurrences of substring\n\
7434old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007435given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007436
7437static PyObject*
7438unicode_replace(PyUnicodeObject *self, PyObject *args)
7439{
7440 PyUnicodeObject *str1;
7441 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007442 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007443 PyObject *result;
7444
Martin v. Löwis18e16552006-02-15 17:27:45 +00007445 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007446 return NULL;
7447 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7448 if (str1 == NULL)
7449 return NULL;
7450 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007451 if (str2 == NULL) {
7452 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007453 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007454 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007455
7456 result = replace(self, str1, str2, maxcount);
7457
7458 Py_DECREF(str1);
7459 Py_DECREF(str2);
7460 return result;
7461}
7462
7463static
7464PyObject *unicode_repr(PyObject *unicode)
7465{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007466 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007467 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007468 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7469 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7470
7471 /* XXX(nnorwitz): rather than over-allocating, it would be
7472 better to choose a different scheme. Perhaps scan the
7473 first N-chars of the string and allocate based on that size.
7474 */
7475 /* Initial allocation is based on the longest-possible unichr
7476 escape.
7477
7478 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7479 unichr, so in this case it's the longest unichr escape. In
7480 narrow (UTF-16) builds this is five chars per source unichr
7481 since there are two unichrs in the surrogate pair, so in narrow
7482 (UTF-16) builds it's not the longest unichr escape.
7483
7484 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7485 so in the narrow (UTF-16) build case it's the longest unichr
7486 escape.
7487 */
7488
Walter Dörwald1ab83302007-05-18 17:15:44 +00007489 repr = PyUnicode_FromUnicode(NULL,
Walter Dörwald79e913e2007-05-12 11:08:06 +00007490 2 /* quotes */
7491#ifdef Py_UNICODE_WIDE
7492 + 10*size
7493#else
7494 + 6*size
7495#endif
7496 + 1);
7497 if (repr == NULL)
7498 return NULL;
7499
Walter Dörwald1ab83302007-05-18 17:15:44 +00007500 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007501
7502 /* Add quote */
7503 *p++ = (findchar(s, size, '\'') &&
7504 !findchar(s, size, '"')) ? '"' : '\'';
7505 while (size-- > 0) {
7506 Py_UNICODE ch = *s++;
7507
7508 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007509 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007510 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007511 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007512 continue;
7513 }
7514
7515#ifdef Py_UNICODE_WIDE
7516 /* Map 21-bit characters to '\U00xxxxxx' */
7517 else if (ch >= 0x10000) {
7518 *p++ = '\\';
7519 *p++ = 'U';
7520 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
7521 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
7522 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
7523 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
7524 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
7525 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
7526 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
7527 *p++ = hexdigits[ch & 0x0000000F];
7528 continue;
7529 }
7530#else
7531 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
7532 else if (ch >= 0xD800 && ch < 0xDC00) {
7533 Py_UNICODE ch2;
7534 Py_UCS4 ucs;
7535
7536 ch2 = *s++;
7537 size--;
7538 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
7539 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
7540 *p++ = '\\';
7541 *p++ = 'U';
7542 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7543 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7544 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7545 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7546 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7547 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7548 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7549 *p++ = hexdigits[ucs & 0x0000000F];
7550 continue;
7551 }
7552 /* Fall through: isolated surrogates are copied as-is */
7553 s--;
7554 size++;
7555 }
7556#endif
7557
7558 /* Map 16-bit characters to '\uxxxx' */
7559 if (ch >= 256) {
7560 *p++ = '\\';
7561 *p++ = 'u';
7562 *p++ = hexdigits[(ch >> 12) & 0x000F];
7563 *p++ = hexdigits[(ch >> 8) & 0x000F];
7564 *p++ = hexdigits[(ch >> 4) & 0x000F];
7565 *p++ = hexdigits[ch & 0x000F];
7566 }
7567
7568 /* Map special whitespace to '\t', \n', '\r' */
7569 else if (ch == '\t') {
7570 *p++ = '\\';
7571 *p++ = 't';
7572 }
7573 else if (ch == '\n') {
7574 *p++ = '\\';
7575 *p++ = 'n';
7576 }
7577 else if (ch == '\r') {
7578 *p++ = '\\';
7579 *p++ = 'r';
7580 }
7581
7582 /* Map non-printable US ASCII to '\xhh' */
7583 else if (ch < ' ' || ch >= 0x7F) {
7584 *p++ = '\\';
7585 *p++ = 'x';
7586 *p++ = hexdigits[(ch >> 4) & 0x000F];
7587 *p++ = hexdigits[ch & 0x000F];
7588 }
7589
7590 /* Copy everything else as-is */
7591 else
7592 *p++ = (char) ch;
7593 }
7594 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007595 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00007596
7597 *p = '\0';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007598 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00007599 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007600}
7601
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007602PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007603"S.rfind(sub [,start [,end]]) -> int\n\
7604\n\
7605Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007606such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007607arguments start and end are interpreted as in slice notation.\n\
7608\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007609Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007610
7611static PyObject *
7612unicode_rfind(PyUnicodeObject *self, PyObject *args)
7613{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007614 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007615 Py_ssize_t start;
7616 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007617 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007618
Christian Heimes9cd17752007-11-18 19:35:23 +00007619 if (!_ParseTupleFinds(args, &substring, &start, &end))
7620 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007621
Thomas Wouters477c8d52006-05-27 19:21:47 +00007622 result = stringlib_rfind_slice(
7623 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7624 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7625 start, end
7626 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007627
7628 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007629
Christian Heimes217cfd12007-12-02 14:31:20 +00007630 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007631}
7632
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007633PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007634"S.rindex(sub [,start [,end]]) -> int\n\
7635\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007636Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007637
7638static PyObject *
7639unicode_rindex(PyUnicodeObject *self, PyObject *args)
7640{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007641 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007642 Py_ssize_t start;
7643 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007644 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007645
Christian Heimes9cd17752007-11-18 19:35:23 +00007646 if (!_ParseTupleFinds(args, &substring, &start, &end))
7647 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007648
Thomas Wouters477c8d52006-05-27 19:21:47 +00007649 result = stringlib_rfind_slice(
7650 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7651 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7652 start, end
7653 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007654
7655 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007656
Guido van Rossumd57fd912000-03-10 22:53:23 +00007657 if (result < 0) {
7658 PyErr_SetString(PyExc_ValueError, "substring not found");
7659 return NULL;
7660 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007661 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007662}
7663
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007664PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007665"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007666\n\
7667Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007668done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007669
7670static PyObject *
7671unicode_rjust(PyUnicodeObject *self, PyObject *args)
7672{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007673 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007674 Py_UNICODE fillchar = ' ';
7675
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007676 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007677 return NULL;
7678
Tim Peters7a29bd52001-09-12 03:03:31 +00007679 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007680 Py_INCREF(self);
7681 return (PyObject*) self;
7682 }
7683
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007684 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007685}
7686
Guido van Rossumd57fd912000-03-10 22:53:23 +00007687PyObject *PyUnicode_Split(PyObject *s,
7688 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007689 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007690{
7691 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007692
Guido van Rossumd57fd912000-03-10 22:53:23 +00007693 s = PyUnicode_FromObject(s);
7694 if (s == NULL)
7695 return NULL;
7696 if (sep != NULL) {
7697 sep = PyUnicode_FromObject(sep);
7698 if (sep == NULL) {
7699 Py_DECREF(s);
7700 return NULL;
7701 }
7702 }
7703
7704 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7705
7706 Py_DECREF(s);
7707 Py_XDECREF(sep);
7708 return result;
7709}
7710
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007711PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007712"S.split([sep [,maxsplit]]) -> list of strings\n\
7713\n\
7714Return a list of the words in S, using sep as the\n\
7715delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00007716splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00007717any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007718
7719static PyObject*
7720unicode_split(PyUnicodeObject *self, PyObject *args)
7721{
7722 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007723 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007724
Martin v. Löwis18e16552006-02-15 17:27:45 +00007725 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007726 return NULL;
7727
7728 if (substring == Py_None)
7729 return split(self, NULL, maxcount);
7730 else if (PyUnicode_Check(substring))
7731 return split(self, (PyUnicodeObject *)substring, maxcount);
7732 else
7733 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7734}
7735
Thomas Wouters477c8d52006-05-27 19:21:47 +00007736PyObject *
7737PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7738{
7739 PyObject* str_obj;
7740 PyObject* sep_obj;
7741 PyObject* out;
7742
7743 str_obj = PyUnicode_FromObject(str_in);
7744 if (!str_obj)
7745 return NULL;
7746 sep_obj = PyUnicode_FromObject(sep_in);
7747 if (!sep_obj) {
7748 Py_DECREF(str_obj);
7749 return NULL;
7750 }
7751
7752 out = stringlib_partition(
7753 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7754 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7755 );
7756
7757 Py_DECREF(sep_obj);
7758 Py_DECREF(str_obj);
7759
7760 return out;
7761}
7762
7763
7764PyObject *
7765PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7766{
7767 PyObject* str_obj;
7768 PyObject* sep_obj;
7769 PyObject* out;
7770
7771 str_obj = PyUnicode_FromObject(str_in);
7772 if (!str_obj)
7773 return NULL;
7774 sep_obj = PyUnicode_FromObject(sep_in);
7775 if (!sep_obj) {
7776 Py_DECREF(str_obj);
7777 return NULL;
7778 }
7779
7780 out = stringlib_rpartition(
7781 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7782 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7783 );
7784
7785 Py_DECREF(sep_obj);
7786 Py_DECREF(str_obj);
7787
7788 return out;
7789}
7790
7791PyDoc_STRVAR(partition__doc__,
7792"S.partition(sep) -> (head, sep, tail)\n\
7793\n\
7794Searches for the separator sep in S, and returns the part before it,\n\
7795the separator itself, and the part after it. If the separator is not\n\
7796found, returns S and two empty strings.");
7797
7798static PyObject*
7799unicode_partition(PyUnicodeObject *self, PyObject *separator)
7800{
7801 return PyUnicode_Partition((PyObject *)self, separator);
7802}
7803
7804PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007805"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007806\n\
7807Searches for the separator sep in S, starting at the end of S, and returns\n\
7808the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00007809separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007810
7811static PyObject*
7812unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7813{
7814 return PyUnicode_RPartition((PyObject *)self, separator);
7815}
7816
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007817PyObject *PyUnicode_RSplit(PyObject *s,
7818 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007819 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007820{
7821 PyObject *result;
7822
7823 s = PyUnicode_FromObject(s);
7824 if (s == NULL)
7825 return NULL;
7826 if (sep != NULL) {
7827 sep = PyUnicode_FromObject(sep);
7828 if (sep == NULL) {
7829 Py_DECREF(s);
7830 return NULL;
7831 }
7832 }
7833
7834 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7835
7836 Py_DECREF(s);
7837 Py_XDECREF(sep);
7838 return result;
7839}
7840
7841PyDoc_STRVAR(rsplit__doc__,
7842"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7843\n\
7844Return a list of the words in S, using sep as the\n\
7845delimiter string, starting at the end of the string and\n\
7846working to the front. If maxsplit is given, at most maxsplit\n\
7847splits are done. If sep is not specified, any whitespace string\n\
7848is a separator.");
7849
7850static PyObject*
7851unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7852{
7853 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007854 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007855
Martin v. Löwis18e16552006-02-15 17:27:45 +00007856 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007857 return NULL;
7858
7859 if (substring == Py_None)
7860 return rsplit(self, NULL, maxcount);
7861 else if (PyUnicode_Check(substring))
7862 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7863 else
7864 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7865}
7866
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007867PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007868"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007869\n\
7870Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007871Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007872is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007873
7874static PyObject*
7875unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7876{
Guido van Rossum86662912000-04-11 15:38:46 +00007877 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007878
Guido van Rossum86662912000-04-11 15:38:46 +00007879 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007880 return NULL;
7881
Guido van Rossum86662912000-04-11 15:38:46 +00007882 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007883}
7884
7885static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007886PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007887{
Walter Dörwald346737f2007-05-31 10:44:43 +00007888 if (PyUnicode_CheckExact(self)) {
7889 Py_INCREF(self);
7890 return self;
7891 } else
7892 /* Subtype -- return genuine unicode string with the same value. */
7893 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
7894 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007895}
7896
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007897PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007898"S.swapcase() -> unicode\n\
7899\n\
7900Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007901and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007902
7903static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007904unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007905{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007906 return fixup(self, fixswapcase);
7907}
7908
Georg Brandlceee0772007-11-27 23:48:05 +00007909PyDoc_STRVAR(maketrans__doc__,
7910"str.maketrans(x[, y[, z]]) -> dict (static method)\n\
7911\n\
7912Return a translation table usable for str.translate().\n\
7913If there is only one argument, it must be a dictionary mapping Unicode\n\
7914ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
7915Character keys will then be converted to ordinals.\n\
7916If there are two arguments, they must be strings of equal length, and\n\
7917in the resulting dictionary, each character in x will be mapped to the\n\
7918character at the same position in y. If there is a third argument, it\n\
7919must be a string, whose characters will be mapped to None in the result.");
7920
7921static PyObject*
7922unicode_maketrans(PyUnicodeObject *null, PyObject *args)
7923{
7924 PyObject *x, *y = NULL, *z = NULL;
7925 PyObject *new = NULL, *key, *value;
7926 Py_ssize_t i = 0;
7927 int res;
7928
7929 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
7930 return NULL;
7931 new = PyDict_New();
7932 if (!new)
7933 return NULL;
7934 if (y != NULL) {
7935 /* x must be a string too, of equal length */
7936 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
7937 if (!PyUnicode_Check(x)) {
7938 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
7939 "be a string if there is a second argument");
7940 goto err;
7941 }
7942 if (PyUnicode_GET_SIZE(x) != ylen) {
7943 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
7944 "arguments must have equal length");
7945 goto err;
7946 }
7947 /* create entries for translating chars in x to those in y */
7948 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00007949 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
7950 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00007951 if (!key || !value)
7952 goto err;
7953 res = PyDict_SetItem(new, key, value);
7954 Py_DECREF(key);
7955 Py_DECREF(value);
7956 if (res < 0)
7957 goto err;
7958 }
7959 /* create entries for deleting chars in z */
7960 if (z != NULL) {
7961 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00007962 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00007963 if (!key)
7964 goto err;
7965 res = PyDict_SetItem(new, key, Py_None);
7966 Py_DECREF(key);
7967 if (res < 0)
7968 goto err;
7969 }
7970 }
7971 } else {
7972 /* x must be a dict */
7973 if (!PyDict_Check(x)) {
7974 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
7975 "to maketrans it must be a dict");
7976 goto err;
7977 }
7978 /* copy entries into the new dict, converting string keys to int keys */
7979 while (PyDict_Next(x, &i, &key, &value)) {
7980 if (PyUnicode_Check(key)) {
7981 /* convert string keys to integer keys */
7982 PyObject *newkey;
7983 if (PyUnicode_GET_SIZE(key) != 1) {
7984 PyErr_SetString(PyExc_ValueError, "string keys in translate "
7985 "table must be of length 1");
7986 goto err;
7987 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007988 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00007989 if (!newkey)
7990 goto err;
7991 res = PyDict_SetItem(new, newkey, value);
7992 Py_DECREF(newkey);
7993 if (res < 0)
7994 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00007995 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00007996 /* just keep integer keys */
7997 if (PyDict_SetItem(new, key, value) < 0)
7998 goto err;
7999 } else {
8000 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8001 "be strings or integers");
8002 goto err;
8003 }
8004 }
8005 }
8006 return new;
8007 err:
8008 Py_DECREF(new);
8009 return NULL;
8010}
8011
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008012PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008013"S.translate(table) -> unicode\n\
8014\n\
8015Return a copy of the string S, where all characters have been mapped\n\
8016through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008017Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
8018Unmapped characters are left untouched. Characters mapped to None\n\
8019are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008020
8021static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008022unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008023{
Georg Brandlceee0772007-11-27 23:48:05 +00008024 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008025}
8026
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008027PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008028"S.upper() -> unicode\n\
8029\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008030Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008031
8032static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008033unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008034{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008035 return fixup(self, fixupper);
8036}
8037
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008038PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008039"S.zfill(width) -> unicode\n\
8040\n\
8041Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008042of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008043
8044static PyObject *
8045unicode_zfill(PyUnicodeObject *self, PyObject *args)
8046{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008047 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008048 PyUnicodeObject *u;
8049
Martin v. Löwis18e16552006-02-15 17:27:45 +00008050 Py_ssize_t width;
8051 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008052 return NULL;
8053
8054 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008055 if (PyUnicode_CheckExact(self)) {
8056 Py_INCREF(self);
8057 return (PyObject*) self;
8058 }
8059 else
8060 return PyUnicode_FromUnicode(
8061 PyUnicode_AS_UNICODE(self),
8062 PyUnicode_GET_SIZE(self)
8063 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008064 }
8065
8066 fill = width - self->length;
8067
8068 u = pad(self, fill, 0, '0');
8069
Walter Dörwald068325e2002-04-15 13:36:47 +00008070 if (u == NULL)
8071 return NULL;
8072
Guido van Rossumd57fd912000-03-10 22:53:23 +00008073 if (u->str[fill] == '+' || u->str[fill] == '-') {
8074 /* move sign to beginning of string */
8075 u->str[0] = u->str[fill];
8076 u->str[fill] = '0';
8077 }
8078
8079 return (PyObject*) u;
8080}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008081
/* Disabled by the surrounding #if 0, so never compiled: returns the
   value of numfree as a Python int.  The matching method-table entry
   is also under #if 0 and is labelled there as a debugging aid. */
8082#if 0
8083static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008084unicode_freelistsize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008085{
Christian Heimes2202f872008-02-06 14:31:34 +00008086 return PyLong_FromLong(numfree);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008087}
8088#endif
8089
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008090PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00008091"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008092\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008093Return True if S starts with the specified prefix, False otherwise.\n\
8094With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008095With optional end, stop comparing S at that position.\n\
8096prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008097
8098static PyObject *
8099unicode_startswith(PyUnicodeObject *self,
8100 PyObject *args)
8101{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008102 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008103 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008104 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008105 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008106 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008107
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008108 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00008109 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008110 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008111 if (PyTuple_Check(subobj)) {
8112 Py_ssize_t i;
8113 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8114 substring = (PyUnicodeObject *)PyUnicode_FromObject(
8115 PyTuple_GET_ITEM(subobj, i));
8116 if (substring == NULL)
8117 return NULL;
8118 result = tailmatch(self, substring, start, end, -1);
8119 Py_DECREF(substring);
8120 if (result) {
8121 Py_RETURN_TRUE;
8122 }
8123 }
8124 /* nothing matched */
8125 Py_RETURN_FALSE;
8126 }
8127 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008128 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008129 return NULL;
8130 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008131 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008132 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008133}
8134
8135
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008136PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00008137"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008138\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008139Return True if S ends with the specified suffix, False otherwise.\n\
8140With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008141With optional end, stop comparing S at that position.\n\
8142suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008143
8144static PyObject *
8145unicode_endswith(PyUnicodeObject *self,
8146 PyObject *args)
8147{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008148 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008149 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008150 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008151 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008152 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008153
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008154 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
8155 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008156 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008157 if (PyTuple_Check(subobj)) {
8158 Py_ssize_t i;
8159 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8160 substring = (PyUnicodeObject *)PyUnicode_FromObject(
8161 PyTuple_GET_ITEM(subobj, i));
8162 if (substring == NULL)
8163 return NULL;
8164 result = tailmatch(self, substring, start, end, +1);
8165 Py_DECREF(substring);
8166 if (result) {
8167 Py_RETURN_TRUE;
8168 }
8169 }
8170 Py_RETURN_FALSE;
8171 }
8172 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008173 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008174 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008175
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008176 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008177 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008178 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008179}
8180
Eric Smith8c663262007-08-25 02:26:07 +00008181#include "stringlib/string_format.h"
8182
/* Docstring for the "format" method-table entry.  Only the signature
   line is present; no descriptive text follows it. */
8183PyDoc_STRVAR(format__doc__,
8184"S.format(*args, **kwargs) -> unicode\n\
8185\n\
8186");
8187
/* Docstring for the "__format__" method-table entry.  Only the
   signature line is present; no descriptive text follows it. */
Eric Smith8c663262007-08-25 02:26:07 +00008188PyDoc_STRVAR(p_format__doc__,
8189"S.__format__(format_spec) -> unicode\n\
8190\n\
8191");
8192
8193static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008194unicode_getnewargs(PyUnicodeObject *v)
8195{
8196 return Py_BuildValue("(u#)", v->str, v->length);
8197}
8198
8199
/* Method table for the unicode type.  Each entry gives the Python-level
   name, the C implementation cast to PyCFunction, the calling-convention
   flags (METH_NOARGS, METH_O, METH_VARARGS, optionally combined with
   METH_KEYWORDS or METH_STATIC) and, where supplied, the docstring.
   Entries wrapped in #if 0 are compiled out.  The table ends with the
   {NULL, NULL} sentinel. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008200static PyMethodDef unicode_methods[] = {
8201
8202 /* Order is according to common usage: often used methods should
8203 appear first, since lookup is done sequentially. */
8204
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008205 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
8206 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8207 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008208 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008209 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8210 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8211 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8212 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8213 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8214 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8215 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008216 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008217 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8218 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8219 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008220 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008221 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8222 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8223 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008224 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008225 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008226 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008227 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008228 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8229 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8230 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8231 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8232 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8233 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8234 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8235 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8236 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8237 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8238 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8239 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8240 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8241 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008242 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008243 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008244 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
8245 {"__format__", (PyCFunction) unicode_unicode__format__, METH_VARARGS, p_format__doc__},
/* The two _formatter_* entries below are registered without a
   docstring field. */
Eric Smithf6db4092007-08-27 23:52:26 +00008246 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8247 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
/* maketrans is the only entry flagged METH_STATIC in this table. */
Georg Brandlceee0772007-11-27 23:48:05 +00008248 {"maketrans", (PyCFunction) unicode_maketrans,
8249 METH_VARARGS | METH_STATIC, maketrans__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008250#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008251 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008252#endif
8253
8254#if 0
8255 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008256 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008257#endif
8258
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008259 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008260 {NULL, NULL}
8261};
8262
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008263static PyObject *
8264unicode_mod(PyObject *v, PyObject *w)
8265{
8266 if (!PyUnicode_Check(v)) {
8267 Py_INCREF(Py_NotImplemented);
8268 return Py_NotImplemented;
8269 }
8270 return PyUnicode_Format(v, w);
8271}
8272
/* Number-protocol slots for the unicode type.  Only nb_remainder is
   populated (with unicode_mod); the three slots before it are set to 0
   and every slot after it is left out of the initializer, so C
   zero-initializes it. */
8273static PyNumberMethods unicode_as_number = {
8274 0, /*nb_add*/
8275 0, /*nb_subtract*/
8276 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008277 unicode_mod, /*nb_remainder*/
8278};
8279
/* Sequence-protocol slots for the unicode type: length, concatenation,
   repetition, item access and containment are provided; the slice and
   item/slice assignment slots are 0. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008280static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008281 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008282 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008283 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8284 (ssizeargfunc) unicode_getitem, /* sq_item */
Thomas Woutersd2cf20e2007-08-30 22:57:53 +00008285 0, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008286 0, /* sq_ass_item */
8287 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008288 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008289};
8290
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008291static PyObject*
8292unicode_subscript(PyUnicodeObject* self, PyObject* item)
8293{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008294 if (PyIndex_Check(item)) {
8295 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008296 if (i == -1 && PyErr_Occurred())
8297 return NULL;
8298 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008299 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008300 return unicode_getitem(self, i);
8301 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008302 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008303 Py_UNICODE* source_buf;
8304 Py_UNICODE* result_buf;
8305 PyObject* result;
8306
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008307 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008308 &start, &stop, &step, &slicelength) < 0) {
8309 return NULL;
8310 }
8311
8312 if (slicelength <= 0) {
8313 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008314 } else if (start == 0 && step == 1 && slicelength == self->length &&
8315 PyUnicode_CheckExact(self)) {
8316 Py_INCREF(self);
8317 return (PyObject *)self;
8318 } else if (step == 1) {
8319 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008320 } else {
8321 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00008322 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8323 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008324
8325 if (result_buf == NULL)
8326 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008327
8328 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8329 result_buf[i] = source_buf[cur];
8330 }
Tim Petersced69f82003-09-16 20:30:58 +00008331
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008332 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00008333 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008334 return result;
8335 }
8336 } else {
8337 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8338 return NULL;
8339 }
8340}
8341
8342static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008343 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008344 (binaryfunc)unicode_subscript, /* mp_subscript */
8345 (objobjargproc)0, /* mp_ass_subscript */
8346};
8347
Guido van Rossumd57fd912000-03-10 22:53:23 +00008348
Guido van Rossumd57fd912000-03-10 22:53:23 +00008349/* Helpers for PyUnicode_Format() */
8350
8351static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008352getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008353{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008354 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008355 if (argidx < arglen) {
8356 (*p_argidx)++;
8357 if (arglen < 0)
8358 return args;
8359 else
8360 return PyTuple_GetItem(args, argidx);
8361 }
8362 PyErr_SetString(PyExc_TypeError,
8363 "not enough arguments for format string");
8364 return NULL;
8365}
8366
Martin v. Löwis18e16552006-02-15 17:27:45 +00008367static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008368strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008369{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008370 register Py_ssize_t i;
8371 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008372 for (i = len - 1; i >= 0; i--)
8373 buffer[i] = (Py_UNICODE) charbuffer[i];
8374
Guido van Rossumd57fd912000-03-10 22:53:23 +00008375 return len;
8376}
8377
Neal Norwitzfc76d632006-01-10 06:03:13 +00008378static int
8379doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8380{
Tim Peters15231542006-02-16 01:08:01 +00008381 Py_ssize_t result;
8382
Neal Norwitzfc76d632006-01-10 06:03:13 +00008383 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008384 result = strtounicode(buffer, (char *)buffer);
8385 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008386}
8387
#if 0
/* Compiled out.  Formats the long x with PyOS_snprintf() into buffer (used
   as a char array of at most len bytes), then widens the text in place
   with strtounicode().  Returns the character count as an int. */
static int
longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
{
    char *ascii = (char *)buffer;
    Py_ssize_t nchars;

    PyOS_snprintf(ascii, len, format, x);
    nchars = strtounicode(buffer, ascii);
    return Py_SAFE_DOWNCAST(nchars, Py_ssize_t, int);
}
#endif
Neal Norwitzfc76d632006-01-10 06:03:13 +00008399
/* XXX To save some code duplication, formatfloat/long/int could have been
   shared with stringobject.c, converting from 8-bit to Unicode after the
   formatting is done. */
8403
Guido van Rossumd57fd912000-03-10 22:53:23 +00008404static int
8405formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008406 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008407 int flags,
8408 int prec,
8409 int type,
8410 PyObject *v)
8411{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008412 /* fmt = '%#.' + `prec` + `type`
8413 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00008414 char fmt[20];
8415 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008416
Guido van Rossumd57fd912000-03-10 22:53:23 +00008417 x = PyFloat_AsDouble(v);
8418 if (x == -1.0 && PyErr_Occurred())
8419 return -1;
8420 if (prec < 0)
8421 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008422 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8423 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008424 /* Worst case length calc to ensure no buffer overrun:
8425
8426 'g' formats:
8427 fmt = %#.<prec>g
8428 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8429 for any double rep.)
8430 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8431
8432 'f' formats:
8433 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8434 len = 1 + 50 + 1 + prec = 52 + prec
8435
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008436 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008437 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008438
8439 */
Guido van Rossumb5a755e2007-07-18 18:15:48 +00008440 if (((type == 'g' || type == 'G') &&
8441 buflen <= (size_t)10 + (size_t)prec) ||
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008442 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008443 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008444 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008445 return -1;
8446 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008447 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8448 (flags&F_ALT) ? "#" : "",
8449 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008450 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008451}
8452
Tim Peters38fd5b62000-09-21 05:43:11 +00008453static PyObject*
8454formatlong(PyObject *val, int flags, int prec, int type)
8455{
8456 char *buf;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008457 int len;
Tim Peters38fd5b62000-09-21 05:43:11 +00008458 PyObject *str; /* temporary string object. */
Walter Dörwald63a28be2007-06-20 15:11:12 +00008459 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008460
8461 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8462 if (!str)
8463 return NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008464 result = PyUnicode_FromStringAndSize(buf, len);
Tim Peters38fd5b62000-09-21 05:43:11 +00008465 Py_DECREF(str);
Walter Dörwald63a28be2007-06-20 15:11:12 +00008466 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008467}
8468
#if 0
/* Compiled out.  Formats the Python integer v as a C long into buf using a
   printf-style format assembled from flags, prec and type, and returns the
   number of characters written or -1 with an exception set. */
static int
formatint(Py_UNICODE *buf,
          size_t buflen,
          int flags,
          int prec,
          int type,
          PyObject *v)
{
    /* fmt = '%#.' + `prec` + 'l' + `type`
     * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
     *                       + 1 + 1
     *                   = 24
     */
    char fmt[64];       /* plenty big enough! */
    const char *sign = "";
    int based;
    long x;

    x = PyLong_AsLong(v);
    if (x == -1 && PyErr_Occurred())
        return -1;

    based = (type == 'x' || type == 'X' || type == 'o');
    if (x < 0) {
        /* Negative values: %u is treated as %d, and for the based
           conversions the sign is emitted by hand and the magnitude is
           formatted. */
        if (type == 'u')
            type = 'd';
        if (based)
            sign = "-";
    }
    if (prec < 0)
        prec = 1;

    /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
     * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
     */
    if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
        PyErr_SetString(PyExc_OverflowError,
                        "formatted integer is too long (precision too large?)");
        return -1;
    }

    if ((flags & F_ALT) && based) {
        /* Under %#o, %#x and %#X the C library is not relied upon for the
         * base marker:
         *  - for %#o a different marker than C's is wanted;
         *  - the C standard drops '0x'/'0X' when converting 0, which is
         *    inconsistent with other conversions and with hex();
         *  - some platforms (Metrowerks, Compaq Tru64) add the prefix for
         *    0 anyway, and OS/2 EMX gives '0x' under %#X.
         * Inserting our own prefix and using the plain conversion gives
         * consistent output.  formatint() in stringobject.c uses the same
         * approach.
         */
        PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
                      sign, type, prec, type);
    }
    else {
        PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
                      sign, (flags&F_ALT) ? "#" : "",
                      prec, type);
    }

    return longtounicode(buf, buflen, fmt, sign[0] ? -x : x);
}
#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008546
8547static int
8548formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008549 size_t buflen,
8550 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008551{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008552 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008553 if (PyUnicode_Check(v)) {
8554 if (PyUnicode_GET_SIZE(v) != 1)
8555 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008556 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008557 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008558 else {
8559 /* Integer input truncated to a character */
8560 long x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008561 x = PyLong_AsLong(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008562 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008563 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008564#ifdef Py_UNICODE_WIDE
8565 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008566 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008567 "%c arg not in range(0x110000) "
8568 "(wide Python build)");
8569 return -1;
8570 }
8571#else
8572 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008573 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008574 "%c arg not in range(0x10000) "
8575 "(narrow Python build)");
8576 return -1;
8577 }
8578#endif
8579 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008580 }
8581 buf[1] = '\0';
8582 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008583
8584 onError:
8585 PyErr_SetString(PyExc_TypeError,
8586 "%c requires int or char");
8587 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008588}
8589
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the number of Py_UNICODE elements in the scratch buffer
   that PyUnicode_Format() passes to the float and char formatters.
   XXX This is a magic number.  formatfloat() checks its worst-case output
   length against the buffer size, while formatchar() merely presumes room
   for two characters; allocating a buffer sized for each conversion may be
   a better solution.  For now, the current solution is sufficient.
*/
#define FORMATBUFLEN ((size_t)120)
8599
Guido van Rossumd57fd912000-03-10 22:53:23 +00008600PyObject *PyUnicode_Format(PyObject *format,
8601 PyObject *args)
8602{
8603 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008604 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008605 int args_owned = 0;
8606 PyUnicodeObject *result = NULL;
8607 PyObject *dict = NULL;
8608 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008609
Guido van Rossumd57fd912000-03-10 22:53:23 +00008610 if (format == NULL || args == NULL) {
8611 PyErr_BadInternalCall();
8612 return NULL;
8613 }
8614 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008615 if (uformat == NULL)
8616 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008617 fmt = PyUnicode_AS_UNICODE(uformat);
8618 fmtcnt = PyUnicode_GET_SIZE(uformat);
8619
8620 reslen = rescnt = fmtcnt + 100;
8621 result = _PyUnicode_New(reslen);
8622 if (result == NULL)
8623 goto onError;
8624 res = PyUnicode_AS_UNICODE(result);
8625
8626 if (PyTuple_Check(args)) {
8627 arglen = PyTuple_Size(args);
8628 argidx = 0;
8629 }
8630 else {
8631 arglen = -1;
8632 argidx = -2;
8633 }
Christian Heimes90aa7642007-12-19 02:45:37 +00008634 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00008635 !PyUnicode_Check(args))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008636 dict = args;
8637
8638 while (--fmtcnt >= 0) {
8639 if (*fmt != '%') {
8640 if (--rescnt < 0) {
8641 rescnt = fmtcnt + 100;
8642 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008643 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008644 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008645 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8646 --rescnt;
8647 }
8648 *res++ = *fmt++;
8649 }
8650 else {
8651 /* Got a format specifier */
8652 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008653 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008654 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008655 Py_UNICODE c = '\0';
8656 Py_UNICODE fill;
Christian Heimesa612dc02008-02-24 13:08:18 +00008657 int isnumok;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008658 PyObject *v = NULL;
8659 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008660 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008661 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008662 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008663 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008664
8665 fmt++;
8666 if (*fmt == '(') {
8667 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008668 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008669 PyObject *key;
8670 int pcount = 1;
8671
8672 if (dict == NULL) {
8673 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008674 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008675 goto onError;
8676 }
8677 ++fmt;
8678 --fmtcnt;
8679 keystart = fmt;
8680 /* Skip over balanced parentheses */
8681 while (pcount > 0 && --fmtcnt >= 0) {
8682 if (*fmt == ')')
8683 --pcount;
8684 else if (*fmt == '(')
8685 ++pcount;
8686 fmt++;
8687 }
8688 keylen = fmt - keystart - 1;
8689 if (fmtcnt < 0 || pcount > 0) {
8690 PyErr_SetString(PyExc_ValueError,
8691 "incomplete format key");
8692 goto onError;
8693 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008694#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008695 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008696 then looked up since Python uses strings to hold
8697 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008698 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008699 key = PyUnicode_EncodeUTF8(keystart,
8700 keylen,
8701 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008702#else
8703 key = PyUnicode_FromUnicode(keystart, keylen);
8704#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008705 if (key == NULL)
8706 goto onError;
8707 if (args_owned) {
8708 Py_DECREF(args);
8709 args_owned = 0;
8710 }
8711 args = PyObject_GetItem(dict, key);
8712 Py_DECREF(key);
8713 if (args == NULL) {
8714 goto onError;
8715 }
8716 args_owned = 1;
8717 arglen = -1;
8718 argidx = -2;
8719 }
8720 while (--fmtcnt >= 0) {
8721 switch (c = *fmt++) {
8722 case '-': flags |= F_LJUST; continue;
8723 case '+': flags |= F_SIGN; continue;
8724 case ' ': flags |= F_BLANK; continue;
8725 case '#': flags |= F_ALT; continue;
8726 case '0': flags |= F_ZERO; continue;
8727 }
8728 break;
8729 }
8730 if (c == '*') {
8731 v = getnextarg(args, arglen, &argidx);
8732 if (v == NULL)
8733 goto onError;
Christian Heimes217cfd12007-12-02 14:31:20 +00008734 if (!PyLong_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008735 PyErr_SetString(PyExc_TypeError,
8736 "* wants int");
8737 goto onError;
8738 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008739 width = PyLong_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008740 if (width == -1 && PyErr_Occurred())
8741 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008742 if (width < 0) {
8743 flags |= F_LJUST;
8744 width = -width;
8745 }
8746 if (--fmtcnt >= 0)
8747 c = *fmt++;
8748 }
8749 else if (c >= '0' && c <= '9') {
8750 width = c - '0';
8751 while (--fmtcnt >= 0) {
8752 c = *fmt++;
8753 if (c < '0' || c > '9')
8754 break;
8755 if ((width*10) / 10 != width) {
8756 PyErr_SetString(PyExc_ValueError,
8757 "width too big");
8758 goto onError;
8759 }
8760 width = width*10 + (c - '0');
8761 }
8762 }
8763 if (c == '.') {
8764 prec = 0;
8765 if (--fmtcnt >= 0)
8766 c = *fmt++;
8767 if (c == '*') {
8768 v = getnextarg(args, arglen, &argidx);
8769 if (v == NULL)
8770 goto onError;
Christian Heimes217cfd12007-12-02 14:31:20 +00008771 if (!PyLong_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008772 PyErr_SetString(PyExc_TypeError,
8773 "* wants int");
8774 goto onError;
8775 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008776 prec = PyLong_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008777 if (prec == -1 && PyErr_Occurred())
8778 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008779 if (prec < 0)
8780 prec = 0;
8781 if (--fmtcnt >= 0)
8782 c = *fmt++;
8783 }
8784 else if (c >= '0' && c <= '9') {
8785 prec = c - '0';
8786 while (--fmtcnt >= 0) {
8787 c = Py_CHARMASK(*fmt++);
8788 if (c < '0' || c > '9')
8789 break;
8790 if ((prec*10) / 10 != prec) {
8791 PyErr_SetString(PyExc_ValueError,
8792 "prec too big");
8793 goto onError;
8794 }
8795 prec = prec*10 + (c - '0');
8796 }
8797 }
8798 } /* prec */
8799 if (fmtcnt >= 0) {
8800 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008801 if (--fmtcnt >= 0)
8802 c = *fmt++;
8803 }
8804 }
8805 if (fmtcnt < 0) {
8806 PyErr_SetString(PyExc_ValueError,
8807 "incomplete format");
8808 goto onError;
8809 }
8810 if (c != '%') {
8811 v = getnextarg(args, arglen, &argidx);
8812 if (v == NULL)
8813 goto onError;
8814 }
8815 sign = 0;
8816 fill = ' ';
8817 switch (c) {
8818
8819 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008820 pbuf = formatbuf;
8821 /* presume that buffer length is at least 1 */
8822 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008823 len = 1;
8824 break;
8825
8826 case 's':
8827 case 'r':
8828 if (PyUnicode_Check(v) && c == 's') {
8829 temp = v;
8830 Py_INCREF(temp);
8831 }
8832 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008833 if (c == 's')
Thomas Heller519a0422007-11-15 20:48:54 +00008834 temp = PyObject_Str(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008835 else
8836 temp = PyObject_Repr(v);
8837 if (temp == NULL)
8838 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008839 if (PyUnicode_Check(temp))
8840 /* nothing to do */;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008841 else {
8842 Py_DECREF(temp);
8843 PyErr_SetString(PyExc_TypeError,
8844 "%s argument has non-string str()");
8845 goto onError;
8846 }
8847 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008848 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008849 len = PyUnicode_GET_SIZE(temp);
8850 if (prec >= 0 && len > prec)
8851 len = prec;
8852 break;
8853
8854 case 'i':
8855 case 'd':
8856 case 'u':
8857 case 'o':
8858 case 'x':
8859 case 'X':
8860 if (c == 'i')
8861 c = 'd';
Christian Heimesa612dc02008-02-24 13:08:18 +00008862 isnumok = 0;
8863 if (PyNumber_Check(v)) {
8864 PyObject *iobj=NULL;
8865
8866 if (PyLong_Check(v)) {
8867 iobj = v;
8868 Py_INCREF(iobj);
8869 }
8870 else {
8871 iobj = PyNumber_Long(v);
8872 }
8873 if (iobj!=NULL) {
8874 if (PyLong_Check(iobj)) {
8875 isnumok = 1;
8876 temp = formatlong(iobj, flags, prec, c);
8877 Py_DECREF(iobj);
8878 if (!temp)
8879 goto onError;
8880 pbuf = PyUnicode_AS_UNICODE(temp);
8881 len = PyUnicode_GET_SIZE(temp);
8882 sign = 1;
8883 }
8884 else {
8885 Py_DECREF(iobj);
8886 }
8887 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008888 }
Christian Heimesa612dc02008-02-24 13:08:18 +00008889 if (!isnumok) {
8890 PyErr_Format(PyExc_TypeError,
8891 "%%%c format: a number is required, "
8892 "not %.200s", c, Py_TYPE(v)->tp_name);
Tim Peters38fd5b62000-09-21 05:43:11 +00008893 goto onError;
Tim Peters38fd5b62000-09-21 05:43:11 +00008894 }
8895 if (flags & F_ZERO)
8896 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008897 break;
8898
8899 case 'e':
8900 case 'E':
8901 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008902 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008903 case 'g':
8904 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008905 if (c == 'F')
8906 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008907 pbuf = formatbuf;
8908 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8909 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008910 if (len < 0)
8911 goto onError;
8912 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008913 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008914 fill = '0';
8915 break;
8916
8917 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008918 pbuf = formatbuf;
8919 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008920 if (len < 0)
8921 goto onError;
8922 break;
8923
8924 default:
8925 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008926 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00008927 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008928 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008929 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00008930 (Py_ssize_t)(fmt - 1 -
8931 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008932 goto onError;
8933 }
8934 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008935 if (*pbuf == '-' || *pbuf == '+') {
8936 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008937 len--;
8938 }
8939 else if (flags & F_SIGN)
8940 sign = '+';
8941 else if (flags & F_BLANK)
8942 sign = ' ';
8943 else
8944 sign = 0;
8945 }
8946 if (width < len)
8947 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008948 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008949 reslen -= rescnt;
8950 rescnt = width + fmtcnt + 100;
8951 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008952 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008953 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008954 PyErr_NoMemory();
8955 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008956 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008957 if (_PyUnicode_Resize(&result, reslen) < 0) {
8958 Py_XDECREF(temp);
8959 goto onError;
8960 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008961 res = PyUnicode_AS_UNICODE(result)
8962 + reslen - rescnt;
8963 }
8964 if (sign) {
8965 if (fill != ' ')
8966 *res++ = sign;
8967 rescnt--;
8968 if (width > len)
8969 width--;
8970 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008971 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008972 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008973 assert(pbuf[1] == c);
8974 if (fill != ' ') {
8975 *res++ = *pbuf++;
8976 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008977 }
Tim Petersfff53252001-04-12 18:38:48 +00008978 rescnt -= 2;
8979 width -= 2;
8980 if (width < 0)
8981 width = 0;
8982 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008983 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008984 if (width > len && !(flags & F_LJUST)) {
8985 do {
8986 --rescnt;
8987 *res++ = fill;
8988 } while (--width > len);
8989 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008990 if (fill == ' ') {
8991 if (sign)
8992 *res++ = sign;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008993 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008994 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008995 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008996 *res++ = *pbuf++;
8997 *res++ = *pbuf++;
8998 }
8999 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009000 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009001 res += len;
9002 rescnt -= len;
9003 while (--width >= len) {
9004 --rescnt;
9005 *res++ = ' ';
9006 }
9007 if (dict && (argidx < arglen) && c != '%') {
9008 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00009009 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009010 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009011 goto onError;
9012 }
9013 Py_XDECREF(temp);
9014 } /* '%' */
9015 } /* until end */
9016 if (argidx < arglen && !dict) {
9017 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00009018 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009019 goto onError;
9020 }
9021
Thomas Woutersa96affe2006-03-12 00:29:36 +00009022 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
9023 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009024 if (args_owned) {
9025 Py_DECREF(args);
9026 }
9027 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009028 return (PyObject *)result;
9029
9030 onError:
9031 Py_XDECREF(result);
9032 Py_DECREF(uformat);
9033 if (args_owned) {
9034 Py_DECREF(args);
9035 }
9036 return NULL;
9037}
9038
Jeremy Hylton938ace62002-07-17 16:30:39 +00009039static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009040unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9041
Tim Peters6d6c1a32001-08-02 04:15:00 +00009042static PyObject *
9043unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9044{
9045 PyObject *x = NULL;
Guido van Rossum55b4a7b2007-07-11 09:28:11 +00009046 static char *kwlist[] = {"object", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00009047 char *encoding = NULL;
9048 char *errors = NULL;
9049
Guido van Rossume023fe02001-08-30 03:12:59 +00009050 if (type != &PyUnicode_Type)
9051 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009052 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
9053 kwlist, &x, &encoding, &errors))
9054 return NULL;
9055 if (x == NULL)
9056 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00009057 if (encoding == NULL && errors == NULL)
Thomas Heller519a0422007-11-15 20:48:54 +00009058 return PyObject_Str(x);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00009059 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00009060 return PyUnicode_FromEncodedObject(x, encoding, errors);
9061}
9062
Guido van Rossume023fe02001-08-30 03:12:59 +00009063static PyObject *
9064unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9065{
Tim Petersaf90b3e2001-09-12 05:18:58 +00009066 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009067 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009068
9069 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9070 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9071 if (tmp == NULL)
9072 return NULL;
9073 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00009074 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00009075 if (pnew == NULL) {
9076 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00009077 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00009078 }
Christian Heimesb186d002008-03-18 15:15:01 +00009079 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
Tim Petersaf90b3e2001-09-12 05:18:58 +00009080 if (pnew->str == NULL) {
9081 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00009082 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00009083 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00009084 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00009085 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00009086 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9087 pnew->length = n;
9088 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00009089 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00009090 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009091}
9092
/* Docstring for the type; installed as tp_doc of PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
"str(string [, encoding[, errors]]) -> object\n\
\n\
Create a new string object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009099
/* Forward declaration: the iterator factory is defined after the type
   object that references it in its tp_iter slot. */
static PyObject *unicode_iter(PyObject *seq);

/* Type object for the string type, exposed under the name "str".
   The type is subclassable (Py_TPFLAGS_BASETYPE); instances are created
   through unicode_new and released through unicode_dealloc. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                              /* tp_name */
    sizeof(PyUnicodeObject),            /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,        /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_compare */
    unicode_repr,                       /* tp_repr */
    &unicode_as_number,                 /* tp_as_number */
    &unicode_as_sequence,               /* tp_as_sequence */
    &unicode_as_mapping,                /* tp_as_mapping */
    (hashfunc) unicode_hash,            /* tp_hash*/
    0,                                  /* tp_call*/
    (reprfunc) unicode_str,             /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    0,                                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
        Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
    unicode_doc,                        /* tp_doc */
    0,                                  /* tp_traverse */
    0,                                  /* tp_clear */
    PyUnicode_RichCompare,              /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    unicode_iter,                       /* tp_iter */
    0,                                  /* tp_iternext */
    unicode_methods,                    /* tp_methods */
    0,                                  /* tp_members */
    0,                                  /* tp_getset */
    &PyBaseObject_Type,                 /* tp_base */
    0,                                  /* tp_dict */
    0,                                  /* tp_descr_get */
    0,                                  /* tp_descr_set */
    0,                                  /* tp_dictoffset */
    0,                                  /* tp_init */
    0,                                  /* tp_alloc */
    unicode_new,                        /* tp_new */
    PyObject_Del,                       /* tp_free */
};
9145
9146/* Initialize the Unicode implementation */
9147
Thomas Wouters78890102000-07-22 19:25:51 +00009148void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009149{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009150 int i;
9151
Thomas Wouters477c8d52006-05-27 19:21:47 +00009152 /* XXX - move this array to unicodectype.c ? */
9153 Py_UNICODE linebreak[] = {
9154 0x000A, /* LINE FEED */
9155 0x000D, /* CARRIAGE RETURN */
9156 0x001C, /* FILE SEPARATOR */
9157 0x001D, /* GROUP SEPARATOR */
9158 0x001E, /* RECORD SEPARATOR */
9159 0x0085, /* NEXT LINE */
9160 0x2028, /* LINE SEPARATOR */
9161 0x2029, /* PARAGRAPH SEPARATOR */
9162 };
9163
Fred Drakee4315f52000-05-09 19:53:39 +00009164 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009165 free_list = NULL;
9166 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009167 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009168 if (!unicode_empty)
9169 return;
9170
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009171 for (i = 0; i < 256; i++)
9172 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009173 if (PyType_Ready(&PyUnicode_Type) < 0)
9174 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009175
9176 /* initialize the linebreak bloom filter */
9177 bloom_linebreak = make_bloom_mask(
9178 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9179 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009180
9181 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009182}
9183
9184/* Finalize the Unicode implementation */
9185
Christian Heimesa156e092008-02-16 07:38:31 +00009186int
9187PyUnicode_ClearFreeList(void)
9188{
9189 int freelist_size = numfree;
9190 PyUnicodeObject *u;
9191
9192 for (u = free_list; u != NULL;) {
9193 PyUnicodeObject *v = u;
9194 u = *(PyUnicodeObject **)u;
9195 if (v->str)
Christian Heimesb186d002008-03-18 15:15:01 +00009196 PyObject_DEL(v->str);
Christian Heimesa156e092008-02-16 07:38:31 +00009197 Py_XDECREF(v->defenc);
9198 PyObject_Del(v);
9199 numfree--;
9200 }
9201 free_list = NULL;
9202 assert(numfree == 0);
9203 return freelist_size;
9204}
9205
Guido van Rossumd57fd912000-03-10 22:53:23 +00009206void
Thomas Wouters78890102000-07-22 19:25:51 +00009207_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009208{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009209 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009210
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009211 Py_XDECREF(unicode_empty);
9212 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009213
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009214 for (i = 0; i < 256; i++) {
9215 if (unicode_latin1[i]) {
9216 Py_DECREF(unicode_latin1[i]);
9217 unicode_latin1[i] = NULL;
9218 }
9219 }
Christian Heimesa156e092008-02-16 07:38:31 +00009220 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009221}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009222
Walter Dörwald16807132007-05-25 13:52:07 +00009223void
9224PyUnicode_InternInPlace(PyObject **p)
9225{
9226 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9227 PyObject *t;
9228 if (s == NULL || !PyUnicode_Check(s))
9229 Py_FatalError(
9230 "PyUnicode_InternInPlace: unicode strings only please!");
9231 /* If it's a subclass, we don't really know what putting
9232 it in the interned dict might do. */
9233 if (!PyUnicode_CheckExact(s))
9234 return;
9235 if (PyUnicode_CHECK_INTERNED(s))
9236 return;
9237 if (interned == NULL) {
9238 interned = PyDict_New();
9239 if (interned == NULL) {
9240 PyErr_Clear(); /* Don't leave an exception */
9241 return;
9242 }
9243 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009244 /* It might be that the GetItem call fails even
9245 though the key is present in the dictionary,
9246 namely when this happens during a stack overflow. */
9247 Py_ALLOW_RECURSION
Walter Dörwald16807132007-05-25 13:52:07 +00009248 t = PyDict_GetItem(interned, (PyObject *)s);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009249 Py_END_ALLOW_RECURSION
9250
Walter Dörwald16807132007-05-25 13:52:07 +00009251 if (t) {
9252 Py_INCREF(t);
9253 Py_DECREF(*p);
9254 *p = t;
9255 return;
9256 }
9257
Martin v. Löwis5b222132007-06-10 09:51:05 +00009258 PyThreadState_GET()->recursion_critical = 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009259 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9260 PyErr_Clear();
Martin v. Löwis5b222132007-06-10 09:51:05 +00009261 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009262 return;
9263 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009264 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009265 /* The two references in interned are not counted by refcnt.
9266 The deallocator will take care of this */
Christian Heimes90aa7642007-12-19 02:45:37 +00009267 Py_REFCNT(s) -= 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009268 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
9269}
9270
9271void
9272PyUnicode_InternImmortal(PyObject **p)
9273{
9274 PyUnicode_InternInPlace(p);
9275 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9276 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9277 Py_INCREF(*p);
9278 }
9279}
9280
9281PyObject *
9282PyUnicode_InternFromString(const char *cp)
9283{
9284 PyObject *s = PyUnicode_FromString(cp);
9285 if (s == NULL)
9286 return NULL;
9287 PyUnicode_InternInPlace(&s);
9288 return s;
9289}
9290
9291void _Py_ReleaseInternedUnicodeStrings(void)
9292{
9293 PyObject *keys;
9294 PyUnicodeObject *s;
9295 Py_ssize_t i, n;
9296 Py_ssize_t immortal_size = 0, mortal_size = 0;
9297
9298 if (interned == NULL || !PyDict_Check(interned))
9299 return;
9300 keys = PyDict_Keys(interned);
9301 if (keys == NULL || !PyList_Check(keys)) {
9302 PyErr_Clear();
9303 return;
9304 }
9305
9306 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9307 detector, interned unicode strings are not forcibly deallocated;
9308 rather, we give them their stolen references back, and then clear
9309 and DECREF the interned dict. */
9310
9311 n = PyList_GET_SIZE(keys);
9312 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
9313 n);
9314 for (i = 0; i < n; i++) {
9315 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9316 switch (s->state) {
9317 case SSTATE_NOT_INTERNED:
9318 /* XXX Shouldn't happen */
9319 break;
9320 case SSTATE_INTERNED_IMMORTAL:
Christian Heimes90aa7642007-12-19 02:45:37 +00009321 Py_REFCNT(s) += 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009322 immortal_size += s->length;
9323 break;
9324 case SSTATE_INTERNED_MORTAL:
Christian Heimes90aa7642007-12-19 02:45:37 +00009325 Py_REFCNT(s) += 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009326 mortal_size += s->length;
9327 break;
9328 default:
9329 Py_FatalError("Inconsistent interned string state.");
9330 }
9331 s->state = SSTATE_NOT_INTERNED;
9332 }
9333 fprintf(stderr, "total size of all interned strings: "
9334 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9335 "mortal/immortal\n", mortal_size, immortal_size);
9336 Py_DECREF(keys);
9337 PyDict_Clear(interned);
9338 Py_DECREF(interned);
9339 interned = NULL;
9340}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009341
9342
9343/********************* Unicode Iterator **************************/
9344
/* State of an iterator over a unicode object. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;        /* index of the next element to return */
    PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
9350
9351static void
9352unicodeiter_dealloc(unicodeiterobject *it)
9353{
9354 _PyObject_GC_UNTRACK(it);
9355 Py_XDECREF(it->it_seq);
9356 PyObject_GC_Del(it);
9357}
9358
9359static int
9360unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9361{
9362 Py_VISIT(it->it_seq);
9363 return 0;
9364}
9365
9366static PyObject *
9367unicodeiter_next(unicodeiterobject *it)
9368{
9369 PyUnicodeObject *seq;
9370 PyObject *item;
9371
9372 assert(it != NULL);
9373 seq = it->it_seq;
9374 if (seq == NULL)
9375 return NULL;
9376 assert(PyUnicode_Check(seq));
9377
9378 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009379 item = PyUnicode_FromUnicode(
9380 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009381 if (item != NULL)
9382 ++it->it_index;
9383 return item;
9384 }
9385
9386 Py_DECREF(seq);
9387 it->it_seq = NULL;
9388 return NULL;
9389}
9390
9391static PyObject *
9392unicodeiter_len(unicodeiterobject *it)
9393{
9394 Py_ssize_t len = 0;
9395 if (it->it_seq)
9396 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
Christian Heimes217cfd12007-12-02 14:31:20 +00009397 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009398}
9399
/* Docstring for the iterator's __length_hint__ method. */
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");

/* Method table of the string iterator type. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {NULL,      NULL}       /* sentinel */
};
9407
/* Type object of the iterator returned by unicode_iter().  Instances are
   GC-enabled (Py_TPFLAGS_HAVE_GC) and are their own iterator
   (tp_iter is PyObject_SelfIter). */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",                     /* tp_name */
    sizeof(unicodeiterobject),          /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_compare */
    0,                                  /* tp_repr */
    0,                                  /* tp_as_number */
    0,                                  /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    0,                                  /* tp_hash */
    0,                                  /* tp_call */
    0,                                  /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    0,                                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    PyObject_SelfIter,                  /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,                /* tp_methods */
    0,                                  /* tp_members */
};
9440
9441static PyObject *
9442unicode_iter(PyObject *seq)
9443{
9444 unicodeiterobject *it;
9445
9446 if (!PyUnicode_Check(seq)) {
9447 PyErr_BadInternalCall();
9448 return NULL;
9449 }
9450 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9451 if (it == NULL)
9452 return NULL;
9453 it->it_index = 0;
9454 Py_INCREF(seq);
9455 it->it_seq = (PyUnicodeObject *)seq;
9456 _PyObject_GC_TRACK(it);
9457 return (PyObject *)it;
9458}
9459
Martin v. Löwis5b222132007-06-10 09:51:05 +00009460size_t
9461Py_UNICODE_strlen(const Py_UNICODE *u)
9462{
9463 int res = 0;
9464 while(*u++)
9465 res++;
9466 return res;
9467}
9468
9469Py_UNICODE*
9470Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9471{
9472 Py_UNICODE *u = s1;
9473 while ((*u++ = *s2++));
9474 return s1;
9475}
9476
9477Py_UNICODE*
9478Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9479{
9480 Py_UNICODE *u = s1;
9481 while ((*u++ = *s2++))
9482 if (n-- == 0)
9483 break;
9484 return s1;
9485}
9486
9487int
9488Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9489{
9490 while (*s1 && *s2 && *s1 == *s2)
9491 s1++, s2++;
9492 if (*s1 && *s2)
9493 return (*s1 < *s2) ? -1 : +1;
9494 if (*s1)
9495 return 1;
9496 if (*s2)
9497 return -1;
9498 return 0;
9499}
9500
9501Py_UNICODE*
9502Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9503{
9504 const Py_UNICODE *p;
9505 for (p = s; *p; p++)
9506 if (*p == c)
9507 return (Py_UNICODE*)p;
9508 return NULL;
9509}
9510
9511
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009512#ifdef __cplusplus
9513}
9514#endif
9515
9516
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009517/*
9518Local variables:
9519c-basic-offset: 4
9520indent-tabs-mode: nil
9521End:
9522*/