blob: dab4000a69f0b07367e43d146cf52b2169f48b31 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Eric Smith8c663262007-08-25 02:26:07 +000049#include "formatter_unicode.h"
50
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000051#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000052#include <windows.h>
53#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000054
Guido van Rossumd57fd912000-03-10 22:53:23 +000055/* Limit for the Unicode object free list */
56
Christian Heimes2202f872008-02-06 14:31:34 +000057#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000058
59/* Limit for the Unicode object free list stay alive optimization.
60
61 The implementation will keep allocated Unicode memory intact for
62 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000063 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000064
Christian Heimes2202f872008-02-06 14:31:34 +000065 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000066 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000067 malloc()-overhead) bytes of unused garbage.
68
69 Setting the limit to 0 effectively turns the feature off.
70
Guido van Rossumfd4b9572000-04-10 13:51:10 +000071 Note: This is an experimental feature ! If you get core dumps when
72 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000073
74*/
75
Guido van Rossumfd4b9572000-04-10 13:51:10 +000076#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000077
78/* Endianness switches; defaults to little endian */
79
80#ifdef WORDS_BIGENDIAN
81# define BYTEORDER_IS_BIG_ENDIAN
82#else
83# define BYTEORDER_IS_LITTLE_ENDIAN
84#endif
85
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000086/* --- Globals ------------------------------------------------------------
87
88 The globals are initialized by the _PyUnicode_Init() API and should
89 not be used before calling that API.
90
91*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000092
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000093
94#ifdef __cplusplus
95extern "C" {
96#endif
97
Walter Dörwald16807132007-05-25 13:52:07 +000098/* This dictionary holds all interned unicode strings. Note that references
99 to strings in this dictionary are *not* counted in the string's ob_refcnt.
100 When the interned string reaches a refcnt of 0 the string deallocation
101 function will delete the reference from this dictionary.
102
   Another way to look at this is to say that the actual reference
   count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000105*/
106static PyObject *interned;
107
Guido van Rossumd57fd912000-03-10 22:53:23 +0000108/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000109static PyUnicodeObject *free_list;
110static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000111
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000112/* The empty Unicode object is shared to improve performance. */
113static PyUnicodeObject *unicode_empty;
114
115/* Single character Unicode strings in the Latin-1 range are being
116 shared as well. */
117static PyUnicodeObject *unicode_latin1[256];
118
Fred Drakee4315f52000-05-09 19:53:39 +0000119/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000120 parameter; it is fixed to "utf-8". Always use the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000121 PyUnicode_GetDefaultEncoding() API to access this global.
122
Alexandre Vassalotti3d2fd7f2007-10-16 00:26:33 +0000123 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000124 hard coded default!
125*/
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000126static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000127
/* Fast detection of the most frequent whitespace characters.

   The table is indexed by a code point in the ASCII range (0..127); an
   entry is 1 when that code point is treated as whitespace, 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
       0x09 HORIZONTAL TABULATION, 0x0A LINE FEED, 0x0B VERTICAL TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
       0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR,
       0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27
       0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
158
/* Fast detection of line break characters in the ASCII range.

   The table is indexed by a code point in 0..127; an entry is 1 when that
   code point is a line break, 0 otherwise.  The table is only ever read,
   so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
       0x0A LINE FEED, 0x0D CARRIAGE RETURN */
    0, 0, 1, 0, 0, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
       0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
184
185
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000186Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000187PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000188{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000189#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000190 return 0x10FFFF;
191#else
192 /* This is actually an illegal character, so it should
193 not be passed to unichr. */
194 return 0xFFFF;
195#endif
196}
197
Thomas Wouters477c8d52006-05-27 19:21:47 +0000198/* --- Bloom Filters ----------------------------------------------------- */
199
/* Support for simple "bloom filters" over Unicode characters.
   To keep things simple, a single bitmask is used, with the low 5
   bits of each Unicode character serving as the bit index. */
203
204/* the linebreak mask is set up by Unicode_Init below */
205
/* Integer type holding a bloom bitmask (one bit per value of ch & 0x1F). */
#define BLOOM_MASK unsigned long

/* Bloom mask for line break characters; consulted by BLOOM_LINEBREAK(). */
static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by the low 5 bits of `ch` is set in `mask`.
   The shifted constant is unsigned long so that selecting bit 31 is well
   defined, and `mask` is parenthesized so that an expression argument
   (e.g. a | b) binds as intended. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Non-zero if `ch` is a line break: table lookup for ch < 128, otherwise
   the bloom mask is tested first and Py_UNICODE_ISLINEBREAK() only runs
   when the mask does not rule the character out. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000215
216Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
217{
218 /* calculate simple bloom-style bitmask for a given unicode string */
219
220 long mask;
221 Py_ssize_t i;
222
223 mask = 0;
224 for (i = 0; i < len; i++)
225 mask |= (1 << (ptr[i] & 0x1F));
226
227 return mask;
228}
229
230Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
231{
232 Py_ssize_t i;
233
234 for (i = 0; i < setlen; i++)
235 if (set[i] == chr)
236 return 1;
237
238 return 0;
239}
240
/* True if `chr` is in `set` (of `setlen` characters).  The bloom `mask` is
   tested first so that the linear scan in unicode_member() is skipped when
   the mask already rules the character out.  The whole expansion is
   parenthesized so the macro can be negated or combined with other
   operators safely. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
243
Guido van Rossumd57fd912000-03-10 22:53:23 +0000244/* --- Unicode Object ----------------------------------------------------- */
245
246static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000247int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000248 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000249{
250 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000251
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000252 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000254 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000255
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000256 /* Resizing shared object (unicode_empty or single character
257 objects) in-place is not allowed. Use PyUnicode_Resize()
258 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000259
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000260 if (unicode == unicode_empty ||
261 (unicode->length == 1 &&
262 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000263 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000265 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 return -1;
267 }
268
Thomas Wouters477c8d52006-05-27 19:21:47 +0000269 /* We allocate one more byte to make sure the string is Ux0000 terminated.
270 The overallocation is also used by fastsearch, which assumes that it's
271 safe to look at str[length] (without making any assumptions about what
272 it contains). */
273
Guido van Rossumd57fd912000-03-10 22:53:23 +0000274 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000275 unicode->str = PyObject_REALLOC(unicode->str,
276 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000278 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000279 PyErr_NoMemory();
280 return -1;
281 }
282 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000283 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000285 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000287 if (unicode->defenc) {
288 Py_DECREF(unicode->defenc);
289 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290 }
291 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000292
Guido van Rossumd57fd912000-03-10 22:53:23 +0000293 return 0;
294}
295
296/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000297 Ux0000 terminated; some code (e.g. new_identifier)
298 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299
300 XXX This allocator could further be enhanced by assuring that the
301 free list never reduces its size below 1.
302
303*/
304
305static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000306PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000307{
308 register PyUnicodeObject *unicode;
309
Thomas Wouters477c8d52006-05-27 19:21:47 +0000310 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000311 if (length == 0 && unicode_empty != NULL) {
312 Py_INCREF(unicode_empty);
313 return unicode_empty;
314 }
315
316 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000317 if (free_list) {
318 unicode = free_list;
319 free_list = *(PyUnicodeObject **)unicode;
320 numfree--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000321 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000322 /* Keep-Alive optimization: we only upsize the buffer,
323 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000324 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000325 unicode_resize(unicode, length) < 0) {
Christian Heimesb186d002008-03-18 15:15:01 +0000326 PyObject_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000327 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000328 }
329 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000330 else {
Christian Heimesb186d002008-03-18 15:15:01 +0000331 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
332 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000333 }
334 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000335 }
336 else {
Christian Heimesb186d002008-03-18 15:15:01 +0000337 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000338 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000339 if (unicode == NULL)
340 return NULL;
Christian Heimesb186d002008-03-18 15:15:01 +0000341 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
342 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343 }
344
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000345 if (!unicode->str) {
346 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000347 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000348 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000349 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000350 * the caller fails before initializing str -- unicode_resize()
351 * reads str[0], and the Keep-Alive optimization can keep memory
352 * allocated for str alive across a call to unicode_dealloc(unicode).
353 * We don't want unicode_resize to read uninitialized memory in
354 * that case.
355 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000356 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000357 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000358 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000359 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000360 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000361 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000362 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000363
364 onError:
365 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000366 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000367 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000368}
369
370static
Guido van Rossum9475a232001-10-05 20:51:39 +0000371void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000372{
Walter Dörwald16807132007-05-25 13:52:07 +0000373 switch (PyUnicode_CHECK_INTERNED(unicode)) {
374 case SSTATE_NOT_INTERNED:
375 break;
376
377 case SSTATE_INTERNED_MORTAL:
378 /* revive dead object temporarily for DelItem */
Christian Heimes90aa7642007-12-19 02:45:37 +0000379 Py_REFCNT(unicode) = 3;
Walter Dörwald16807132007-05-25 13:52:07 +0000380 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
381 Py_FatalError(
382 "deletion of interned unicode string failed");
383 break;
384
385 case SSTATE_INTERNED_IMMORTAL:
386 Py_FatalError("Immortal interned unicode string died.");
387
388 default:
389 Py_FatalError("Inconsistent interned unicode string state.");
390 }
391
Guido van Rossum604ddf82001-12-06 20:03:56 +0000392 if (PyUnicode_CheckExact(unicode) &&
Christian Heimes2202f872008-02-06 14:31:34 +0000393 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000394 /* Keep-Alive optimization */
395 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Christian Heimesb186d002008-03-18 15:15:01 +0000396 PyObject_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000397 unicode->str = NULL;
398 unicode->length = 0;
399 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000400 if (unicode->defenc) {
401 Py_DECREF(unicode->defenc);
402 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000403 }
404 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000405 *(PyUnicodeObject **)unicode = free_list;
406 free_list = unicode;
407 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000408 }
409 else {
Christian Heimesb186d002008-03-18 15:15:01 +0000410 PyObject_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000411 Py_XDECREF(unicode->defenc);
Christian Heimes90aa7642007-12-19 02:45:37 +0000412 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000413 }
414}
415
Martin v. Löwis18e16552006-02-15 17:27:45 +0000416int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000417{
418 register PyUnicodeObject *v;
419
420 /* Argument checks */
421 if (unicode == NULL) {
422 PyErr_BadInternalCall();
423 return -1;
424 }
425 v = (PyUnicodeObject *)*unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000426 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000427 PyErr_BadInternalCall();
428 return -1;
429 }
430
431 /* Resizing unicode_empty and single character objects is not
432 possible since these are being shared. We simply return a fresh
433 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000434 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000435 (v == unicode_empty || v->length == 1)) {
436 PyUnicodeObject *w = _PyUnicode_New(length);
437 if (w == NULL)
438 return -1;
439 Py_UNICODE_COPY(w->str, v->str,
440 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000441 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000442 *unicode = (PyObject *)w;
443 return 0;
444 }
445
446 /* Note that we don't have to modify *unicode for unshared Unicode
447 objects, since we can modify them in-place. */
448 return unicode_resize(v, length);
449}
450
451/* Internal API for use in unicodeobject.c only ! */
452#define _PyUnicode_Resize(unicodevar, length) \
453 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
454
Guido van Rossumd57fd912000-03-10 22:53:23 +0000455PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000456 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000457{
458 PyUnicodeObject *unicode;
459
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000460 /* If the Unicode data is known at construction time, we can apply
461 some optimizations which share commonly used objects. */
462 if (u != NULL) {
463
464 /* Optimization for empty strings */
465 if (size == 0 && unicode_empty != NULL) {
466 Py_INCREF(unicode_empty);
467 return (PyObject *)unicode_empty;
468 }
469
470 /* Single character Unicode objects in the Latin-1 range are
471 shared when using this constructor */
472 if (size == 1 && *u < 256) {
473 unicode = unicode_latin1[*u];
474 if (!unicode) {
475 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000476 if (!unicode)
477 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000478 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000479 unicode_latin1[*u] = unicode;
480 }
481 Py_INCREF(unicode);
482 return (PyObject *)unicode;
483 }
484 }
Tim Petersced69f82003-09-16 20:30:58 +0000485
Guido van Rossumd57fd912000-03-10 22:53:23 +0000486 unicode = _PyUnicode_New(size);
487 if (!unicode)
488 return NULL;
489
490 /* Copy the Unicode data into the new object */
491 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000492 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000493
494 return (PyObject *)unicode;
495}
496
Walter Dörwaldd2034312007-05-18 16:29:38 +0000497PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000498{
499 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000500
501 if (size < 0) {
502 PyErr_SetString(PyExc_SystemError,
503 "Negative size passed to PyUnicode_FromStringAndSize");
504 return NULL;
505 }
506
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000507 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000508 some optimizations which share commonly used objects.
509 Also, this means the input must be UTF-8, so fall back to the
510 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000511 if (u != NULL) {
512
513 /* Optimization for empty strings */
514 if (size == 0 && unicode_empty != NULL) {
515 Py_INCREF(unicode_empty);
516 return (PyObject *)unicode_empty;
517 }
518
Martin v. Löwis9c121062007-08-05 20:26:11 +0000519 /* Single characters are shared when using this constructor.
520 Restrict to ASCII, since the input must be UTF-8. */
521 if (size == 1 && Py_CHARMASK(*u) < 128) {
Christian Heimesbbe741d2008-03-28 10:53:29 +0000522 unicode = unicode_latin1[Py_CHARMASK(*u)];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000523 if (!unicode) {
524 unicode = _PyUnicode_New(1);
525 if (!unicode)
526 return NULL;
Guido van Rossum00058aa2007-07-19 18:21:28 +0000527 unicode->str[0] = Py_CHARMASK(*u);
Christian Heimesbbe741d2008-03-28 10:53:29 +0000528 unicode_latin1[Py_CHARMASK(*u)] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000529 }
530 Py_INCREF(unicode);
531 return (PyObject *)unicode;
532 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000533
534 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000535 }
536
Walter Dörwald55507312007-05-18 13:12:10 +0000537 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000538 if (!unicode)
539 return NULL;
540
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000541 return (PyObject *)unicode;
542}
543
Walter Dörwaldd2034312007-05-18 16:29:38 +0000544PyObject *PyUnicode_FromString(const char *u)
545{
546 size_t size = strlen(u);
547 if (size > PY_SSIZE_T_MAX) {
548 PyErr_SetString(PyExc_OverflowError, "input too long");
549 return NULL;
550 }
551
552 return PyUnicode_FromStringAndSize(u, size);
553}
554
Guido van Rossumd57fd912000-03-10 22:53:23 +0000555#ifdef HAVE_WCHAR_H
556
557PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000558 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000559{
560 PyUnicodeObject *unicode;
561
562 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000563 if (size == 0)
564 return PyUnicode_FromStringAndSize(NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000565 PyErr_BadInternalCall();
566 return NULL;
567 }
568
Martin v. Löwis790465f2008-04-05 20:41:37 +0000569 if (size == -1) {
570 size = wcslen(w);
571 }
572
Guido van Rossumd57fd912000-03-10 22:53:23 +0000573 unicode = _PyUnicode_New(size);
574 if (!unicode)
575 return NULL;
576
577 /* Copy the wchar_t data into the new object */
578#ifdef HAVE_USABLE_WCHAR_T
579 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000580#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000581 {
582 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000583 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000584 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000585 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000586 *u++ = *w++;
587 }
588#endif
589
590 return (PyObject *)unicode;
591}
592
Walter Dörwald346737f2007-05-31 10:44:43 +0000593static void
594makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
595{
596 *fmt++ = '%';
597 if (width) {
598 if (zeropad)
599 *fmt++ = '0';
600 fmt += sprintf(fmt, "%d", width);
601 }
602 if (precision)
603 fmt += sprintf(fmt, ".%d", precision);
604 if (longflag)
605 *fmt++ = 'l';
606 else if (size_tflag) {
607 char *f = PY_FORMAT_SIZE_T;
608 while (*f)
609 *fmt++ = *f++;
610 }
611 *fmt++ = c;
612 *fmt = '\0';
613}
614
/* Append the NUL-terminated string `string` to the output position `s`,
   advancing `s` past the copied characters (the terminator is not
   copied).  Relies on the variables `s` and `copy` of the enclosing
   function.  Wrapped in do { } while (0) so that "appendstring(x);"
   behaves as a single statement, e.g. in an unbraced if/else. */
#define appendstring(string) \
    do { for (copy = (string); *copy; ) *s++ = *copy++; } while (0)
616
617PyObject *
618PyUnicode_FromFormatV(const char *format, va_list vargs)
619{
620 va_list count;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000621 Py_ssize_t callcount = 0;
622 PyObject **callresults = NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +0000623 PyObject **callresult = NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000624 Py_ssize_t n = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000625 int width = 0;
626 int precision = 0;
627 int zeropad;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000628 const char* f;
629 Py_UNICODE *s;
630 PyObject *string;
631 /* used by sprintf */
632 char buffer[21];
Walter Dörwald346737f2007-05-31 10:44:43 +0000633 /* use abuffer instead of buffer, if we need more space
634 * (which can happen if there's a format specifier with width). */
635 char *abuffer = NULL;
636 char *realbuffer;
637 Py_ssize_t abuffersize = 0;
638 char fmt[60]; /* should be enough for %0width.precisionld */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000639 const char *copy;
640
641#ifdef VA_LIST_IS_ARRAY
642 Py_MEMCPY(count, vargs, sizeof(va_list));
643#else
644#ifdef __va_copy
645 __va_copy(count, vargs);
646#else
647 count = vargs;
648#endif
649#endif
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000650 /* step 1: count the number of %S/%R format specifications
Thomas Heller519a0422007-11-15 20:48:54 +0000651 * (we call PyObject_Str()/PyObject_Repr() for these objects
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000652 * once during step 3 and put the result in an array) */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000653 for (f = format; *f; f++) {
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000654 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000655 ++callcount;
656 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000657 /* step 2: allocate memory for the results of
Thomas Heller519a0422007-11-15 20:48:54 +0000658 * PyObject_Str()/PyObject_Repr() calls */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000659 if (callcount) {
Christian Heimesb186d002008-03-18 15:15:01 +0000660 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000661 if (!callresults) {
662 PyErr_NoMemory();
663 return NULL;
664 }
665 callresult = callresults;
666 }
667 /* step 3: figure out how large a buffer we need */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000668 for (f = format; *f; f++) {
669 if (*f == '%') {
670 const char* p = f;
Walter Dörwald346737f2007-05-31 10:44:43 +0000671 width = 0;
Christian Heimesfe337bf2008-03-23 21:54:12 +0000672 while (ISDIGIT((unsigned)*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000673 width = (width*10) + *f++ - '0';
Christian Heimesfe337bf2008-03-23 21:54:12 +0000674 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
Walter Dörwaldd2034312007-05-18 16:29:38 +0000675 ;
676
677 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
678 * they don't affect the amount of space we reserve.
679 */
680 if ((*f == 'l' || *f == 'z') &&
681 (f[1] == 'd' || f[1] == 'u'))
Eric Smithddd25822007-08-27 11:33:42 +0000682 ++f;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000683
684 switch (*f) {
685 case 'c':
686 (void)va_arg(count, int);
687 /* fall through... */
688 case '%':
689 n++;
690 break;
691 case 'd': case 'u': case 'i': case 'x':
692 (void) va_arg(count, int);
693 /* 20 bytes is enough to hold a 64-bit
694 integer. Decimal takes the most space.
Walter Dörwald346737f2007-05-31 10:44:43 +0000695 This isn't enough for octal.
696 If a width is specified we need more
697 (which we allocate later). */
698 if (width < 20)
699 width = 20;
700 n += width;
701 if (abuffersize < width)
702 abuffersize = width;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000703 break;
704 case 's':
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000705 {
706 /* UTF-8 */
707 unsigned char*s;
708 s = va_arg(count, unsigned char*);
709 while (*s) {
710 if (*s < 128) {
711 n++; s++;
712 } else if (*s < 0xc0) {
713 /* invalid UTF-8 */
714 n++; s++;
715 } else if (*s < 0xc0) {
716 n++;
717 s++; if(!*s)break;
718 s++;
719 } else if (*s < 0xe0) {
720 n++;
721 s++; if(!*s)break;
722 s++; if(!*s)break;
723 s++;
724 } else {
725 #ifdef Py_UNICODE_WIDE
726 n++;
727 #else
728 n+=2;
729 #endif
730 s++; if(!*s)break;
731 s++; if(!*s)break;
732 s++; if(!*s)break;
733 s++;
734 }
735 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000736 break;
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000737 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000738 case 'U':
739 {
740 PyObject *obj = va_arg(count, PyObject *);
741 assert(obj && PyUnicode_Check(obj));
742 n += PyUnicode_GET_SIZE(obj);
743 break;
744 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000745 case 'V':
746 {
747 PyObject *obj = va_arg(count, PyObject *);
748 const char *str = va_arg(count, const char *);
749 assert(obj || str);
750 assert(!obj || PyUnicode_Check(obj));
751 if (obj)
752 n += PyUnicode_GET_SIZE(obj);
753 else
754 n += strlen(str);
755 break;
756 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000757 case 'S':
758 {
759 PyObject *obj = va_arg(count, PyObject *);
760 PyObject *str;
761 assert(obj);
Thomas Heller519a0422007-11-15 20:48:54 +0000762 str = PyObject_Str(obj);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000763 if (!str)
764 goto fail;
765 n += PyUnicode_GET_SIZE(str);
766 /* Remember the str and switch to the next slot */
767 *callresult++ = str;
768 break;
769 }
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000770 case 'R':
771 {
772 PyObject *obj = va_arg(count, PyObject *);
773 PyObject *repr;
774 assert(obj);
775 repr = PyObject_Repr(obj);
776 if (!repr)
777 goto fail;
778 n += PyUnicode_GET_SIZE(repr);
779 /* Remember the repr and switch to the next slot */
780 *callresult++ = repr;
781 break;
782 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000783 case 'p':
784 (void) va_arg(count, int);
785 /* maximum 64-bit pointer representation:
786 * 0xffffffffffffffff
787 * so 19 characters is enough.
788 * XXX I count 18 -- what's the extra for?
789 */
790 n += 19;
791 break;
792 default:
793 /* if we stumble upon an unknown
794 formatting code, copy the rest of
795 the format string to the output
796 string. (we cannot just skip the
797 code, since there's no way to know
798 what's in the argument list) */
799 n += strlen(p);
800 goto expand;
801 }
802 } else
803 n++;
804 }
805 expand:
Walter Dörwald346737f2007-05-31 10:44:43 +0000806 if (abuffersize > 20) {
Christian Heimesb186d002008-03-18 15:15:01 +0000807 abuffer = PyObject_Malloc(abuffersize);
Walter Dörwald346737f2007-05-31 10:44:43 +0000808 if (!abuffer) {
809 PyErr_NoMemory();
810 goto fail;
811 }
812 realbuffer = abuffer;
813 }
814 else
815 realbuffer = buffer;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000816 /* step 4: fill the buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000817 /* Since we've analyzed how much space we need for the worst case,
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000818 we don't have to resize the string.
819 There can be no errors beyond this point. */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000820 string = PyUnicode_FromUnicode(NULL, n);
821 if (!string)
Walter Dörwald346737f2007-05-31 10:44:43 +0000822 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000823
824 s = PyUnicode_AS_UNICODE(string);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000825 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000826
827 for (f = format; *f; f++) {
828 if (*f == '%') {
829 const char* p = f++;
830 int longflag = 0;
831 int size_tflag = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000832 zeropad = (*f == '0');
833 /* parse the width.precision part */
834 width = 0;
Christian Heimesfe337bf2008-03-23 21:54:12 +0000835 while (ISDIGIT((unsigned)*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000836 width = (width*10) + *f++ - '0';
837 precision = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000838 if (*f == '.') {
839 f++;
Christian Heimesfe337bf2008-03-23 21:54:12 +0000840 while (ISDIGIT((unsigned)*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000841 precision = (precision*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000842 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000843 /* handle the long flag, but only for %ld and %lu.
844 others can be added when necessary. */
845 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
846 longflag = 1;
847 ++f;
848 }
849 /* handle the size_t flag. */
850 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
851 size_tflag = 1;
852 ++f;
853 }
854
855 switch (*f) {
856 case 'c':
857 *s++ = va_arg(vargs, int);
858 break;
859 case 'd':
Walter Dörwald346737f2007-05-31 10:44:43 +0000860 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000861 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000862 sprintf(realbuffer, fmt, va_arg(vargs, long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000863 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000864 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000865 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000866 sprintf(realbuffer, fmt, va_arg(vargs, int));
867 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000868 break;
869 case 'u':
Walter Dörwald346737f2007-05-31 10:44:43 +0000870 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000871 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000872 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000873 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000874 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000875 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000876 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
877 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000878 break;
879 case 'i':
Walter Dörwald346737f2007-05-31 10:44:43 +0000880 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
881 sprintf(realbuffer, fmt, va_arg(vargs, int));
882 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000883 break;
884 case 'x':
Walter Dörwald346737f2007-05-31 10:44:43 +0000885 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
886 sprintf(realbuffer, fmt, va_arg(vargs, int));
887 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000888 break;
889 case 's':
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000890 {
891 /* Parameter must be UTF-8 encoded.
892 In case of encoding errors, use
893 the replacement character. */
894 PyObject *u;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000895 p = va_arg(vargs, char*);
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000896 u = PyUnicode_DecodeUTF8(p, strlen(p),
897 "replace");
898 if (!u)
899 goto fail;
900 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
901 PyUnicode_GET_SIZE(u));
902 s += PyUnicode_GET_SIZE(u);
903 Py_DECREF(u);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000904 break;
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000905 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000906 case 'U':
907 {
908 PyObject *obj = va_arg(vargs, PyObject *);
Walter Dörwald5c2fab62007-05-24 19:51:02 +0000909 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
910 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
911 s += size;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000912 break;
913 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000914 case 'V':
915 {
916 PyObject *obj = va_arg(vargs, PyObject *);
917 const char *str = va_arg(vargs, const char *);
918 if (obj) {
919 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
920 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
921 s += size;
922 } else {
923 appendstring(str);
924 }
925 break;
926 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000927 case 'S':
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000928 case 'R':
929 {
Guido van Rossum755114a2007-06-13 01:04:27 +0000930 Py_UNICODE *ucopy;
931 Py_ssize_t usize;
932 Py_ssize_t upos;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000933 /* unused, since we already have the result */
934 (void) va_arg(vargs, PyObject *);
Guido van Rossum755114a2007-06-13 01:04:27 +0000935 ucopy = PyUnicode_AS_UNICODE(*callresult);
936 usize = PyUnicode_GET_SIZE(*callresult);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000937 for (upos = 0; upos<usize;)
938 *s++ = ucopy[upos++];
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000939 /* We're done with the unicode()/repr() => forget it */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000940 Py_DECREF(*callresult);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000941 /* switch to next unicode()/repr() result */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000942 ++callresult;
943 break;
944 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000945 case 'p':
946 sprintf(buffer, "%p", va_arg(vargs, void*));
947 /* %p is ill-defined: ensure leading 0x. */
948 if (buffer[1] == 'X')
949 buffer[1] = 'x';
950 else if (buffer[1] != 'x') {
951 memmove(buffer+2, buffer, strlen(buffer)+1);
952 buffer[0] = '0';
953 buffer[1] = 'x';
954 }
955 appendstring(buffer);
956 break;
957 case '%':
958 *s++ = '%';
959 break;
960 default:
961 appendstring(p);
962 goto end;
963 }
964 } else
965 *s++ = *f;
966 }
967
968 end:
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000969 if (callresults)
Christian Heimesb186d002008-03-18 15:15:01 +0000970 PyObject_Free(callresults);
Walter Dörwald346737f2007-05-31 10:44:43 +0000971 if (abuffer)
Christian Heimesb186d002008-03-18 15:15:01 +0000972 PyObject_Free(abuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000973 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
974 return string;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000975 fail:
976 if (callresults) {
977 PyObject **callresult2 = callresults;
Guido van Rossum307fa8c2007-07-16 20:46:27 +0000978 while (callresult2 < callresult) {
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000979 Py_DECREF(*callresult2);
980 ++callresult2;
981 }
Christian Heimesb186d002008-03-18 15:15:01 +0000982 PyObject_Free(callresults);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000983 }
Walter Dörwald346737f2007-05-31 10:44:43 +0000984 if (abuffer)
Christian Heimesb186d002008-03-18 15:15:01 +0000985 PyObject_Free(abuffer);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000986 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000987}
988
989#undef appendstring
990
991PyObject *
992PyUnicode_FromFormat(const char *format, ...)
993{
994 PyObject* ret;
995 va_list vargs;
996
997#ifdef HAVE_STDARG_PROTOTYPES
998 va_start(vargs, format);
999#else
1000 va_start(vargs);
1001#endif
1002 ret = PyUnicode_FromFormatV(format, vargs);
1003 va_end(vargs);
1004 return ret;
1005}
1006
Martin v. Löwis18e16552006-02-15 17:27:45 +00001007Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
1008 wchar_t *w,
1009 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001010{
1011 if (unicode == NULL) {
1012 PyErr_BadInternalCall();
1013 return -1;
1014 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001015
1016 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001017 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001018 size = PyUnicode_GET_SIZE(unicode) + 1;
1019
Guido van Rossumd57fd912000-03-10 22:53:23 +00001020#ifdef HAVE_USABLE_WCHAR_T
1021 memcpy(w, unicode->str, size * sizeof(wchar_t));
1022#else
1023 {
1024 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001025 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001026 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +00001027 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001028 *w++ = *u++;
1029 }
1030#endif
1031
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001032 if (size > PyUnicode_GET_SIZE(unicode))
1033 return PyUnicode_GET_SIZE(unicode);
1034 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00001035 return size;
1036}
1037
1038#endif
1039
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001040PyObject *PyUnicode_FromOrdinal(int ordinal)
1041{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001042 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001043
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001044 if (ordinal < 0 || ordinal > 0x10ffff) {
1045 PyErr_SetString(PyExc_ValueError,
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001046 "chr() arg not in range(0x110000)");
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001047 return NULL;
1048 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001049
1050#ifndef Py_UNICODE_WIDE
1051 if (ordinal > 0xffff) {
1052 ordinal -= 0x10000;
1053 s[0] = 0xD800 | (ordinal >> 10);
1054 s[1] = 0xDC00 | (ordinal & 0x3FF);
1055 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001056 }
1057#endif
1058
Hye-Shik Chang40574832004-04-06 07:24:51 +00001059 s[0] = (Py_UNICODE)ordinal;
1060 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001061}
1062
Guido van Rossumd57fd912000-03-10 22:53:23 +00001063PyObject *PyUnicode_FromObject(register PyObject *obj)
1064{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001065 /* XXX Perhaps we should make this API an alias of
Thomas Heller519a0422007-11-15 20:48:54 +00001066 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001067 if (PyUnicode_CheckExact(obj)) {
1068 Py_INCREF(obj);
1069 return obj;
1070 }
1071 if (PyUnicode_Check(obj)) {
1072 /* For a Unicode subtype that's not a Unicode object,
1073 return a true Unicode object with the same data. */
1074 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1075 PyUnicode_GET_SIZE(obj));
1076 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001077 PyErr_Format(PyExc_TypeError,
1078 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001079 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001080 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001081}
1082
1083PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1084 const char *encoding,
1085 const char *errors)
1086{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001087 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001088 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001089 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001090
Guido van Rossumd57fd912000-03-10 22:53:23 +00001091 if (obj == NULL) {
1092 PyErr_BadInternalCall();
1093 return NULL;
1094 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001095
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001096 if (PyUnicode_Check(obj)) {
1097 PyErr_SetString(PyExc_TypeError,
1098 "decoding Unicode is not supported");
1099 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001100 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001101
1102 /* Coerce object */
1103 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001104 s = PyString_AS_STRING(obj);
1105 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001106 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001107 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1108 /* Overwrite the error message with something more useful in
1109 case of a TypeError. */
1110 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001111 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001112 "coercing to Unicode: need string or buffer, "
1113 "%.80s found",
Christian Heimes90aa7642007-12-19 02:45:37 +00001114 Py_TYPE(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001115 goto onError;
1116 }
Tim Petersced69f82003-09-16 20:30:58 +00001117
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001118 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001119 if (len == 0) {
1120 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001121 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001122 }
Tim Petersced69f82003-09-16 20:30:58 +00001123 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001124 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001125
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001126 return v;
1127
1128 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001129 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001130}
1131
1132PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001133 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001134 const char *encoding,
1135 const char *errors)
1136{
1137 PyObject *buffer = NULL, *unicode;
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001138 Py_buffer info;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001139 char lower[20]; /* Enough for any encoding name we recognize */
1140 char *l;
1141 const char *e;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001142
1143 if (encoding == NULL)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001144 encoding = PyUnicode_GetDefaultEncoding();
1145
1146 /* Convert encoding to lower case and replace '_' with '-' in order to
1147 catch e.g. UTF_8 */
1148 e = encoding;
1149 l = lower;
1150 while (*e && l < &lower[(sizeof lower) - 2]) {
1151 if (ISUPPER(*e)) {
1152 *l++ = TOLOWER(*e++);
1153 }
1154 else if (*e == '_') {
1155 *l++ = '-';
1156 e++;
1157 }
1158 else {
1159 *l++ = *e++;
1160 }
1161 }
1162 *l = '\0';
Fred Drakee4315f52000-05-09 19:53:39 +00001163
1164 /* Shortcuts for common default encodings */
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001165 if (strcmp(lower, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001166 return PyUnicode_DecodeUTF8(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001167 else if ((strcmp(lower, "latin-1") == 0) ||
1168 (strcmp(lower, "iso-8859-1") == 0))
Fred Drakee4315f52000-05-09 19:53:39 +00001169 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001170#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001171 else if (strcmp(lower, "mbcs") == 0)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001172 return PyUnicode_DecodeMBCS(s, size, errors);
1173#endif
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001174 else if (strcmp(lower, "ascii") == 0)
Fred Drakee4315f52000-05-09 19:53:39 +00001175 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001176 else if (strcmp(lower, "utf-16") == 0)
1177 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1178 else if (strcmp(lower, "utf-32") == 0)
1179 return PyUnicode_DecodeUTF32(s, size, errors, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001180
1181 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001182 buffer = NULL;
1183 if (PyBuffer_FillInfo(&info, (void *)s, size, 1, PyBUF_SIMPLE) < 0)
1184 goto onError;
1185 buffer = PyMemoryView_FromMemory(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001186 if (buffer == NULL)
1187 goto onError;
1188 unicode = PyCodec_Decode(buffer, encoding, errors);
1189 if (unicode == NULL)
1190 goto onError;
1191 if (!PyUnicode_Check(unicode)) {
1192 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001193 "decoder did not return an unicode object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001194 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001195 Py_DECREF(unicode);
1196 goto onError;
1197 }
1198 Py_DECREF(buffer);
1199 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001200
Guido van Rossumd57fd912000-03-10 22:53:23 +00001201 onError:
1202 Py_XDECREF(buffer);
1203 return NULL;
1204}
1205
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001206PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1207 const char *encoding,
1208 const char *errors)
1209{
1210 PyObject *v;
1211
1212 if (!PyUnicode_Check(unicode)) {
1213 PyErr_BadArgument();
1214 goto onError;
1215 }
1216
1217 if (encoding == NULL)
1218 encoding = PyUnicode_GetDefaultEncoding();
1219
1220 /* Decode via the codec registry */
1221 v = PyCodec_Decode(unicode, encoding, errors);
1222 if (v == NULL)
1223 goto onError;
1224 return v;
1225
1226 onError:
1227 return NULL;
1228}
1229
Guido van Rossumd57fd912000-03-10 22:53:23 +00001230PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001231 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001232 const char *encoding,
1233 const char *errors)
1234{
1235 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001236
Guido van Rossumd57fd912000-03-10 22:53:23 +00001237 unicode = PyUnicode_FromUnicode(s, size);
1238 if (unicode == NULL)
1239 return NULL;
1240 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1241 Py_DECREF(unicode);
1242 return v;
1243}
1244
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001245PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1246 const char *encoding,
1247 const char *errors)
1248{
1249 PyObject *v;
1250
1251 if (!PyUnicode_Check(unicode)) {
1252 PyErr_BadArgument();
1253 goto onError;
1254 }
1255
1256 if (encoding == NULL)
1257 encoding = PyUnicode_GetDefaultEncoding();
1258
1259 /* Encode via the codec registry */
1260 v = PyCodec_Encode(unicode, encoding, errors);
1261 if (v == NULL)
1262 goto onError;
1263 return v;
1264
1265 onError:
1266 return NULL;
1267}
1268
Guido van Rossumd57fd912000-03-10 22:53:23 +00001269PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1270 const char *encoding,
1271 const char *errors)
1272{
1273 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001274
Guido van Rossumd57fd912000-03-10 22:53:23 +00001275 if (!PyUnicode_Check(unicode)) {
1276 PyErr_BadArgument();
1277 goto onError;
1278 }
Fred Drakee4315f52000-05-09 19:53:39 +00001279
Tim Petersced69f82003-09-16 20:30:58 +00001280 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001281 encoding = PyUnicode_GetDefaultEncoding();
1282
1283 /* Shortcuts for common default encodings */
1284 if (errors == NULL) {
1285 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001286 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001287 else if (strcmp(encoding, "latin-1") == 0)
1288 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001289#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1290 else if (strcmp(encoding, "mbcs") == 0)
1291 return PyUnicode_AsMBCSString(unicode);
1292#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001293 else if (strcmp(encoding, "ascii") == 0)
1294 return PyUnicode_AsASCIIString(unicode);
1295 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001296
1297 /* Encode via the codec registry */
1298 v = PyCodec_Encode(unicode, encoding, errors);
1299 if (v == NULL)
1300 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00001301 assert(PyString_Check(v));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001302 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001303
Guido van Rossumd57fd912000-03-10 22:53:23 +00001304 onError:
1305 return NULL;
1306}
1307
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001308PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1309 const char *errors)
1310{
1311 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001312 if (v)
1313 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001314 if (errors != NULL)
1315 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001316 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001317 PyUnicode_GET_SIZE(unicode),
1318 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001319 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001320 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001321 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001322 return v;
1323}
1324
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001325PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001326PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001327 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001328 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1329}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001330
Christian Heimes5894ba72007-11-04 11:43:14 +00001331PyObject*
1332PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1333{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001334 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1335 can be undefined. If it is case, decode using UTF-8. The following assumes
1336 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1337 bootstrapping process where the codecs aren't ready yet.
1338 */
1339 if (Py_FileSystemDefaultEncoding) {
1340#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001341 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001342 return PyUnicode_DecodeMBCS(s, size, "replace");
1343 }
1344#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001345 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001346 return PyUnicode_DecodeUTF8(s, size, "replace");
1347 }
1348#endif
1349 return PyUnicode_Decode(s, size,
1350 Py_FileSystemDefaultEncoding,
1351 "replace");
1352 }
1353 else {
1354 return PyUnicode_DecodeUTF8(s, size, "replace");
1355 }
1356}
1357
Martin v. Löwis5b222132007-06-10 09:51:05 +00001358char*
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001359PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001360{
Christian Heimesf3863112007-11-22 07:46:41 +00001361 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001362 if (!PyUnicode_Check(unicode)) {
1363 PyErr_BadArgument();
1364 return NULL;
1365 }
Christian Heimesf3863112007-11-22 07:46:41 +00001366 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1367 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001368 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001369 if (psize != NULL)
Christian Heimesf3863112007-11-22 07:46:41 +00001370 *psize = PyString_GET_SIZE(bytes);
1371 return PyString_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001372}
1373
1374char*
1375PyUnicode_AsString(PyObject *unicode)
1376{
1377 return PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001378}
1379
Guido van Rossumd57fd912000-03-10 22:53:23 +00001380Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1381{
1382 if (!PyUnicode_Check(unicode)) {
1383 PyErr_BadArgument();
1384 goto onError;
1385 }
1386 return PyUnicode_AS_UNICODE(unicode);
1387
1388 onError:
1389 return NULL;
1390}
1391
Martin v. Löwis18e16552006-02-15 17:27:45 +00001392Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001393{
1394 if (!PyUnicode_Check(unicode)) {
1395 PyErr_BadArgument();
1396 goto onError;
1397 }
1398 return PyUnicode_GET_SIZE(unicode);
1399
1400 onError:
1401 return -1;
1402}
1403
Thomas Wouters78890102000-07-22 19:25:51 +00001404const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001405{
1406 return unicode_default_encoding;
1407}
1408
1409int PyUnicode_SetDefaultEncoding(const char *encoding)
1410{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001411 if (strcmp(encoding, unicode_default_encoding) != 0) {
1412 PyErr_Format(PyExc_ValueError,
1413 "Can only set default encoding to %s",
1414 unicode_default_encoding);
1415 return -1;
1416 }
Fred Drakee4315f52000-05-09 19:53:39 +00001417 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001418}
1419
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001420/* error handling callback helper:
1421 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001422 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001423 and adjust various state variables.
1424 return 0 on success, -1 on error
1425*/
1426
1427static
1428int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1429 const char *encoding, const char *reason,
Walter Dörwalda651d3d2007-08-30 15:29:21 +00001430 const char **input, const char **inend, Py_ssize_t *startinpos,
1431 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001432 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001433{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001434 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001435
1436 PyObject *restuple = NULL;
1437 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001438 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001439 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001440 Py_ssize_t requiredsize;
1441 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001442 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001443 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001444 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001445 int res = -1;
1446
1447 if (*errorHandler == NULL) {
1448 *errorHandler = PyCodec_LookupError(errors);
1449 if (*errorHandler == NULL)
1450 goto onError;
1451 }
1452
1453 if (*exceptionObject == NULL) {
1454 *exceptionObject = PyUnicodeDecodeError_Create(
Walter Dörwalde78178e2007-07-30 13:31:40 +00001455 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001456 if (*exceptionObject == NULL)
1457 goto onError;
1458 }
1459 else {
1460 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1461 goto onError;
1462 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1463 goto onError;
1464 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1465 goto onError;
1466 }
1467
1468 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1469 if (restuple == NULL)
1470 goto onError;
1471 if (!PyTuple_Check(restuple)) {
1472 PyErr_Format(PyExc_TypeError, &argparse[4]);
1473 goto onError;
1474 }
1475 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1476 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001477
1478 /* Copy back the bytes variables, which might have been modified by the
1479 callback */
1480 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1481 if (!inputobj)
1482 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00001483 if (!PyString_Check(inputobj)) {
Walter Dörwalde78178e2007-07-30 13:31:40 +00001484 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
1485 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001486 *input = PyString_AS_STRING(inputobj);
1487 insize = PyString_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001488 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001489 /* we can DECREF safely, as the exception has another reference,
1490 so the object won't go away. */
1491 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001492
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001493 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001494 newpos = insize+newpos;
1495 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001496 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001497 goto onError;
1498 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001499
1500 /* need more space? (at least enough for what we
1501 have+the replacement+the rest of the string (starting
1502 at the new input position), so we won't have to check space
1503 when there are no errors in the rest of the string) */
1504 repptr = PyUnicode_AS_UNICODE(repunicode);
1505 repsize = PyUnicode_GET_SIZE(repunicode);
1506 requiredsize = *outpos + repsize + insize-newpos;
1507 if (requiredsize > outsize) {
1508 if (requiredsize<2*outsize)
1509 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001510 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001511 goto onError;
1512 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1513 }
1514 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001515 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001516 Py_UNICODE_COPY(*outptr, repptr, repsize);
1517 *outptr += repsize;
1518 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001519
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001520 /* we made it! */
1521 res = 0;
1522
1523 onError:
1524 Py_XDECREF(restuple);
1525 return res;
1526}
1527
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001528/* --- UTF-7 Codec -------------------------------------------------------- */
1529
1530/* see RFC2152 for details */
1531
/* Classification of the 128 ASCII code points for the UTF-7 codec
   (see RFC 2152).  Each entry indicates whether the character is special,
   i.e. cannot be directly encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional)
   The table is indexed by the character value and is only read by the
   codec code visible here, hence the const qualifier. */
static const char utf7_special[128] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
1550
/* Note: The comparison (c) <= 0 is a trick to work around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

/* True if character c has to be written inside a base64 shift sequence:
   it is outside 1..127, is marked 1 in utf7_special, or is marked
   2 (whitespace) / 3 (Set O) while the matching encode flag is set. */
#define SPECIAL(c, encodeO, encodeWS)                           \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 ||         \
     ((encodeWS) && (utf7_special[(c)] == 2)) ||                \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* Map the low 6 bits of n to the corresponding base64 character. */
#define B64(n)                                                  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c is a character of the base64 alphabet. */
#define B64CHAR(c)                                              \
    (ISALNUM(c) || (c) == '+' || (c) == '/')

/* Map a base64 character back to its 6-bit value.  The argument is
   evaluated several times and must already satisfy B64CHAR(). */
#define UB64(c)                                                 \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?           \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* Emit base64 characters to *out for every complete group of 6 bits held
   in the bit accumulator ch; bits is the number of valid bits and is
   decremented accordingly.  Wrapped in do/while(0) so that the macro
   behaves like a single statement (e.g. inside an unbraced if/else). */
#define ENCODE(out, ch, bits)                                   \
    do {                                                        \
        while ((bits) >= 6) {                                   \
            *(out)++ = B64((ch) >> ((bits) - 6));               \
            (bits) -= 6;                                        \
        }                                                       \
    } while (0)

/* Extract every complete 16-bit unit from the bit accumulator ch and
   store it to *out.  Relies on the expansion site providing a variable
   `errmsg` and a label `utf7Error`.
   NOTE(review): the range tested below (0xDC00-0xDFFF) is the low
   surrogate range although the comments talk about the high surrogate;
   behaviour is kept as is - confirm the intent. */
#define DECODE(out, ch, bits, surrogate)                                \
    do {                                                                \
        while ((bits) >= 16) {                                          \
            Py_UNICODE outCh =                                          \
                (Py_UNICODE) (((ch) >> ((bits) - 16)) & 0xffff);        \
            (bits) -= 16;                                               \
            if (surrogate) {                                            \
                /* We have already generated an error for the high      \
                   surrogate so let's not bother seeing if the low      \
                   surrogate is correct or not */                       \
                (surrogate) = 0;                                        \
            } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {            \
                /* This is a surrogate pair. Unfortunately we can't     \
                   represent it in a 16-bit character */                \
                (surrogate) = 1;                                        \
                errmsg = "code pairs are not supported";                \
                goto utf7Error;                                         \
            } else {                                                    \
                *(out)++ = outCh;                                       \
            }                                                           \
        }                                                               \
    } while (0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001593
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001594PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001595 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001596 const char *errors)
1597{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001598 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1599}
1600
1601PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
1602 Py_ssize_t size,
1603 const char *errors,
1604 Py_ssize_t *consumed)
1605{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001606 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001607 Py_ssize_t startinpos;
1608 Py_ssize_t endinpos;
1609 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001610 const char *e;
1611 PyUnicodeObject *unicode;
1612 Py_UNICODE *p;
1613 const char *errmsg = "";
1614 int inShift = 0;
1615 unsigned int bitsleft = 0;
1616 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001617 int surrogate = 0;
1618 PyObject *errorHandler = NULL;
1619 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001620
1621 unicode = _PyUnicode_New(size);
1622 if (!unicode)
1623 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001624 if (size == 0) {
1625 if (consumed)
1626 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001627 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001628 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001629
1630 p = unicode->str;
1631 e = s + size;
1632
1633 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001634 Py_UNICODE ch;
1635 restart:
1636 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001637
1638 if (inShift) {
1639 if ((ch == '-') || !B64CHAR(ch)) {
1640 inShift = 0;
1641 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001642
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001643 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1644 if (bitsleft >= 6) {
1645 /* The shift sequence has a partial character in it. If
1646 bitsleft < 6 then we could just classify it as padding
1647 but that is not the case here */
1648
1649 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001650 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001651 }
1652 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001653 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001654 here so indicate the potential of a misencoded character. */
1655
1656 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1657 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1658 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001659 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001660 }
1661
1662 if (ch == '-') {
1663 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001664 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001665 inShift = 1;
1666 }
1667 } else if (SPECIAL(ch,0,0)) {
1668 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001669 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001670 } else {
1671 *p++ = ch;
1672 }
1673 } else {
1674 charsleft = (charsleft << 6) | UB64(ch);
1675 bitsleft += 6;
1676 s++;
1677 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1678 }
1679 }
1680 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001681 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001682 s++;
1683 if (s < e && *s == '-') {
1684 s++;
1685 *p++ = '+';
1686 } else
1687 {
1688 inShift = 1;
1689 bitsleft = 0;
1690 }
1691 }
1692 else if (SPECIAL(ch,0,0)) {
Walter Dörwald2b65c752007-08-30 15:35:26 +00001693 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001694 errmsg = "unexpected special character";
1695 s++;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001696 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001697 }
1698 else {
1699 *p++ = ch;
1700 s++;
1701 }
1702 continue;
1703 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001704 outpos = p-PyUnicode_AS_UNICODE(unicode);
1705 endinpos = s-starts;
1706 if (unicode_decode_call_errorhandler(
1707 errors, &errorHandler,
1708 "utf7", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001709 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001710 (PyObject **)&unicode, &outpos, &p))
1711 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001712 }
1713
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001714 if (inShift && !consumed) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001715 outpos = p-PyUnicode_AS_UNICODE(unicode);
1716 endinpos = size;
1717 if (unicode_decode_call_errorhandler(
1718 errors, &errorHandler,
1719 "utf7", "unterminated shift sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00001720 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001721 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001722 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001723 if (s < e)
1724 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001725 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001726 if (consumed) {
1727 if(inShift)
1728 *consumed = startinpos;
1729 else
1730 *consumed = s-starts;
1731 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001732
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001733 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001734 goto onError;
1735
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001736 Py_XDECREF(errorHandler);
1737 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001738 return (PyObject *)unicode;
1739
1740onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001741 Py_XDECREF(errorHandler);
1742 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001743 Py_DECREF(unicode);
1744 return NULL;
1745}
1746
1747
1748PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001749 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001750 int encodeSetO,
1751 int encodeWhiteSpace,
1752 const char *errors)
1753{
Guido van Rossum98297ee2007-11-06 21:34:58 +00001754 PyObject *v, *result;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001755 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001756 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001757 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001758 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001759 unsigned int bitsleft = 0;
1760 unsigned long charsleft = 0;
1761 char * out;
1762 char * start;
1763
1764 if (size == 0)
Christian Heimesf3863112007-11-22 07:46:41 +00001765 return PyString_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001766
Walter Dörwald51ab4142007-05-05 14:43:36 +00001767 v = PyBytes_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001768 if (v == NULL)
1769 return NULL;
1770
Walter Dörwald51ab4142007-05-05 14:43:36 +00001771 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001772 for (;i < size; ++i) {
1773 Py_UNICODE ch = s[i];
1774
1775 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001776 if (ch == '+') {
1777 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001778 *out++ = '-';
1779 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1780 charsleft = ch;
1781 bitsleft = 16;
1782 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001783 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001784 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001785 } else {
1786 *out++ = (char) ch;
1787 }
1788 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001789 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1790 *out++ = B64(charsleft << (6-bitsleft));
1791 charsleft = 0;
1792 bitsleft = 0;
1793 /* Characters not in the BASE64 set implicitly unshift the sequence
1794 so no '-' is required, except if the character is itself a '-' */
1795 if (B64CHAR(ch) || ch == '-') {
1796 *out++ = '-';
1797 }
1798 inShift = 0;
1799 *out++ = (char) ch;
1800 } else {
1801 bitsleft += 16;
1802 charsleft = (charsleft << 16) | ch;
1803 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1804
1805 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001806 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001807 or '-' then the shift sequence will be terminated implicitly and we
1808 don't have to insert a '-'. */
1809
1810 if (bitsleft == 0) {
1811 if (i + 1 < size) {
1812 Py_UNICODE ch2 = s[i+1];
1813
1814 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001815
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001816 } else if (B64CHAR(ch2) || ch2 == '-') {
1817 *out++ = '-';
1818 inShift = 0;
1819 } else {
1820 inShift = 0;
1821 }
1822
1823 }
1824 else {
1825 *out++ = '-';
1826 inShift = 0;
1827 }
1828 }
Tim Petersced69f82003-09-16 20:30:58 +00001829 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001830 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001831 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001832 if (bitsleft) {
1833 *out++= B64(charsleft << (6-bitsleft) );
1834 *out++ = '-';
1835 }
1836
Guido van Rossum98297ee2007-11-06 21:34:58 +00001837 result = PyString_FromStringAndSize(PyBytes_AS_STRING(v), out - start);
1838 Py_DECREF(v);
1839 return result;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001840}
1841
1842#undef SPECIAL
1843#undef B64
1844#undef B64CHAR
1845#undef UB64
1846#undef ENCODE
1847#undef DECODE
1848
Guido van Rossumd57fd912000-03-10 22:53:23 +00001849/* --- UTF-8 Codec -------------------------------------------------------- */
1850
/* Map UTF-8 encoded prefix byte to sequence length.  Zero means
   illegal prefix.  See RFC 2279 for details.  The table is only read by
   the decoder visible here, hence the const qualifier. */
static const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1872
Guido van Rossumd57fd912000-03-10 22:53:23 +00001873PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001874 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001875 const char *errors)
1876{
Walter Dörwald69652032004-09-07 20:24:22 +00001877 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1878}
1879
1880PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001881 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001882 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001883 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001884{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001885 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001886 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001887 Py_ssize_t startinpos;
1888 Py_ssize_t endinpos;
1889 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001890 const char *e;
1891 PyUnicodeObject *unicode;
1892 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001893 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001894 PyObject *errorHandler = NULL;
1895 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001896
1897 /* Note: size will always be longer than the resulting Unicode
1898 character count */
1899 unicode = _PyUnicode_New(size);
1900 if (!unicode)
1901 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001902 if (size == 0) {
1903 if (consumed)
1904 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001905 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001906 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001907
1908 /* Unpack UTF-8 encoded data */
1909 p = unicode->str;
1910 e = s + size;
1911
1912 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001913 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001914
1915 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001916 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001917 s++;
1918 continue;
1919 }
1920
1921 n = utf8_code_length[ch];
1922
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001923 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001924 if (consumed)
1925 break;
1926 else {
1927 errmsg = "unexpected end of data";
1928 startinpos = s-starts;
1929 endinpos = size;
1930 goto utf8Error;
1931 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001932 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001933
1934 switch (n) {
1935
1936 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001937 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001938 startinpos = s-starts;
1939 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001940 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001941
1942 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001943 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001944 startinpos = s-starts;
1945 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001946 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001947
1948 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001949 if ((s[1] & 0xc0) != 0x80) {
1950 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001951 startinpos = s-starts;
1952 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001953 goto utf8Error;
1954 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001955 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001956 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001957 startinpos = s-starts;
1958 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001959 errmsg = "illegal encoding";
1960 goto utf8Error;
1961 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001962 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001963 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001964 break;
1965
1966 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001967 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001968 (s[2] & 0xc0) != 0x80) {
1969 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001970 startinpos = s-starts;
1971 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001972 goto utf8Error;
1973 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001974 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001975 if (ch < 0x0800) {
1976 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001977 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001978
1979 XXX For wide builds (UCS-4) we should probably try
1980 to recombine the surrogates into a single code
1981 unit.
1982 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001983 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001984 startinpos = s-starts;
1985 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001986 goto utf8Error;
1987 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001988 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001989 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001990 break;
1991
1992 case 4:
1993 if ((s[1] & 0xc0) != 0x80 ||
1994 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001995 (s[3] & 0xc0) != 0x80) {
1996 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001997 startinpos = s-starts;
1998 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001999 goto utf8Error;
2000 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002001 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2002 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2003 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002004 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002005 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002006 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002007 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002008 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002009 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002010 startinpos = s-starts;
2011 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002012 goto utf8Error;
2013 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00002014#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002015 *p++ = (Py_UNICODE)ch;
2016#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002017 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002018
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002019 /* translate from 10000..10FFFF to 0..FFFF */
2020 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002021
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002022 /* high surrogate = top 10 bits added to D800 */
2023 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002024
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002025 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002026 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002027#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002028 break;
2029
2030 default:
2031 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002032 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002033 startinpos = s-starts;
2034 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002035 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002036 }
2037 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002038 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002039
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002040 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002041 outpos = p-PyUnicode_AS_UNICODE(unicode);
2042 if (unicode_decode_call_errorhandler(
2043 errors, &errorHandler,
2044 "utf8", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002045 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002046 (PyObject **)&unicode, &outpos, &p))
2047 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002048 }
Walter Dörwald69652032004-09-07 20:24:22 +00002049 if (consumed)
2050 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002051
2052 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002053 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002054 goto onError;
2055
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002056 Py_XDECREF(errorHandler);
2057 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002058 return (PyObject *)unicode;
2059
2060onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002061 Py_XDECREF(errorHandler);
2062 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002063 Py_DECREF(unicode);
2064 return NULL;
2065}
2066
Tim Peters602f7402002-04-27 18:03:26 +00002067/* Allocation strategy: if the string is short, convert into a stack buffer
2068 and allocate exactly as much space needed at the end. Else allocate the
2069 maximum possible needed (4 result bytes per Unicode character), and return
2070 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002071*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002072PyObject *
2073PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002074 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00002075 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002076{
Tim Peters602f7402002-04-27 18:03:26 +00002077#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002078
Guido van Rossum98297ee2007-11-06 21:34:58 +00002079 Py_ssize_t i; /* index into s of next input byte */
2080 PyObject *result; /* result string object */
2081 char *p; /* next free byte in output buffer */
2082 Py_ssize_t nallocated; /* number of result bytes allocated */
2083 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002084 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002085
Tim Peters602f7402002-04-27 18:03:26 +00002086 assert(s != NULL);
2087 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002088
Tim Peters602f7402002-04-27 18:03:26 +00002089 if (size <= MAX_SHORT_UNICHARS) {
2090 /* Write into the stack buffer; nallocated can't overflow.
2091 * At the end, we'll allocate exactly as much heap space as it
2092 * turns out we need.
2093 */
2094 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002095 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002096 p = stackbuf;
2097 }
2098 else {
2099 /* Overallocate on the heap, and give the excess back at the end. */
2100 nallocated = size * 4;
2101 if (nallocated / 4 != size) /* overflow! */
2102 return PyErr_NoMemory();
Guido van Rossum98297ee2007-11-06 21:34:58 +00002103 result = PyString_FromStringAndSize(NULL, nallocated);
2104 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002105 return NULL;
Guido van Rossum98297ee2007-11-06 21:34:58 +00002106 p = PyString_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002107 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002108
Tim Peters602f7402002-04-27 18:03:26 +00002109 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002110 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002111
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002112 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002113 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002114 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002115
Guido van Rossumd57fd912000-03-10 22:53:23 +00002116 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002117 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002118 *p++ = (char)(0xc0 | (ch >> 6));
2119 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002120 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002121 else {
Tim Peters602f7402002-04-27 18:03:26 +00002122 /* Encode UCS2 Unicode ordinals */
2123 if (ch < 0x10000) {
2124 /* Special case: check for high surrogate */
2125 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2126 Py_UCS4 ch2 = s[i];
2127 /* Check for low surrogate and combine the two to
2128 form a UCS4 value */
2129 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002130 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002131 i++;
2132 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002133 }
Tim Peters602f7402002-04-27 18:03:26 +00002134 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002135 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002136 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002137 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2138 *p++ = (char)(0x80 | (ch & 0x3f));
2139 continue;
2140 }
2141encodeUCS4:
2142 /* Encode UCS4 Unicode ordinals */
2143 *p++ = (char)(0xf0 | (ch >> 18));
2144 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2145 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2146 *p++ = (char)(0x80 | (ch & 0x3f));
2147 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002148 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002149
Guido van Rossum98297ee2007-11-06 21:34:58 +00002150 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002151 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002152 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002153 assert(nneeded <= nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002154 result = PyString_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002155 }
2156 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002157 /* Cut back to size actually needed. */
Guido van Rossum98297ee2007-11-06 21:34:58 +00002158 nneeded = p - PyString_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002159 assert(nneeded <= nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002160 _PyString_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002161 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002162 return result;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002163
Tim Peters602f7402002-04-27 18:03:26 +00002164#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002165}
2166
Guido van Rossumd57fd912000-03-10 22:53:23 +00002167PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2168{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002169 if (!PyUnicode_Check(unicode)) {
2170 PyErr_BadArgument();
2171 return NULL;
2172 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002173 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2174 PyUnicode_GET_SIZE(unicode),
2175 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002176}
2177
Walter Dörwald41980ca2007-08-16 21:55:45 +00002178/* --- UTF-32 Codec ------------------------------------------------------- */
2179
2180PyObject *
2181PyUnicode_DecodeUTF32(const char *s,
2182 Py_ssize_t size,
2183 const char *errors,
2184 int *byteorder)
2185{
2186 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2187}
2188
2189PyObject *
2190PyUnicode_DecodeUTF32Stateful(const char *s,
2191 Py_ssize_t size,
2192 const char *errors,
2193 int *byteorder,
2194 Py_ssize_t *consumed)
2195{
2196 const char *starts = s;
2197 Py_ssize_t startinpos;
2198 Py_ssize_t endinpos;
2199 Py_ssize_t outpos;
2200 PyUnicodeObject *unicode;
2201 Py_UNICODE *p;
2202#ifndef Py_UNICODE_WIDE
2203 int i, pairs;
2204#else
2205 const int pairs = 0;
2206#endif
2207 const unsigned char *q, *e;
2208 int bo = 0; /* assume native ordering by default */
2209 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002210 /* Offsets from q for retrieving bytes in the right order. */
2211#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2212 int iorder[] = {0, 1, 2, 3};
2213#else
2214 int iorder[] = {3, 2, 1, 0};
2215#endif
2216 PyObject *errorHandler = NULL;
2217 PyObject *exc = NULL;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002218 /* On narrow builds we split characters outside the BMP into two
2219 codepoints => count how much extra space we need. */
2220#ifndef Py_UNICODE_WIDE
2221 for (i = pairs = 0; i < size/4; i++)
2222 if (((Py_UCS4 *)s)[i] >= 0x10000)
2223 pairs++;
2224#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00002225
2226 /* This might be one to much, because of a BOM */
2227 unicode = _PyUnicode_New((size+3)/4+pairs);
2228 if (!unicode)
2229 return NULL;
2230 if (size == 0)
2231 return (PyObject *)unicode;
2232
2233 /* Unpack UTF-32 encoded data */
2234 p = unicode->str;
2235 q = (unsigned char *)s;
2236 e = q + size;
2237
2238 if (byteorder)
2239 bo = *byteorder;
2240
2241 /* Check for BOM marks (U+FEFF) in the input and adjust current
2242 byte order setting accordingly. In native mode, the leading BOM
2243 mark is skipped, in all other modes, it is copied to the output
2244 stream as-is (giving a ZWNBSP character). */
2245 if (bo == 0) {
2246 if (size >= 4) {
2247 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2248 (q[iorder[1]] << 8) | q[iorder[0]];
2249#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2250 if (bom == 0x0000FEFF) {
2251 q += 4;
2252 bo = -1;
2253 }
2254 else if (bom == 0xFFFE0000) {
2255 q += 4;
2256 bo = 1;
2257 }
2258#else
2259 if (bom == 0x0000FEFF) {
2260 q += 4;
2261 bo = 1;
2262 }
2263 else if (bom == 0xFFFE0000) {
2264 q += 4;
2265 bo = -1;
2266 }
2267#endif
2268 }
2269 }
2270
2271 if (bo == -1) {
2272 /* force LE */
2273 iorder[0] = 0;
2274 iorder[1] = 1;
2275 iorder[2] = 2;
2276 iorder[3] = 3;
2277 }
2278 else if (bo == 1) {
2279 /* force BE */
2280 iorder[0] = 3;
2281 iorder[1] = 2;
2282 iorder[2] = 1;
2283 iorder[3] = 0;
2284 }
2285
2286 while (q < e) {
2287 Py_UCS4 ch;
2288 /* remaining bytes at the end? (size should be divisible by 4) */
2289 if (e-q<4) {
2290 if (consumed)
2291 break;
2292 errmsg = "truncated data";
2293 startinpos = ((const char *)q)-starts;
2294 endinpos = ((const char *)e)-starts;
2295 goto utf32Error;
2296 /* The remaining input chars are ignored if the callback
2297 chooses to skip the input */
2298 }
2299 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2300 (q[iorder[1]] << 8) | q[iorder[0]];
2301
2302 if (ch >= 0x110000)
2303 {
2304 errmsg = "codepoint not in range(0x110000)";
2305 startinpos = ((const char *)q)-starts;
2306 endinpos = startinpos+4;
2307 goto utf32Error;
2308 }
2309#ifndef Py_UNICODE_WIDE
2310 if (ch >= 0x10000)
2311 {
2312 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2313 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2314 }
2315 else
2316#endif
2317 *p++ = ch;
2318 q += 4;
2319 continue;
2320 utf32Error:
2321 outpos = p-PyUnicode_AS_UNICODE(unicode);
2322 if (unicode_decode_call_errorhandler(
2323 errors, &errorHandler,
2324 "utf32", errmsg,
2325 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2326 (PyObject **)&unicode, &outpos, &p))
2327 goto onError;
2328 }
2329
2330 if (byteorder)
2331 *byteorder = bo;
2332
2333 if (consumed)
2334 *consumed = (const char *)q-starts;
2335
2336 /* Adjust length */
2337 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2338 goto onError;
2339
2340 Py_XDECREF(errorHandler);
2341 Py_XDECREF(exc);
2342 return (PyObject *)unicode;
2343
2344onError:
2345 Py_DECREF(unicode);
2346 Py_XDECREF(errorHandler);
2347 Py_XDECREF(exc);
2348 return NULL;
2349}
2350
2351PyObject *
2352PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2353 Py_ssize_t size,
2354 const char *errors,
2355 int byteorder)
2356{
Guido van Rossum98297ee2007-11-06 21:34:58 +00002357 PyObject *v, *result;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002358 unsigned char *p;
2359#ifndef Py_UNICODE_WIDE
2360 int i, pairs;
2361#else
2362 const int pairs = 0;
2363#endif
2364 /* Offsets from p for storing byte pairs in the right order. */
2365#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2366 int iorder[] = {0, 1, 2, 3};
2367#else
2368 int iorder[] = {3, 2, 1, 0};
2369#endif
2370
2371#define STORECHAR(CH) \
2372 do { \
2373 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2374 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2375 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2376 p[iorder[0]] = (CH) & 0xff; \
2377 p += 4; \
2378 } while(0)
2379
2380 /* In narrow builds we can output surrogate pairs as one codepoint,
2381 so we need less space. */
2382#ifndef Py_UNICODE_WIDE
2383 for (i = pairs = 0; i < size-1; i++)
2384 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2385 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2386 pairs++;
2387#endif
2388 v = PyBytes_FromStringAndSize(NULL,
2389 4 * (size - pairs + (byteorder == 0)));
2390 if (v == NULL)
2391 return NULL;
2392
2393 p = (unsigned char *)PyBytes_AS_STRING(v);
2394 if (byteorder == 0)
2395 STORECHAR(0xFEFF);
2396 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002397 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002398
2399 if (byteorder == -1) {
2400 /* force LE */
2401 iorder[0] = 0;
2402 iorder[1] = 1;
2403 iorder[2] = 2;
2404 iorder[3] = 3;
2405 }
2406 else if (byteorder == 1) {
2407 /* force BE */
2408 iorder[0] = 3;
2409 iorder[1] = 2;
2410 iorder[2] = 1;
2411 iorder[3] = 0;
2412 }
2413
2414 while (size-- > 0) {
2415 Py_UCS4 ch = *s++;
2416#ifndef Py_UNICODE_WIDE
2417 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2418 Py_UCS4 ch2 = *s;
2419 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2420 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2421 s++;
2422 size--;
2423 }
2424 }
2425#endif
2426 STORECHAR(ch);
2427 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002428
2429 done:
Christian Heimes90aa7642007-12-19 02:45:37 +00002430 result = PyString_FromStringAndSize(PyBytes_AS_STRING(v), Py_SIZE(v));
Guido van Rossum98297ee2007-11-06 21:34:58 +00002431 Py_DECREF(v);
2432 return result;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002433#undef STORECHAR
2434}
2435
2436PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2437{
2438 if (!PyUnicode_Check(unicode)) {
2439 PyErr_BadArgument();
2440 return NULL;
2441 }
2442 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2443 PyUnicode_GET_SIZE(unicode),
2444 NULL,
2445 0);
2446}
2447
Guido van Rossumd57fd912000-03-10 22:53:23 +00002448/* --- UTF-16 Codec ------------------------------------------------------- */
2449
Tim Peters772747b2001-08-09 22:21:55 +00002450PyObject *
2451PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002452 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002453 const char *errors,
2454 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002455{
Walter Dörwald69652032004-09-07 20:24:22 +00002456 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2457}
2458
2459PyObject *
2460PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002461 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002462 const char *errors,
2463 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002464 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002465{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002466 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002467 Py_ssize_t startinpos;
2468 Py_ssize_t endinpos;
2469 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002470 PyUnicodeObject *unicode;
2471 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002472 const unsigned char *q, *e;
2473 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002474 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002475 /* Offsets from q for retrieving byte pairs in the right order. */
2476#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2477 int ihi = 1, ilo = 0;
2478#else
2479 int ihi = 0, ilo = 1;
2480#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002481 PyObject *errorHandler = NULL;
2482 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002483
2484 /* Note: size will always be longer than the resulting Unicode
2485 character count */
2486 unicode = _PyUnicode_New(size);
2487 if (!unicode)
2488 return NULL;
2489 if (size == 0)
2490 return (PyObject *)unicode;
2491
2492 /* Unpack UTF-16 encoded data */
2493 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002494 q = (unsigned char *)s;
2495 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002496
2497 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002498 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002499
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002500 /* Check for BOM marks (U+FEFF) in the input and adjust current
2501 byte order setting accordingly. In native mode, the leading BOM
2502 mark is skipped, in all other modes, it is copied to the output
2503 stream as-is (giving a ZWNBSP character). */
2504 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002505 if (size >= 2) {
2506 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002507#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002508 if (bom == 0xFEFF) {
2509 q += 2;
2510 bo = -1;
2511 }
2512 else if (bom == 0xFFFE) {
2513 q += 2;
2514 bo = 1;
2515 }
Tim Petersced69f82003-09-16 20:30:58 +00002516#else
Walter Dörwald69652032004-09-07 20:24:22 +00002517 if (bom == 0xFEFF) {
2518 q += 2;
2519 bo = 1;
2520 }
2521 else if (bom == 0xFFFE) {
2522 q += 2;
2523 bo = -1;
2524 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002525#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002526 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002527 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002528
Tim Peters772747b2001-08-09 22:21:55 +00002529 if (bo == -1) {
2530 /* force LE */
2531 ihi = 1;
2532 ilo = 0;
2533 }
2534 else if (bo == 1) {
2535 /* force BE */
2536 ihi = 0;
2537 ilo = 1;
2538 }
2539
2540 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002541 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002542 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002543 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002544 if (consumed)
2545 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002546 errmsg = "truncated data";
2547 startinpos = ((const char *)q)-starts;
2548 endinpos = ((const char *)e)-starts;
2549 goto utf16Error;
2550 /* The remaining input chars are ignored if the callback
2551 chooses to skip the input */
2552 }
2553 ch = (q[ihi] << 8) | q[ilo];
2554
Tim Peters772747b2001-08-09 22:21:55 +00002555 q += 2;
2556
Guido van Rossumd57fd912000-03-10 22:53:23 +00002557 if (ch < 0xD800 || ch > 0xDFFF) {
2558 *p++ = ch;
2559 continue;
2560 }
2561
2562 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002563 if (q >= e) {
2564 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002565 startinpos = (((const char *)q)-2)-starts;
2566 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002567 goto utf16Error;
2568 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002569 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002570 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2571 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002572 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002573#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002574 *p++ = ch;
2575 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002576#else
2577 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002578#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002579 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002580 }
2581 else {
2582 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002583 startinpos = (((const char *)q)-4)-starts;
2584 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002585 goto utf16Error;
2586 }
2587
Guido van Rossumd57fd912000-03-10 22:53:23 +00002588 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002589 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002590 startinpos = (((const char *)q)-2)-starts;
2591 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002592 /* Fall through to report the error */
2593
2594 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002595 outpos = p-PyUnicode_AS_UNICODE(unicode);
2596 if (unicode_decode_call_errorhandler(
2597 errors, &errorHandler,
2598 "utf16", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002599 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002600 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002601 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002602 }
2603
2604 if (byteorder)
2605 *byteorder = bo;
2606
Walter Dörwald69652032004-09-07 20:24:22 +00002607 if (consumed)
2608 *consumed = (const char *)q-starts;
2609
Guido van Rossumd57fd912000-03-10 22:53:23 +00002610 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002611 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002612 goto onError;
2613
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002614 Py_XDECREF(errorHandler);
2615 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002616 return (PyObject *)unicode;
2617
2618onError:
2619 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002620 Py_XDECREF(errorHandler);
2621 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002622 return NULL;
2623}
2624
Tim Peters772747b2001-08-09 22:21:55 +00002625PyObject *
2626PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002627 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002628 const char *errors,
2629 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002630{
Guido van Rossum98297ee2007-11-06 21:34:58 +00002631 PyObject *v, *result;
Tim Peters772747b2001-08-09 22:21:55 +00002632 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002633#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00002634 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002635#else
2636 const int pairs = 0;
2637#endif
Tim Peters772747b2001-08-09 22:21:55 +00002638 /* Offsets from p for storing byte pairs in the right order. */
2639#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2640 int ihi = 1, ilo = 0;
2641#else
2642 int ihi = 0, ilo = 1;
2643#endif
2644
2645#define STORECHAR(CH) \
2646 do { \
2647 p[ihi] = ((CH) >> 8) & 0xff; \
2648 p[ilo] = (CH) & 0xff; \
2649 p += 2; \
2650 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002651
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002652#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002653 for (i = pairs = 0; i < size; i++)
2654 if (s[i] >= 0x10000)
2655 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002656#endif
Walter Dörwald3cc34522007-05-04 10:48:27 +00002657 v = PyBytes_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002658 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002659 if (v == NULL)
2660 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002661
Walter Dörwald3cc34522007-05-04 10:48:27 +00002662 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002663 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002664 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002665 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002666 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00002667
2668 if (byteorder == -1) {
2669 /* force LE */
2670 ihi = 1;
2671 ilo = 0;
2672 }
2673 else if (byteorder == 1) {
2674 /* force BE */
2675 ihi = 0;
2676 ilo = 1;
2677 }
2678
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002679 while (size-- > 0) {
2680 Py_UNICODE ch = *s++;
2681 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002682#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002683 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002684 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2685 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002686 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002687#endif
Tim Peters772747b2001-08-09 22:21:55 +00002688 STORECHAR(ch);
2689 if (ch2)
2690 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002691 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002692
2693 done:
Christian Heimes90aa7642007-12-19 02:45:37 +00002694 result = PyString_FromStringAndSize(PyBytes_AS_STRING(v), Py_SIZE(v));
Guido van Rossum98297ee2007-11-06 21:34:58 +00002695 Py_DECREF(v);
2696 return result;
Tim Peters772747b2001-08-09 22:21:55 +00002697#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002698}
2699
2700PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2701{
2702 if (!PyUnicode_Check(unicode)) {
2703 PyErr_BadArgument();
2704 return NULL;
2705 }
2706 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2707 PyUnicode_GET_SIZE(unicode),
2708 NULL,
2709 0);
2710}
2711
2712/* --- Unicode Escape Codec ----------------------------------------------- */
2713
/* C API table of the unicodedata module, used to resolve \N{name}
   escapes.  Starts out NULL and is filled in by
   PyUnicode_DecodeUnicodeEscape() the first time a \N escape is seen. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00002714static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002715
Guido van Rossumd57fd912000-03-10 22:53:23 +00002716PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002717 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002718 const char *errors)
2719{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002720 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002721 Py_ssize_t startinpos;
2722 Py_ssize_t endinpos;
2723 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002724 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002725 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002726 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002727 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002728 char* message;
2729 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002730 PyObject *errorHandler = NULL;
2731 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002732
Guido van Rossumd57fd912000-03-10 22:53:23 +00002733 /* Escaped strings will always be longer than the resulting
2734 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002735 length after conversion to the true value.
2736 (but if the error callback returns a long replacement string
2737 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002738 v = _PyUnicode_New(size);
2739 if (v == NULL)
2740 goto onError;
2741 if (size == 0)
2742 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002743
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002744 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002745 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002746
Guido van Rossumd57fd912000-03-10 22:53:23 +00002747 while (s < end) {
2748 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002749 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002750 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002751
2752 /* Non-escape characters are interpreted as Unicode ordinals */
2753 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002754 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002755 continue;
2756 }
2757
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002758 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002759 /* \ - Escapes */
2760 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002761 c = *s++;
2762 if (s > end)
2763 c = '\0'; /* Invalid after \ */
2764 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002765
2766 /* \x escapes */
2767 case '\n': break;
2768 case '\\': *p++ = '\\'; break;
2769 case '\'': *p++ = '\''; break;
2770 case '\"': *p++ = '\"'; break;
2771 case 'b': *p++ = '\b'; break;
2772 case 'f': *p++ = '\014'; break; /* FF */
2773 case 't': *p++ = '\t'; break;
2774 case 'n': *p++ = '\n'; break;
2775 case 'r': *p++ = '\r'; break;
2776 case 'v': *p++ = '\013'; break; /* VT */
2777 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2778
2779 /* \OOO (octal) escapes */
2780 case '0': case '1': case '2': case '3':
2781 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002782 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002783 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002784 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002785 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002786 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002787 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002788 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002789 break;
2790
Fredrik Lundhccc74732001-02-18 22:13:49 +00002791 /* hex escapes */
2792 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002793 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002794 digits = 2;
2795 message = "truncated \\xXX escape";
2796 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002797
Fredrik Lundhccc74732001-02-18 22:13:49 +00002798 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002799 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002800 digits = 4;
2801 message = "truncated \\uXXXX escape";
2802 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002803
Fredrik Lundhccc74732001-02-18 22:13:49 +00002804 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002805 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002806 digits = 8;
2807 message = "truncated \\UXXXXXXXX escape";
2808 hexescape:
2809 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002810 outpos = p-PyUnicode_AS_UNICODE(v);
2811 if (s+digits>end) {
2812 endinpos = size;
2813 if (unicode_decode_call_errorhandler(
2814 errors, &errorHandler,
2815 "unicodeescape", "end of string in escape sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002816 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002817 (PyObject **)&v, &outpos, &p))
2818 goto onError;
2819 goto nextByte;
2820 }
2821 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002822 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00002823 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002824 endinpos = (s+i+1)-starts;
2825 if (unicode_decode_call_errorhandler(
2826 errors, &errorHandler,
2827 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002828 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002829 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002830 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002831 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002832 }
2833 chr = (chr<<4) & ~0xF;
2834 if (c >= '0' && c <= '9')
2835 chr += c - '0';
2836 else if (c >= 'a' && c <= 'f')
2837 chr += 10 + c - 'a';
2838 else
2839 chr += 10 + c - 'A';
2840 }
2841 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002842 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002843 /* _decoding_error will have already written into the
2844 target buffer. */
2845 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002846 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002847 /* when we get here, chr is a 32-bit unicode character */
2848 if (chr <= 0xffff)
2849 /* UCS-2 character */
2850 *p++ = (Py_UNICODE) chr;
2851 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002852 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002853 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002854#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002855 *p++ = chr;
2856#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002857 chr -= 0x10000L;
2858 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002859 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002860#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002861 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002862 endinpos = s-starts;
2863 outpos = p-PyUnicode_AS_UNICODE(v);
2864 if (unicode_decode_call_errorhandler(
2865 errors, &errorHandler,
2866 "unicodeescape", "illegal Unicode character",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002867 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002868 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002869 goto onError;
2870 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002871 break;
2872
2873 /* \N{name} */
2874 case 'N':
2875 message = "malformed \\N character escape";
2876 if (ucnhash_CAPI == NULL) {
2877 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002878 PyObject *m, *api;
Christian Heimes072c0f12008-01-03 23:01:04 +00002879 m = PyImport_ImportModuleNoBlock("unicodedata");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002880 if (m == NULL)
2881 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002882 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002883 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002884 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002885 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002886 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002887 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002888 if (ucnhash_CAPI == NULL)
2889 goto ucnhashError;
2890 }
2891 if (*s == '{') {
2892 const char *start = s+1;
2893 /* look for the closing brace */
2894 while (*s != '}' && s < end)
2895 s++;
2896 if (s > start && s < end && *s == '}') {
2897 /* found a name. look it up in the unicode database */
2898 message = "unknown Unicode character name";
2899 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002900 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002901 goto store;
2902 }
2903 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002904 endinpos = s-starts;
2905 outpos = p-PyUnicode_AS_UNICODE(v);
2906 if (unicode_decode_call_errorhandler(
2907 errors, &errorHandler,
2908 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002909 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002910 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002911 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002912 break;
2913
2914 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002915 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002916 message = "\\ at end of string";
2917 s--;
2918 endinpos = s-starts;
2919 outpos = p-PyUnicode_AS_UNICODE(v);
2920 if (unicode_decode_call_errorhandler(
2921 errors, &errorHandler,
2922 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002923 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002924 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002925 goto onError;
2926 }
2927 else {
2928 *p++ = '\\';
2929 *p++ = (unsigned char)s[-1];
2930 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002931 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002932 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002933 nextByte:
2934 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002935 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002936 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002937 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002938 Py_XDECREF(errorHandler);
2939 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002940 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002941
Fredrik Lundhccc74732001-02-18 22:13:49 +00002942ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002943 PyErr_SetString(
2944 PyExc_UnicodeError,
2945 "\\N escapes not supported (can't load unicodedata module)"
2946 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002947 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002948 Py_XDECREF(errorHandler);
2949 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002950 return NULL;
2951
Fredrik Lundhccc74732001-02-18 22:13:49 +00002952onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002953 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002954 Py_XDECREF(errorHandler);
2955 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002956 return NULL;
2957}
2958
2959/* Return a Unicode-Escape string version of the Unicode object.
2960
2961 If quotes is true, the string is enclosed in u"" or u'' quotes as
2962 appropriate.
2963
2964*/
2965
Thomas Wouters477c8d52006-05-27 19:21:47 +00002966Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2967 Py_ssize_t size,
2968 Py_UNICODE ch)
2969{
2970 /* like wcschr, but doesn't stop at NULL characters */
2971
2972 while (size-- > 0) {
2973 if (*s == ch)
2974 return s;
2975 s++;
2976 }
2977
2978 return NULL;
2979}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002980
/* Lowercase hexadecimal digits; indexed with a 4-bit value (0..15) by the
   escape encoder below to turn one nibble into its ASCII digit. */
Walter Dörwald79e913e2007-05-12 11:08:06 +00002981static const char *hexdigits = "0123456789abcdef";
2982
2983PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2984 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002985{
Guido van Rossum98297ee2007-11-06 21:34:58 +00002986 PyObject *repr, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002987 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002988
Thomas Wouters89f507f2006-12-13 04:49:30 +00002989 /* XXX(nnorwitz): rather than over-allocating, it would be
2990 better to choose a different scheme. Perhaps scan the
2991 first N-chars of the string and allocate based on that size.
2992 */
2993 /* Initial allocation is based on the longest-possible unichr
2994 escape.
2995
2996 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2997 unichr, so in this case it's the longest unichr escape. In
2998 narrow (UTF-16) builds this is five chars per source unichr
2999 since there are two unichrs in the surrogate pair, so in narrow
3000 (UTF-16) builds it's not the longest unichr escape.
3001
3002 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3003 so in the narrow (UTF-16) build case it's the longest unichr
3004 escape.
3005 */
3006
Walter Dörwald79e913e2007-05-12 11:08:06 +00003007 repr = PyBytes_FromStringAndSize(NULL,
Thomas Wouters89f507f2006-12-13 04:49:30 +00003008#ifdef Py_UNICODE_WIDE
3009 + 10*size
3010#else
3011 + 6*size
3012#endif
3013 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003014 if (repr == NULL)
3015 return NULL;
3016
Walter Dörwald79e913e2007-05-12 11:08:06 +00003017 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003018
Guido van Rossumd57fd912000-03-10 22:53:23 +00003019 while (size-- > 0) {
3020 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003021
Walter Dörwald79e913e2007-05-12 11:08:06 +00003022 /* Escape backslashes */
3023 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003024 *p++ = '\\';
3025 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003026 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003027 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003028
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003029#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003030 /* Map 21-bit characters to '\U00xxxxxx' */
3031 else if (ch >= 0x10000) {
3032 *p++ = '\\';
3033 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003034 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3035 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3036 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3037 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3038 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3039 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3040 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3041 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003042 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003043 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003044#else
3045 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003046 else if (ch >= 0xD800 && ch < 0xDC00) {
3047 Py_UNICODE ch2;
3048 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003049
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003050 ch2 = *s++;
3051 size--;
3052 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3053 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3054 *p++ = '\\';
3055 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003056 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3057 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3058 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3059 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3060 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3061 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3062 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3063 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003064 continue;
3065 }
3066 /* Fall through: isolated surrogates are copied as-is */
3067 s--;
3068 size++;
3069 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003070#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003071
Guido van Rossumd57fd912000-03-10 22:53:23 +00003072 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003073 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003074 *p++ = '\\';
3075 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003076 *p++ = hexdigits[(ch >> 12) & 0x000F];
3077 *p++ = hexdigits[(ch >> 8) & 0x000F];
3078 *p++ = hexdigits[(ch >> 4) & 0x000F];
3079 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003080 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003081
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003082 /* Map special whitespace to '\t', \n', '\r' */
3083 else if (ch == '\t') {
3084 *p++ = '\\';
3085 *p++ = 't';
3086 }
3087 else if (ch == '\n') {
3088 *p++ = '\\';
3089 *p++ = 'n';
3090 }
3091 else if (ch == '\r') {
3092 *p++ = '\\';
3093 *p++ = 'r';
3094 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003095
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003096 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003097 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003098 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003099 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003100 *p++ = hexdigits[(ch >> 4) & 0x000F];
3101 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003102 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003103
Guido van Rossumd57fd912000-03-10 22:53:23 +00003104 /* Copy everything else as-is */
3105 else
3106 *p++ = (char) ch;
3107 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003108
Guido van Rossum98297ee2007-11-06 21:34:58 +00003109 result = PyString_FromStringAndSize(PyBytes_AS_STRING(repr),
3110 p - PyBytes_AS_STRING(repr));
3111 Py_DECREF(repr);
3112 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003113}
3114
Guido van Rossumd57fd912000-03-10 22:53:23 +00003115PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3116{
Walter Dörwald79e913e2007-05-12 11:08:06 +00003117 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003118 if (!PyUnicode_Check(unicode)) {
3119 PyErr_BadArgument();
3120 return NULL;
3121 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003122 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3123 PyUnicode_GET_SIZE(unicode));
3124
3125 if (!s)
3126 return NULL;
3127 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
3128 PyBytes_GET_SIZE(s));
3129 Py_DECREF(s);
3130 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003131}
3132
3133/* --- Raw Unicode Escape Codec ------------------------------------------- */
3134
3135PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003136 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003137 const char *errors)
3138{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003139 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003140 Py_ssize_t startinpos;
3141 Py_ssize_t endinpos;
3142 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003143 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003144 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003145 const char *end;
3146 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003147 PyObject *errorHandler = NULL;
3148 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003149
Guido van Rossumd57fd912000-03-10 22:53:23 +00003150 /* Escaped strings will always be longer than the resulting
3151 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003152 length after conversion to the true value. (But decoding error
3153 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003154 v = _PyUnicode_New(size);
3155 if (v == NULL)
3156 goto onError;
3157 if (size == 0)
3158 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003159 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003160 end = s + size;
3161 while (s < end) {
3162 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00003163 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003164 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003165 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003166
3167 /* Non-escape characters are interpreted as Unicode ordinals */
3168 if (*s != '\\') {
3169 *p++ = (unsigned char)*s++;
3170 continue;
3171 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003172 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003173
3174 /* \u-escapes are only interpreted iff the number of leading
3175 backslashes if odd */
3176 bs = s;
3177 for (;s < end;) {
3178 if (*s != '\\')
3179 break;
3180 *p++ = (unsigned char)*s++;
3181 }
3182 if (((s - bs) & 1) == 0 ||
3183 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003184 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003185 continue;
3186 }
3187 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003188 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003189 s++;
3190
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003191 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003192 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003193 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003194 c = (unsigned char)*s;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003195 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003196 endinpos = s-starts;
3197 if (unicode_decode_call_errorhandler(
3198 errors, &errorHandler,
3199 "rawunicodeescape", "truncated \\uXXXX",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003200 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003201 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003202 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003203 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003204 }
3205 x = (x<<4) & ~0xF;
3206 if (c >= '0' && c <= '9')
3207 x += c - '0';
3208 else if (c >= 'a' && c <= 'f')
3209 x += 10 + c - 'a';
3210 else
3211 x += 10 + c - 'A';
3212 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00003213 if (x <= 0xffff)
3214 /* UCS-2 character */
3215 *p++ = (Py_UNICODE) x;
3216 else if (x <= 0x10ffff) {
3217 /* UCS-4 character. Either store directly, or as
3218 surrogate pair. */
3219#ifdef Py_UNICODE_WIDE
Christian Heimescc47b052008-03-25 14:56:36 +00003220 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003221#else
3222 x -= 0x10000L;
3223 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3224 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3225#endif
3226 } else {
3227 endinpos = s-starts;
3228 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003229 if (unicode_decode_call_errorhandler(
3230 errors, &errorHandler,
3231 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003232 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003233 (PyObject **)&v, &outpos, &p))
3234 goto onError;
3235 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003236 nextByte:
3237 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003238 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003239 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003240 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003241 Py_XDECREF(errorHandler);
3242 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003243 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003244
Guido van Rossumd57fd912000-03-10 22:53:23 +00003245 onError:
3246 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003247 Py_XDECREF(errorHandler);
3248 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003249 return NULL;
3250}
3251
3252PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003253 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003254{
Guido van Rossum98297ee2007-11-06 21:34:58 +00003255 PyObject *repr, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003256 char *p;
3257 char *q;
3258
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003259#ifdef Py_UNICODE_WIDE
Walter Dörwald711005d2007-05-12 12:03:26 +00003260 repr = PyBytes_FromStringAndSize(NULL, 10 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003261#else
Walter Dörwald711005d2007-05-12 12:03:26 +00003262 repr = PyBytes_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003263#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003264 if (repr == NULL)
3265 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003266 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003267 goto done;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003268
Walter Dörwald711005d2007-05-12 12:03:26 +00003269 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003270 while (size-- > 0) {
3271 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003272#ifdef Py_UNICODE_WIDE
3273 /* Map 32-bit characters to '\Uxxxxxxxx' */
3274 if (ch >= 0x10000) {
3275 *p++ = '\\';
3276 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003277 *p++ = hexdigits[(ch >> 28) & 0xf];
3278 *p++ = hexdigits[(ch >> 24) & 0xf];
3279 *p++ = hexdigits[(ch >> 20) & 0xf];
3280 *p++ = hexdigits[(ch >> 16) & 0xf];
3281 *p++ = hexdigits[(ch >> 12) & 0xf];
3282 *p++ = hexdigits[(ch >> 8) & 0xf];
3283 *p++ = hexdigits[(ch >> 4) & 0xf];
3284 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003285 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003286 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00003287#else
3288 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3289 if (ch >= 0xD800 && ch < 0xDC00) {
3290 Py_UNICODE ch2;
3291 Py_UCS4 ucs;
3292
3293 ch2 = *s++;
3294 size--;
3295 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3296 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3297 *p++ = '\\';
3298 *p++ = 'U';
3299 *p++ = hexdigits[(ucs >> 28) & 0xf];
3300 *p++ = hexdigits[(ucs >> 24) & 0xf];
3301 *p++ = hexdigits[(ucs >> 20) & 0xf];
3302 *p++ = hexdigits[(ucs >> 16) & 0xf];
3303 *p++ = hexdigits[(ucs >> 12) & 0xf];
3304 *p++ = hexdigits[(ucs >> 8) & 0xf];
3305 *p++ = hexdigits[(ucs >> 4) & 0xf];
3306 *p++ = hexdigits[ucs & 0xf];
3307 continue;
3308 }
3309 /* Fall through: isolated surrogates are copied as-is */
3310 s--;
3311 size++;
3312 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003313#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003314 /* Map 16-bit characters to '\uxxxx' */
3315 if (ch >= 256) {
3316 *p++ = '\\';
3317 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003318 *p++ = hexdigits[(ch >> 12) & 0xf];
3319 *p++ = hexdigits[(ch >> 8) & 0xf];
3320 *p++ = hexdigits[(ch >> 4) & 0xf];
3321 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003322 }
3323 /* Copy everything else as-is */
3324 else
3325 *p++ = (char) ch;
3326 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003327 size = p - q;
3328
3329 done:
3330 result = PyString_FromStringAndSize(PyBytes_AS_STRING(repr), size);
3331 Py_DECREF(repr);
3332 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003333}
3334
3335PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3336{
Walter Dörwald711005d2007-05-12 12:03:26 +00003337 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003338 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003339 PyErr_BadArgument();
3340 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003341 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003342 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3343 PyUnicode_GET_SIZE(unicode));
3344
3345 if (!s)
3346 return NULL;
3347 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
3348 PyBytes_GET_SIZE(s));
3349 Py_DECREF(s);
3350 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003351}
3352
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003353/* --- Unicode Internal Codec ------------------------------------------- */
3354
3355PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003356 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003357 const char *errors)
3358{
3359 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003360 Py_ssize_t startinpos;
3361 Py_ssize_t endinpos;
3362 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003363 PyUnicodeObject *v;
3364 Py_UNICODE *p;
3365 const char *end;
3366 const char *reason;
3367 PyObject *errorHandler = NULL;
3368 PyObject *exc = NULL;
3369
Neal Norwitzd43069c2006-01-08 01:12:10 +00003370#ifdef Py_UNICODE_WIDE
3371 Py_UNICODE unimax = PyUnicode_GetMax();
3372#endif
3373
Thomas Wouters89f507f2006-12-13 04:49:30 +00003374 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003375 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3376 if (v == NULL)
3377 goto onError;
3378 if (PyUnicode_GetSize((PyObject *)v) == 0)
3379 return (PyObject *)v;
3380 p = PyUnicode_AS_UNICODE(v);
3381 end = s + size;
3382
3383 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00003384 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003385 /* We have to sanity check the raw data, otherwise doom looms for
3386 some malformed UCS-4 data. */
3387 if (
3388 #ifdef Py_UNICODE_WIDE
3389 *p > unimax || *p < 0 ||
3390 #endif
3391 end-s < Py_UNICODE_SIZE
3392 )
3393 {
3394 startinpos = s - starts;
3395 if (end-s < Py_UNICODE_SIZE) {
3396 endinpos = end-starts;
3397 reason = "truncated input";
3398 }
3399 else {
3400 endinpos = s - starts + Py_UNICODE_SIZE;
3401 reason = "illegal code point (> 0x10FFFF)";
3402 }
3403 outpos = p - PyUnicode_AS_UNICODE(v);
3404 if (unicode_decode_call_errorhandler(
3405 errors, &errorHandler,
3406 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003407 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003408 (PyObject **)&v, &outpos, &p)) {
3409 goto onError;
3410 }
3411 }
3412 else {
3413 p++;
3414 s += Py_UNICODE_SIZE;
3415 }
3416 }
3417
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003418 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003419 goto onError;
3420 Py_XDECREF(errorHandler);
3421 Py_XDECREF(exc);
3422 return (PyObject *)v;
3423
3424 onError:
3425 Py_XDECREF(v);
3426 Py_XDECREF(errorHandler);
3427 Py_XDECREF(exc);
3428 return NULL;
3429}
3430
Guido van Rossumd57fd912000-03-10 22:53:23 +00003431/* --- Latin-1 Codec ------------------------------------------------------ */
3432
3433PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003434 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003435 const char *errors)
3436{
3437 PyUnicodeObject *v;
3438 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003439
Guido van Rossumd57fd912000-03-10 22:53:23 +00003440 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003441 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003442 Py_UNICODE r = *(unsigned char*)s;
3443 return PyUnicode_FromUnicode(&r, 1);
3444 }
3445
Guido van Rossumd57fd912000-03-10 22:53:23 +00003446 v = _PyUnicode_New(size);
3447 if (v == NULL)
3448 goto onError;
3449 if (size == 0)
3450 return (PyObject *)v;
3451 p = PyUnicode_AS_UNICODE(v);
3452 while (size-- > 0)
3453 *p++ = (unsigned char)*s++;
3454 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003455
Guido van Rossumd57fd912000-03-10 22:53:23 +00003456 onError:
3457 Py_XDECREF(v);
3458 return NULL;
3459}
3460
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003461/* create or adjust a UnicodeEncodeError */
3462static void make_encode_exception(PyObject **exceptionObject,
3463 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003464 const Py_UNICODE *unicode, Py_ssize_t size,
3465 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003466 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003467{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003468 if (*exceptionObject == NULL) {
3469 *exceptionObject = PyUnicodeEncodeError_Create(
3470 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003471 }
3472 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003473 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3474 goto onError;
3475 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3476 goto onError;
3477 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3478 goto onError;
3479 return;
3480 onError:
3481 Py_DECREF(*exceptionObject);
3482 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003483 }
3484}
3485
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003486/* raises a UnicodeEncodeError */
3487static void raise_encode_exception(PyObject **exceptionObject,
3488 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003489 const Py_UNICODE *unicode, Py_ssize_t size,
3490 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003491 const char *reason)
3492{
3493 make_encode_exception(exceptionObject,
3494 encoding, unicode, size, startpos, endpos, reason);
3495 if (*exceptionObject != NULL)
3496 PyCodec_StrictErrors(*exceptionObject);
3497}
3498
3499/* error handling callback helper:
3500 build arguments, call the callback and check the arguments,
3501 put the result into newpos and return the replacement string, which
3502 has to be freed by the caller */
3503static PyObject *unicode_encode_call_errorhandler(const char *errors,
3504 PyObject **errorHandler,
3505 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003506 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3507 Py_ssize_t startpos, Py_ssize_t endpos,
3508 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003509{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003510 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003511
3512 PyObject *restuple;
3513 PyObject *resunicode;
3514
3515 if (*errorHandler == NULL) {
3516 *errorHandler = PyCodec_LookupError(errors);
3517 if (*errorHandler == NULL)
3518 return NULL;
3519 }
3520
3521 make_encode_exception(exceptionObject,
3522 encoding, unicode, size, startpos, endpos, reason);
3523 if (*exceptionObject == NULL)
3524 return NULL;
3525
3526 restuple = PyObject_CallFunctionObjArgs(
3527 *errorHandler, *exceptionObject, NULL);
3528 if (restuple == NULL)
3529 return NULL;
3530 if (!PyTuple_Check(restuple)) {
3531 PyErr_Format(PyExc_TypeError, &argparse[4]);
3532 Py_DECREF(restuple);
3533 return NULL;
3534 }
3535 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3536 &resunicode, newpos)) {
3537 Py_DECREF(restuple);
3538 return NULL;
3539 }
3540 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003541 *newpos = size+*newpos;
3542 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003543 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003544 Py_DECREF(restuple);
3545 return NULL;
3546 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003547 Py_INCREF(resunicode);
3548 Py_DECREF(restuple);
3549 return resunicode;
3550}
3551
/* Shared single-byte encoder: encode the size code units starting at p,
   emitting one byte for every code unit below limit.

   limit selects the codec name and message used in errors: 256 gives
   "latin-1" / "ordinal not in range(256)", anything else gives "ascii" /
   "ordinal not in range(128)".

   A run of consecutive code units >= limit is handled according to errors:
   NULL or "strict" raises, "replace" writes one '?' per code unit,
   "ignore" drops the run, "xmlcharrefreplace" writes "&#N;" per code unit,
   and any other name is dispatched through
   unicode_encode_call_errorhandler.

   Returns a new reference to the object produced by
   PyString_FromStringAndSize, or NULL on failure. */
static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
                                     Py_ssize_t size,
                                     const char *errors,
                                     int limit)
{
    /* output object (scratch buffer, copied into result at the end) */
    PyObject *res;
    /* pointers to the beginning and end+1 of input */
    const Py_UNICODE *startp = p;
    const Py_UNICODE *endp = p + size;
    /* pointer to the beginning of the unencodable characters */
    /* const Py_UNICODE *badp = NULL; */
    /* pointer into the output */
    char *str;
    /* current size of the output buffer */
    Py_ssize_t ressize;
    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    /* stays NULL on every error path, so "return result" reports failure */
    PyObject *result = NULL;
    /* the following variable is used for caching string comparisons
     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
    int known_errorHandler = -1;

    /* allocate enough for a simple encoding without
       replacements, if we need more, we'll resize */
    if (size == 0)
        return PyString_FromStringAndSize(NULL, 0);
    res = PyBytes_FromStringAndSize(NULL, size);
    if (res == NULL)
        return NULL;
    str = PyBytes_AS_STRING(res);
    ressize = size;

    while (p<endp) {
        Py_UNICODE c = *p;

        /* can we encode this? */
        if (c<limit) {
            /* no overflow check, because we know that the space is enough */
            *str++ = (char)c;
            ++p;
        }
        else {
            /* input index of the first unencodable code unit of this run */
            Py_ssize_t unicodepos = p-startp;
            Py_ssize_t requiredsize;
            PyObject *repunicode;
            Py_ssize_t repsize;
            Py_ssize_t newpos;
            /* output offset, kept so str can be recomputed after a resize */
            Py_ssize_t respos;
            Py_UNICODE *uni2;
            /* startpos for collecting unencodable chars */
            const Py_UNICODE *collstart = p;
            const Py_UNICODE *collend = p;
            /* find all unencodable characters */
            while ((collend < endp) && ((*collend)>=limit))
                ++collend;
            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
            if (known_errorHandler==-1) {
                if ((errors==NULL) || (!strcmp(errors, "strict")))
                    known_errorHandler = 1;
                else if (!strcmp(errors, "replace"))
                    known_errorHandler = 2;
                else if (!strcmp(errors, "ignore"))
                    known_errorHandler = 3;
                else if (!strcmp(errors, "xmlcharrefreplace"))
                    known_errorHandler = 4;
                else
                    known_errorHandler = 0;
            }
            switch (known_errorHandler) {
            case 1: /* strict */
                raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
                goto onError;
            case 2: /* replace */
                /* one '?' per unencodable code unit; this never needs more
                   room than the run itself occupied in the input */
                while (collstart++<collend)
                    *str++ = '?'; /* fall through */
            case 3: /* ignore */
                p = collend;
                break;
            case 4: /* xmlcharrefreplace */
                respos = str - PyBytes_AS_STRING(res);
                /* determine replacement size (temporarily (mis)uses p):
                   "&#" (2) + number of decimal digits + ";" (1) */
                for (p = collstart, repsize = 0; p < collend; ++p) {
                    if (*p<10)
                        repsize += 2+1+1;
                    else if (*p<100)
                        repsize += 2+2+1;
                    else if (*p<1000)
                        repsize += 2+3+1;
                    else if (*p<10000)
                        repsize += 2+4+1;
#ifndef Py_UNICODE_WIDE
                    else
                        repsize += 2+5+1;
#else
                    else if (*p<100000)
                        repsize += 2+5+1;
                    else if (*p<1000000)
                        repsize += 2+6+1;
                    else
                        repsize += 2+7+1;
#endif
                }
                /* what is already written + the replacement + one byte for
                   each input code unit still to come */
                requiredsize = respos+repsize+(endp-collend);
                if (requiredsize > ressize) {
                    /* grow to at least double the current size */
                    if (requiredsize<2*ressize)
                        requiredsize = 2*ressize;
                    if (PyBytes_Resize(res, requiredsize))
                        goto onError;
                    /* re-derive the write pointer from the saved offset */
                    str = PyBytes_AS_STRING(res) + respos;
                    ressize = requiredsize;
                }
                /* generate replacement (temporarily (mis)uses p) */
                for (p = collstart; p < collend; ++p) {
                    str += sprintf(str, "&#%d;", (int)*p);
                }
                p = collend;
                break;
            default:
                repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
                    encoding, reason, startp, size, &exc,
                    collstart-startp, collend-startp, &newpos);
                if (repunicode == NULL)
                    goto onError;
                /* need more space? (at least enough for what we
                   have+the replacement+the rest of the string, so
                   we won't have to check space for encodable characters) */
                respos = str - PyBytes_AS_STRING(res);
                repsize = PyUnicode_GET_SIZE(repunicode);
                requiredsize = respos+repsize+(endp-collend);
                if (requiredsize > ressize) {
                    if (requiredsize<2*ressize)
                        requiredsize = 2*ressize;
                    if (PyBytes_Resize(res, requiredsize)) {
                        Py_DECREF(repunicode);
                        goto onError;
                    }
                    str = PyBytes_AS_STRING(res) + respos;
                    ressize = requiredsize;
                }
                /* check if there is anything unencodable in the replacement
                   and copy it to the output */
                for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
                    c = *uni2;
                    if (c >= limit) {
                        /* the replacement itself is not encodable: report
                           the first code unit of the original run */
                        raise_encode_exception(&exc, encoding, startp, size,
                            unicodepos, unicodepos+1, reason);
                        Py_DECREF(repunicode);
                        goto onError;
                    }
                    *str = (char)c;
                }
                /* resume at the input position chosen by the handler */
                p = startp + newpos;
                Py_DECREF(repunicode);
            }
        }
    }
    /* copy only the bytes actually written out of the scratch buffer */
    result = PyString_FromStringAndSize(PyBytes_AS_STRING(res),
                                        str - PyBytes_AS_STRING(res));
    /* the success path falls through into this label as well; result tells
       the two cases apart */
  onError:
    Py_DECREF(res);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return result;
}
3719
Guido van Rossumd57fd912000-03-10 22:53:23 +00003720PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003721 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003722 const char *errors)
3723{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003724 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003725}
3726
3727PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3728{
3729 if (!PyUnicode_Check(unicode)) {
3730 PyErr_BadArgument();
3731 return NULL;
3732 }
3733 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3734 PyUnicode_GET_SIZE(unicode),
3735 NULL);
3736}
3737
3738/* --- 7-bit ASCII Codec -------------------------------------------------- */
3739
Guido van Rossumd57fd912000-03-10 22:53:23 +00003740PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003741 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003742 const char *errors)
3743{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003744 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003745 PyUnicodeObject *v;
3746 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003747 Py_ssize_t startinpos;
3748 Py_ssize_t endinpos;
3749 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003750 const char *e;
3751 PyObject *errorHandler = NULL;
3752 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003753
Guido van Rossumd57fd912000-03-10 22:53:23 +00003754 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003755 if (size == 1 && *(unsigned char*)s < 128) {
3756 Py_UNICODE r = *(unsigned char*)s;
3757 return PyUnicode_FromUnicode(&r, 1);
3758 }
Tim Petersced69f82003-09-16 20:30:58 +00003759
Guido van Rossumd57fd912000-03-10 22:53:23 +00003760 v = _PyUnicode_New(size);
3761 if (v == NULL)
3762 goto onError;
3763 if (size == 0)
3764 return (PyObject *)v;
3765 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003766 e = s + size;
3767 while (s < e) {
3768 register unsigned char c = (unsigned char)*s;
3769 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003770 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003771 ++s;
3772 }
3773 else {
3774 startinpos = s-starts;
3775 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003776 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003777 if (unicode_decode_call_errorhandler(
3778 errors, &errorHandler,
3779 "ascii", "ordinal not in range(128)",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003780 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003781 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003782 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003783 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003784 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00003785 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003786 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003787 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003788 Py_XDECREF(errorHandler);
3789 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003790 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003791
Guido van Rossumd57fd912000-03-10 22:53:23 +00003792 onError:
3793 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003794 Py_XDECREF(errorHandler);
3795 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003796 return NULL;
3797}
3798
Guido van Rossumd57fd912000-03-10 22:53:23 +00003799PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003800 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003801 const char *errors)
3802{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003803 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003804}
3805
3806PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3807{
3808 if (!PyUnicode_Check(unicode)) {
3809 PyErr_BadArgument();
3810 return NULL;
3811 }
3812 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3813 PyUnicode_GET_SIZE(unicode),
3814 NULL);
3815}
3816
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003817#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003818
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003819/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003820
/* The MBCS helpers below take `int` sizes while the public entry points
   take Py_ssize_t.  When Py_ssize_t is wider than int, the entry points
   must feed the input in chunks of at most INT_MAX. */
#if SIZEOF_SSIZE_T > SIZEOF_INT
#  define NEED_RETRY
#endif
3824
/* XXX This code is limited to "true" double-byte encodings, as
   a) it assumes an incomplete character consists of a single byte, and
   b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
   encodings, see IsDBCSLeadByteEx documentation. */

/* Return 1 if the byte at s[offset] is treated as a DBCS lead byte,
   0 otherwise.  The byte must satisfy IsDBCSLeadByte(), and in addition
   the character before it (as located by CharPrev) must either not exist,
   not start with a lead byte, or start exactly two bytes earlier. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
3840
3841/*
3842 * Decode MBCS string into unicode object. If 'final' is set, converts
3843 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3844 */
3845static int decode_mbcs(PyUnicodeObject **v,
3846 const char *s, /* MBCS string */
3847 int size, /* sizeof MBCS string */
3848 int final)
3849{
3850 Py_UNICODE *p;
3851 Py_ssize_t n = 0;
3852 int usize = 0;
3853
3854 assert(size >= 0);
3855
3856 /* Skip trailing lead-byte unless 'final' is set */
3857 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3858 --size;
3859
3860 /* First get the size of the result */
3861 if (size > 0) {
3862 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3863 if (usize == 0) {
3864 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3865 return -1;
3866 }
3867 }
3868
3869 if (*v == NULL) {
3870 /* Create unicode object */
3871 *v = _PyUnicode_New(usize);
3872 if (*v == NULL)
3873 return -1;
3874 }
3875 else {
3876 /* Extend unicode object */
3877 n = PyUnicode_GET_SIZE(*v);
3878 if (_PyUnicode_Resize(v, n + usize) < 0)
3879 return -1;
3880 }
3881
3882 /* Do the conversion */
3883 if (size > 0) {
3884 p = PyUnicode_AS_UNICODE(*v) + n;
3885 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3886 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3887 return -1;
3888 }
3889 }
3890
3891 return size;
3892}
3893
3894PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3895 Py_ssize_t size,
3896 const char *errors,
3897 Py_ssize_t *consumed)
3898{
3899 PyUnicodeObject *v = NULL;
3900 int done;
3901
3902 if (consumed)
3903 *consumed = 0;
3904
3905#ifdef NEED_RETRY
3906 retry:
3907 if (size > INT_MAX)
3908 done = decode_mbcs(&v, s, INT_MAX, 0);
3909 else
3910#endif
3911 done = decode_mbcs(&v, s, (int)size, !consumed);
3912
3913 if (done < 0) {
3914 Py_XDECREF(v);
3915 return NULL;
3916 }
3917
3918 if (consumed)
3919 *consumed += done;
3920
3921#ifdef NEED_RETRY
3922 if (size > INT_MAX) {
3923 s += done;
3924 size -= done;
3925 goto retry;
3926 }
3927#endif
3928
3929 return (PyObject *)v;
3930}
3931
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003932PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003933 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003934 const char *errors)
3935{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003936 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3937}
3938
3939/*
3940 * Convert unicode into string object (MBCS).
3941 * Returns 0 if succeed, -1 otherwise.
3942 */
3943static int encode_mbcs(PyObject **repr,
3944 const Py_UNICODE *p, /* unicode */
3945 int size) /* size of unicode */
3946{
3947 int mbcssize = 0;
3948 Py_ssize_t n = 0;
3949
3950 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003951
3952 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003953 if (size > 0) {
3954 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3955 if (mbcssize == 0) {
3956 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3957 return -1;
3958 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003959 }
3960
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003961 if (*repr == NULL) {
3962 /* Create string object */
Guido van Rossum98297ee2007-11-06 21:34:58 +00003963 *repr = PyString_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003964 if (*repr == NULL)
3965 return -1;
3966 }
3967 else {
3968 /* Extend string object */
Guido van Rossum98297ee2007-11-06 21:34:58 +00003969 n = PyString_Size(*repr);
3970 if (_PyString_Resize(repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003971 return -1;
3972 }
3973
3974 /* Do the conversion */
3975 if (size > 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00003976 char *s = PyString_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003977 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3978 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3979 return -1;
3980 }
3981 }
3982
3983 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003984}
3985
3986PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003987 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003988 const char *errors)
3989{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003990 PyObject *repr = NULL;
3991 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003992
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003993#ifdef NEED_RETRY
3994 retry:
3995 if (size > INT_MAX)
3996 ret = encode_mbcs(&repr, p, INT_MAX);
3997 else
3998#endif
3999 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004000
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004001 if (ret < 0) {
4002 Py_XDECREF(repr);
4003 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004004 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004005
4006#ifdef NEED_RETRY
4007 if (size > INT_MAX) {
4008 p += INT_MAX;
4009 size -= INT_MAX;
4010 goto retry;
4011 }
4012#endif
4013
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004014 return repr;
4015}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004016
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004017PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4018{
4019 if (!PyUnicode_Check(unicode)) {
4020 PyErr_BadArgument();
4021 return NULL;
4022 }
4023 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
4024 PyUnicode_GET_SIZE(unicode),
4025 NULL);
4026}
4027
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004028#undef NEED_RETRY
4029
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004030#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004031
Guido van Rossumd57fd912000-03-10 22:53:23 +00004032/* --- Character Mapping Codec -------------------------------------------- */
4033
Guido van Rossumd57fd912000-03-10 22:53:23 +00004034PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004035 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004036 PyObject *mapping,
4037 const char *errors)
4038{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004039 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004040 Py_ssize_t startinpos;
4041 Py_ssize_t endinpos;
4042 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004043 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004044 PyUnicodeObject *v;
4045 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004046 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004047 PyObject *errorHandler = NULL;
4048 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004049 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004050 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004051
Guido van Rossumd57fd912000-03-10 22:53:23 +00004052 /* Default to Latin-1 */
4053 if (mapping == NULL)
4054 return PyUnicode_DecodeLatin1(s, size, errors);
4055
4056 v = _PyUnicode_New(size);
4057 if (v == NULL)
4058 goto onError;
4059 if (size == 0)
4060 return (PyObject *)v;
4061 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004062 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004063 if (PyUnicode_CheckExact(mapping)) {
4064 mapstring = PyUnicode_AS_UNICODE(mapping);
4065 maplen = PyUnicode_GET_SIZE(mapping);
4066 while (s < e) {
4067 unsigned char ch = *s;
4068 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004069
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004070 if (ch < maplen)
4071 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004072
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004073 if (x == 0xfffe) {
4074 /* undefined mapping */
4075 outpos = p-PyUnicode_AS_UNICODE(v);
4076 startinpos = s-starts;
4077 endinpos = startinpos+1;
4078 if (unicode_decode_call_errorhandler(
4079 errors, &errorHandler,
4080 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00004081 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004082 (PyObject **)&v, &outpos, &p)) {
4083 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00004084 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004085 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00004086 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004087 *p++ = x;
4088 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004089 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004090 }
4091 else {
4092 while (s < e) {
4093 unsigned char ch = *s;
4094 PyObject *w, *x;
4095
4096 /* Get mapping (char ordinal -> integer, Unicode char or None) */
Christian Heimes217cfd12007-12-02 14:31:20 +00004097 w = PyLong_FromLong((long)ch);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004098 if (w == NULL)
4099 goto onError;
4100 x = PyObject_GetItem(mapping, w);
4101 Py_DECREF(w);
4102 if (x == NULL) {
4103 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4104 /* No mapping found means: mapping is undefined. */
4105 PyErr_Clear();
4106 x = Py_None;
4107 Py_INCREF(x);
4108 } else
4109 goto onError;
4110 }
4111
4112 /* Apply mapping */
Christian Heimes217cfd12007-12-02 14:31:20 +00004113 if (PyLong_Check(x)) {
4114 long value = PyLong_AS_LONG(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004115 if (value < 0 || value > 65535) {
4116 PyErr_SetString(PyExc_TypeError,
4117 "character mapping must be in range(65536)");
4118 Py_DECREF(x);
4119 goto onError;
4120 }
4121 *p++ = (Py_UNICODE)value;
4122 }
4123 else if (x == Py_None) {
4124 /* undefined mapping */
4125 outpos = p-PyUnicode_AS_UNICODE(v);
4126 startinpos = s-starts;
4127 endinpos = startinpos+1;
4128 if (unicode_decode_call_errorhandler(
4129 errors, &errorHandler,
4130 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00004131 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004132 (PyObject **)&v, &outpos, &p)) {
4133 Py_DECREF(x);
4134 goto onError;
4135 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00004136 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004137 continue;
4138 }
4139 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004140 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004141
4142 if (targetsize == 1)
4143 /* 1-1 mapping */
4144 *p++ = *PyUnicode_AS_UNICODE(x);
4145
4146 else if (targetsize > 1) {
4147 /* 1-n mapping */
4148 if (targetsize > extrachars) {
4149 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004150 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4151 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004152 (targetsize << 2);
4153 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00004154 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004155 if (_PyUnicode_Resize(&v,
4156 PyUnicode_GET_SIZE(v) + needed) < 0) {
4157 Py_DECREF(x);
4158 goto onError;
4159 }
4160 p = PyUnicode_AS_UNICODE(v) + oldpos;
4161 }
4162 Py_UNICODE_COPY(p,
4163 PyUnicode_AS_UNICODE(x),
4164 targetsize);
4165 p += targetsize;
4166 extrachars -= targetsize;
4167 }
4168 /* 1-0 mapping: skip the character */
4169 }
4170 else {
4171 /* wrong return value */
4172 PyErr_SetString(PyExc_TypeError,
4173 "character mapping must return integer, None or unicode");
4174 Py_DECREF(x);
4175 goto onError;
4176 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004177 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004178 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004179 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004180 }
4181 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004182 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004183 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004184 Py_XDECREF(errorHandler);
4185 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004186 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004187
Guido van Rossumd57fd912000-03-10 22:53:23 +00004188 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004189 Py_XDECREF(errorHandler);
4190 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004191 Py_XDECREF(v);
4192 return NULL;
4193}
4194
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004195/* Charmap encoding: the lookup table */
4196
4197struct encoding_map{
4198 PyObject_HEAD
4199 unsigned char level1[32];
4200 int count2, count3;
4201 unsigned char level23[1];
4202};
4203
4204static PyObject*
4205encoding_map_size(PyObject *obj, PyObject* args)
4206{
4207 struct encoding_map *map = (struct encoding_map*)obj;
Christian Heimes217cfd12007-12-02 14:31:20 +00004208 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004209 128*map->count3);
4210}
4211
4212static PyMethodDef encoding_map_methods[] = {
4213 {"size", encoding_map_size, METH_NOARGS,
4214 PyDoc_STR("Return the size (in bytes) of this object") },
4215 { 0 }
4216};
4217
4218static void
4219encoding_map_dealloc(PyObject* o)
4220{
4221 PyObject_FREE(o);
4222}
4223
4224static PyTypeObject EncodingMapType = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004225 PyVarObject_HEAD_INIT(NULL, 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004226 "EncodingMap", /*tp_name*/
4227 sizeof(struct encoding_map), /*tp_basicsize*/
4228 0, /*tp_itemsize*/
4229 /* methods */
4230 encoding_map_dealloc, /*tp_dealloc*/
4231 0, /*tp_print*/
4232 0, /*tp_getattr*/
4233 0, /*tp_setattr*/
4234 0, /*tp_compare*/
4235 0, /*tp_repr*/
4236 0, /*tp_as_number*/
4237 0, /*tp_as_sequence*/
4238 0, /*tp_as_mapping*/
4239 0, /*tp_hash*/
4240 0, /*tp_call*/
4241 0, /*tp_str*/
4242 0, /*tp_getattro*/
4243 0, /*tp_setattro*/
4244 0, /*tp_as_buffer*/
4245 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4246 0, /*tp_doc*/
4247 0, /*tp_traverse*/
4248 0, /*tp_clear*/
4249 0, /*tp_richcompare*/
4250 0, /*tp_weaklistoffset*/
4251 0, /*tp_iter*/
4252 0, /*tp_iternext*/
4253 encoding_map_methods, /*tp_methods*/
4254 0, /*tp_members*/
4255 0, /*tp_getset*/
4256 0, /*tp_base*/
4257 0, /*tp_dict*/
4258 0, /*tp_descr_get*/
4259 0, /*tp_descr_set*/
4260 0, /*tp_dictoffset*/
4261 0, /*tp_init*/
4262 0, /*tp_alloc*/
4263 0, /*tp_new*/
4264 0, /*tp_free*/
4265 0, /*tp_is_gc*/
4266};
4267
4268PyObject*
4269PyUnicode_BuildEncodingMap(PyObject* string)
4270{
4271 Py_UNICODE *decode;
4272 PyObject *result;
4273 struct encoding_map *mresult;
4274 int i;
4275 int need_dict = 0;
4276 unsigned char level1[32];
4277 unsigned char level2[512];
4278 unsigned char *mlevel1, *mlevel2, *mlevel3;
4279 int count2 = 0, count3 = 0;
4280
4281 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4282 PyErr_BadArgument();
4283 return NULL;
4284 }
4285 decode = PyUnicode_AS_UNICODE(string);
4286 memset(level1, 0xFF, sizeof level1);
4287 memset(level2, 0xFF, sizeof level2);
4288
4289 /* If there isn't a one-to-one mapping of NULL to \0,
4290 or if there are non-BMP characters, we need to use
4291 a mapping dictionary. */
4292 if (decode[0] != 0)
4293 need_dict = 1;
4294 for (i = 1; i < 256; i++) {
4295 int l1, l2;
4296 if (decode[i] == 0
4297 #ifdef Py_UNICODE_WIDE
4298 || decode[i] > 0xFFFF
4299 #endif
4300 ) {
4301 need_dict = 1;
4302 break;
4303 }
4304 if (decode[i] == 0xFFFE)
4305 /* unmapped character */
4306 continue;
4307 l1 = decode[i] >> 11;
4308 l2 = decode[i] >> 7;
4309 if (level1[l1] == 0xFF)
4310 level1[l1] = count2++;
4311 if (level2[l2] == 0xFF)
4312 level2[l2] = count3++;
4313 }
4314
4315 if (count2 >= 0xFF || count3 >= 0xFF)
4316 need_dict = 1;
4317
4318 if (need_dict) {
4319 PyObject *result = PyDict_New();
4320 PyObject *key, *value;
4321 if (!result)
4322 return NULL;
4323 for (i = 0; i < 256; i++) {
4324 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00004325 key = PyLong_FromLong(decode[i]);
4326 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004327 if (!key || !value)
4328 goto failed1;
4329 if (PyDict_SetItem(result, key, value) == -1)
4330 goto failed1;
4331 Py_DECREF(key);
4332 Py_DECREF(value);
4333 }
4334 return result;
4335 failed1:
4336 Py_XDECREF(key);
4337 Py_XDECREF(value);
4338 Py_DECREF(result);
4339 return NULL;
4340 }
4341
4342 /* Create a three-level trie */
4343 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4344 16*count2 + 128*count3 - 1);
4345 if (!result)
4346 return PyErr_NoMemory();
4347 PyObject_Init(result, &EncodingMapType);
4348 mresult = (struct encoding_map*)result;
4349 mresult->count2 = count2;
4350 mresult->count3 = count3;
4351 mlevel1 = mresult->level1;
4352 mlevel2 = mresult->level23;
4353 mlevel3 = mresult->level23 + 16*count2;
4354 memcpy(mlevel1, level1, 32);
4355 memset(mlevel2, 0xFF, 16*count2);
4356 memset(mlevel3, 0, 128*count3);
4357 count3 = 0;
4358 for (i = 1; i < 256; i++) {
4359 int o1, o2, o3, i2, i3;
4360 if (decode[i] == 0xFFFE)
4361 /* unmapped character */
4362 continue;
4363 o1 = decode[i]>>11;
4364 o2 = (decode[i]>>7) & 0xF;
4365 i2 = 16*mlevel1[o1] + o2;
4366 if (mlevel2[i2] == 0xFF)
4367 mlevel2[i2] = count3++;
4368 o3 = decode[i] & 0x7F;
4369 i3 = 128*mlevel2[i2] + o3;
4370 mlevel3[i3] = i;
4371 }
4372 return result;
4373}
4374
4375static int
4376encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4377{
4378 struct encoding_map *map = (struct encoding_map*)mapping;
4379 int l1 = c>>11;
4380 int l2 = (c>>7) & 0xF;
4381 int l3 = c & 0x7F;
4382 int i;
4383
4384#ifdef Py_UNICODE_WIDE
4385 if (c > 0xFFFF) {
4386 return -1;
4387 }
4388#endif
4389 if (c == 0)
4390 return 0;
4391 /* level 1*/
4392 i = map->level1[l1];
4393 if (i == 0xFF) {
4394 return -1;
4395 }
4396 /* level 2*/
4397 i = map->level23[16*i+l2];
4398 if (i == 0xFF) {
4399 return -1;
4400 }
4401 /* level 3 */
4402 i = map->level23[16*map->count2 + 128*i + l3];
4403 if (i == 0) {
4404 return -1;
4405 }
4406 return i;
4407}
4408
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004409/* Lookup the character ch in the mapping. If the character
4410 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004411 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004412static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004413{
Christian Heimes217cfd12007-12-02 14:31:20 +00004414 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004415 PyObject *x;
4416
4417 if (w == NULL)
4418 return NULL;
4419 x = PyObject_GetItem(mapping, w);
4420 Py_DECREF(w);
4421 if (x == NULL) {
4422 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4423 /* No mapping found means: mapping is undefined. */
4424 PyErr_Clear();
4425 x = Py_None;
4426 Py_INCREF(x);
4427 return x;
4428 } else
4429 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004430 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004431 else if (x == Py_None)
4432 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00004433 else if (PyLong_Check(x)) {
4434 long value = PyLong_AS_LONG(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004435 if (value < 0 || value > 255) {
4436 PyErr_SetString(PyExc_TypeError,
4437 "character mapping must be in range(256)");
4438 Py_DECREF(x);
4439 return NULL;
4440 }
4441 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004442 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004443 else if (PyString_Check(x))
4444 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004445 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004446 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00004447 PyErr_Format(PyExc_TypeError,
Christian Heimesf3863112007-11-22 07:46:41 +00004448 "character mapping must return integer, bytes or None, not %.400s",
Walter Dörwald580ceed2007-05-09 10:39:19 +00004449 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004450 Py_DECREF(x);
4451 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004452 }
4453}
4454
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004455static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00004456charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004457{
Guido van Rossum98297ee2007-11-06 21:34:58 +00004458 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004459 /* exponentially overallocate to minimize reallocations */
4460 if (requiredsize < 2*outsize)
4461 requiredsize = 2*outsize;
Guido van Rossum98297ee2007-11-06 21:34:58 +00004462 if (_PyString_Resize(outobj, requiredsize))
Walter Dörwald827b0552007-05-12 13:23:53 +00004463 return -1;
Walter Dörwald827b0552007-05-12 13:23:53 +00004464 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004465}
4466
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* a lookup or resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004470/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00004471 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004472 space is available. Return a new reference to the object that
4473 was put in the output buffer, or Py_None, if the mapping was undefined
4474 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004475 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004476static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004477charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004478 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004479{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004480 PyObject *rep;
4481 char *outstart;
Guido van Rossum98297ee2007-11-06 21:34:58 +00004482 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004483
Christian Heimes90aa7642007-12-19 02:45:37 +00004484 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004485 int res = encoding_map_lookup(c, mapping);
4486 Py_ssize_t requiredsize = *outpos+1;
4487 if (res == -1)
4488 return enc_FAILED;
Guido van Rossum98297ee2007-11-06 21:34:58 +00004489 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004490 if (charmapencode_resize(outobj, outpos, requiredsize))
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004491 return enc_EXCEPTION;
Guido van Rossum98297ee2007-11-06 21:34:58 +00004492 outstart = PyString_AS_STRING(*outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004493 outstart[(*outpos)++] = (char)res;
4494 return enc_SUCCESS;
4495 }
4496
4497 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004498 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004499 return enc_EXCEPTION;
4500 else if (rep==Py_None) {
4501 Py_DECREF(rep);
4502 return enc_FAILED;
4503 } else {
Christian Heimes217cfd12007-12-02 14:31:20 +00004504 if (PyLong_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004505 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004506 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004507 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004508 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004509 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004510 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004511 outstart = PyString_AS_STRING(*outobj);
Christian Heimes217cfd12007-12-02 14:31:20 +00004512 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004513 }
4514 else {
4515 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004516 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4517 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004518 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004519 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004520 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004521 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004522 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004523 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004524 memcpy(outstart + *outpos, repchars, repsize);
4525 *outpos += repsize;
4526 }
4527 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004528 Py_DECREF(rep);
4529 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004530}
4531
4532/* handle an error in PyUnicode_EncodeCharmap
4533 Return 0 on success, -1 on error */
4534static
4535int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004536 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004537 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004538 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004539 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004540{
4541 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004542 Py_ssize_t repsize;
4543 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004544 Py_UNICODE *uni2;
4545 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004546 Py_ssize_t collstartpos = *inpos;
4547 Py_ssize_t collendpos = *inpos+1;
4548 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004549 char *encoding = "charmap";
4550 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004551 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004552
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004553 /* find all unencodable characters */
4554 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004555 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00004556 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004557 int res = encoding_map_lookup(p[collendpos], mapping);
4558 if (res != -1)
4559 break;
4560 ++collendpos;
4561 continue;
4562 }
4563
4564 rep = charmapencode_lookup(p[collendpos], mapping);
4565 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004566 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004567 else if (rep!=Py_None) {
4568 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004569 break;
4570 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004571 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004572 ++collendpos;
4573 }
4574 /* cache callback name lookup
4575 * (if not done yet, i.e. it's the first error) */
4576 if (*known_errorHandler==-1) {
4577 if ((errors==NULL) || (!strcmp(errors, "strict")))
4578 *known_errorHandler = 1;
4579 else if (!strcmp(errors, "replace"))
4580 *known_errorHandler = 2;
4581 else if (!strcmp(errors, "ignore"))
4582 *known_errorHandler = 3;
4583 else if (!strcmp(errors, "xmlcharrefreplace"))
4584 *known_errorHandler = 4;
4585 else
4586 *known_errorHandler = 0;
4587 }
4588 switch (*known_errorHandler) {
4589 case 1: /* strict */
4590 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4591 return -1;
4592 case 2: /* replace */
4593 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4594 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004595 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004596 return -1;
4597 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004598 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004599 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4600 return -1;
4601 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004602 }
4603 /* fall through */
4604 case 3: /* ignore */
4605 *inpos = collendpos;
4606 break;
4607 case 4: /* xmlcharrefreplace */
4608 /* generate replacement (temporarily (mis)uses p) */
4609 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4610 char buffer[2+29+1+1];
4611 char *cp;
4612 sprintf(buffer, "&#%d;", (int)p[collpos]);
4613 for (cp = buffer; *cp; ++cp) {
4614 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004615 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004616 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004617 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004618 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4619 return -1;
4620 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004621 }
4622 }
4623 *inpos = collendpos;
4624 break;
4625 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004626 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004627 encoding, reason, p, size, exceptionObject,
4628 collstartpos, collendpos, &newpos);
4629 if (repunicode == NULL)
4630 return -1;
4631 /* generate replacement */
4632 repsize = PyUnicode_GET_SIZE(repunicode);
4633 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4634 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004635 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004636 return -1;
4637 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004638 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004639 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004640 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4641 return -1;
4642 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004643 }
4644 *inpos = newpos;
4645 Py_DECREF(repunicode);
4646 }
4647 return 0;
4648}
4649
Guido van Rossumd57fd912000-03-10 22:53:23 +00004650PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004651 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004652 PyObject *mapping,
4653 const char *errors)
4654{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004655 /* output object */
4656 PyObject *res = NULL;
4657 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004658 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004659 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004660 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004661 PyObject *errorHandler = NULL;
4662 PyObject *exc = NULL;
4663 /* the following variable is used for caching string comparisons
4664 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4665 * 3=ignore, 4=xmlcharrefreplace */
4666 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004667
4668 /* Default to Latin-1 */
4669 if (mapping == NULL)
4670 return PyUnicode_EncodeLatin1(p, size, errors);
4671
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004672 /* allocate enough for a simple encoding without
4673 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004674 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004675 if (res == NULL)
4676 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004677 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004678 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004679
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004680 while (inpos<size) {
4681 /* try to encode it */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004682 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004683 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004684 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004685 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004686 if (charmap_encoding_error(p, size, &inpos, mapping,
4687 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004688 &known_errorHandler, &errorHandler, errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004689 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004690 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004691 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004692 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004693 else
4694 /* done with this character => adjust input position */
4695 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004696 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004697
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004698 /* Resize if we allocated to much */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004699 if (respos<PyString_GET_SIZE(res))
4700 _PyString_Resize(&res, respos);
4701
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004702 Py_XDECREF(exc);
4703 Py_XDECREF(errorHandler);
4704 return res;
4705
4706 onError:
4707 Py_XDECREF(res);
4708 Py_XDECREF(exc);
4709 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004710 return NULL;
4711}
4712
4713PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4714 PyObject *mapping)
4715{
4716 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4717 PyErr_BadArgument();
4718 return NULL;
4719 }
4720 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4721 PyUnicode_GET_SIZE(unicode),
4722 mapping,
4723 NULL);
4724}
4725
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004726/* create or adjust a UnicodeTranslateError */
4727static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004728 const Py_UNICODE *unicode, Py_ssize_t size,
4729 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004730 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004731{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004732 if (*exceptionObject == NULL) {
4733 *exceptionObject = PyUnicodeTranslateError_Create(
4734 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004735 }
4736 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004737 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4738 goto onError;
4739 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4740 goto onError;
4741 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4742 goto onError;
4743 return;
4744 onError:
4745 Py_DECREF(*exceptionObject);
4746 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004747 }
4748}
4749
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004750/* raises a UnicodeTranslateError */
4751static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004752 const Py_UNICODE *unicode, Py_ssize_t size,
4753 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004754 const char *reason)
4755{
4756 make_translate_exception(exceptionObject,
4757 unicode, size, startpos, endpos, reason);
4758 if (*exceptionObject != NULL)
4759 PyCodec_StrictErrors(*exceptionObject);
4760}
4761
4762/* error handling callback helper:
4763 build arguments, call the callback and check the arguments,
4764 put the result into newpos and return the replacement string, which
4765 has to be freed by the caller */
4766static PyObject *unicode_translate_call_errorhandler(const char *errors,
4767 PyObject **errorHandler,
4768 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004769 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4770 Py_ssize_t startpos, Py_ssize_t endpos,
4771 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004772{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004773 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004774
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004775 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004776 PyObject *restuple;
4777 PyObject *resunicode;
4778
4779 if (*errorHandler == NULL) {
4780 *errorHandler = PyCodec_LookupError(errors);
4781 if (*errorHandler == NULL)
4782 return NULL;
4783 }
4784
4785 make_translate_exception(exceptionObject,
4786 unicode, size, startpos, endpos, reason);
4787 if (*exceptionObject == NULL)
4788 return NULL;
4789
4790 restuple = PyObject_CallFunctionObjArgs(
4791 *errorHandler, *exceptionObject, NULL);
4792 if (restuple == NULL)
4793 return NULL;
4794 if (!PyTuple_Check(restuple)) {
4795 PyErr_Format(PyExc_TypeError, &argparse[4]);
4796 Py_DECREF(restuple);
4797 return NULL;
4798 }
4799 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004800 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004801 Py_DECREF(restuple);
4802 return NULL;
4803 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004804 if (i_newpos<0)
4805 *newpos = size+i_newpos;
4806 else
4807 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004808 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004809 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004810 Py_DECREF(restuple);
4811 return NULL;
4812 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004813 Py_INCREF(resunicode);
4814 Py_DECREF(restuple);
4815 return resunicode;
4816}
4817
4818/* Lookup the character ch in the mapping and put the result in result,
4819 which must be decrefed by the caller.
4820 Return 0 on success, -1 on error */
4821static
4822int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4823{
Christian Heimes217cfd12007-12-02 14:31:20 +00004824 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004825 PyObject *x;
4826
4827 if (w == NULL)
4828 return -1;
4829 x = PyObject_GetItem(mapping, w);
4830 Py_DECREF(w);
4831 if (x == NULL) {
4832 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4833 /* No mapping found means: use 1:1 mapping. */
4834 PyErr_Clear();
4835 *result = NULL;
4836 return 0;
4837 } else
4838 return -1;
4839 }
4840 else if (x == Py_None) {
4841 *result = x;
4842 return 0;
4843 }
Christian Heimes217cfd12007-12-02 14:31:20 +00004844 else if (PyLong_Check(x)) {
4845 long value = PyLong_AS_LONG(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004846 long max = PyUnicode_GetMax();
4847 if (value < 0 || value > max) {
4848 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00004849 "character mapping must be in range(0x%x)", max+1);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004850 Py_DECREF(x);
4851 return -1;
4852 }
4853 *result = x;
4854 return 0;
4855 }
4856 else if (PyUnicode_Check(x)) {
4857 *result = x;
4858 return 0;
4859 }
4860 else {
4861 /* wrong return value */
4862 PyErr_SetString(PyExc_TypeError,
4863 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004864 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004865 return -1;
4866 }
4867}
4868/* ensure that *outobj is at least requiredsize characters long,
4869if not reallocate and adjust various state variables.
4870Return 0 on success, -1 on error */
4871static
Walter Dörwald4894c302003-10-24 14:25:28 +00004872int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004873 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004874{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004875 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004876 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004877 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004878 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004879 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004880 if (requiredsize < 2 * oldsize)
4881 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004882 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004883 return -1;
4884 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004885 }
4886 return 0;
4887}
4888/* lookup the character, put the result in the output string and adjust
4889 various state variables. Return a new reference to the object that
4890 was put in the output buffer in *result, or Py_None, if the mapping was
4891 undefined (in which case no character was written).
4892 The called must decref result.
4893 Return 0 on success, -1 on error. */
4894static
Walter Dörwald4894c302003-10-24 14:25:28 +00004895int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004896 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004897 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004898{
Walter Dörwald4894c302003-10-24 14:25:28 +00004899 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004900 return -1;
4901 if (*res==NULL) {
4902 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004903 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004904 }
4905 else if (*res==Py_None)
4906 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00004907 else if (PyLong_Check(*res)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004908 /* no overflow check, because we know that the space is enough */
Christian Heimes217cfd12007-12-02 14:31:20 +00004909 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004910 }
4911 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004912 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004913 if (repsize==1) {
4914 /* no overflow check, because we know that the space is enough */
4915 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4916 }
4917 else if (repsize!=0) {
4918 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004919 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004920 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004921 repsize - 1;
4922 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004923 return -1;
4924 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4925 *outp += repsize;
4926 }
4927 }
4928 else
4929 return -1;
4930 return 0;
4931}
4932
4933PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004934 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004935 PyObject *mapping,
4936 const char *errors)
4937{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004938 /* output object */
4939 PyObject *res = NULL;
4940 /* pointers to the beginning and end+1 of input */
4941 const Py_UNICODE *startp = p;
4942 const Py_UNICODE *endp = p + size;
4943 /* pointer into the output */
4944 Py_UNICODE *str;
4945 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004946 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004947 char *reason = "character maps to <undefined>";
4948 PyObject *errorHandler = NULL;
4949 PyObject *exc = NULL;
4950 /* the following variable is used for caching string comparisons
4951 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4952 * 3=ignore, 4=xmlcharrefreplace */
4953 int known_errorHandler = -1;
4954
Guido van Rossumd57fd912000-03-10 22:53:23 +00004955 if (mapping == NULL) {
4956 PyErr_BadArgument();
4957 return NULL;
4958 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004959
4960 /* allocate enough for a simple 1:1 translation without
4961 replacements, if we need more, we'll resize */
4962 res = PyUnicode_FromUnicode(NULL, size);
4963 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004964 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004965 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004966 return res;
4967 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004968
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004969 while (p<endp) {
4970 /* try to encode it */
4971 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004972 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004973 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004974 goto onError;
4975 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004976 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004977 if (x!=Py_None) /* it worked => adjust input pointer */
4978 ++p;
4979 else { /* untranslatable character */
4980 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004981 Py_ssize_t repsize;
4982 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004983 Py_UNICODE *uni2;
4984 /* startpos for collecting untranslatable chars */
4985 const Py_UNICODE *collstart = p;
4986 const Py_UNICODE *collend = p+1;
4987 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004988
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004989 /* find all untranslatable characters */
4990 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004991 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004992 goto onError;
4993 Py_XDECREF(x);
4994 if (x!=Py_None)
4995 break;
4996 ++collend;
4997 }
4998 /* cache callback name lookup
4999 * (if not done yet, i.e. it's the first error) */
5000 if (known_errorHandler==-1) {
5001 if ((errors==NULL) || (!strcmp(errors, "strict")))
5002 known_errorHandler = 1;
5003 else if (!strcmp(errors, "replace"))
5004 known_errorHandler = 2;
5005 else if (!strcmp(errors, "ignore"))
5006 known_errorHandler = 3;
5007 else if (!strcmp(errors, "xmlcharrefreplace"))
5008 known_errorHandler = 4;
5009 else
5010 known_errorHandler = 0;
5011 }
5012 switch (known_errorHandler) {
5013 case 1: /* strict */
5014 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
5015 goto onError;
5016 case 2: /* replace */
5017 /* No need to check for space, this is a 1:1 replacement */
5018 for (coll = collstart; coll<collend; ++coll)
5019 *str++ = '?';
5020 /* fall through */
5021 case 3: /* ignore */
5022 p = collend;
5023 break;
5024 case 4: /* xmlcharrefreplace */
5025 /* generate replacement (temporarily (mis)uses p) */
5026 for (p = collstart; p < collend; ++p) {
5027 char buffer[2+29+1+1];
5028 char *cp;
5029 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00005030 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005031 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5032 goto onError;
5033 for (cp = buffer; *cp; ++cp)
5034 *str++ = *cp;
5035 }
5036 p = collend;
5037 break;
5038 default:
5039 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5040 reason, startp, size, &exc,
5041 collstart-startp, collend-startp, &newpos);
5042 if (repunicode == NULL)
5043 goto onError;
5044 /* generate replacement */
5045 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00005046 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005047 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5048 Py_DECREF(repunicode);
5049 goto onError;
5050 }
5051 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5052 *str++ = *uni2;
5053 p = startp + newpos;
5054 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005055 }
5056 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005057 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005058 /* Resize if we allocated to much */
5059 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005060 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00005061 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005062 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005063 }
5064 Py_XDECREF(exc);
5065 Py_XDECREF(errorHandler);
5066 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005067
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005068 onError:
5069 Py_XDECREF(res);
5070 Py_XDECREF(exc);
5071 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005072 return NULL;
5073}
5074
5075PyObject *PyUnicode_Translate(PyObject *str,
5076 PyObject *mapping,
5077 const char *errors)
5078{
5079 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005080
Guido van Rossumd57fd912000-03-10 22:53:23 +00005081 str = PyUnicode_FromObject(str);
5082 if (str == NULL)
5083 goto onError;
5084 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
5085 PyUnicode_GET_SIZE(str),
5086 mapping,
5087 errors);
5088 Py_DECREF(str);
5089 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005090
Guido van Rossumd57fd912000-03-10 22:53:23 +00005091 onError:
5092 Py_XDECREF(str);
5093 return NULL;
5094}
Tim Petersced69f82003-09-16 20:30:58 +00005095
Guido van Rossum9e896b32000-04-05 20:11:21 +00005096/* --- Decimal Encoder ---------------------------------------------------- */
5097
5098int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005099 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00005100 char *output,
5101 const char *errors)
5102{
5103 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005104 PyObject *errorHandler = NULL;
5105 PyObject *exc = NULL;
5106 const char *encoding = "decimal";
5107 const char *reason = "invalid decimal Unicode string";
5108 /* the following variable is used for caching string comparisons
5109 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5110 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005111
5112 if (output == NULL) {
5113 PyErr_BadArgument();
5114 return -1;
5115 }
5116
5117 p = s;
5118 end = s + length;
5119 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005120 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005121 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005122 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005123 Py_ssize_t repsize;
5124 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005125 Py_UNICODE *uni2;
5126 Py_UNICODE *collstart;
5127 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005128
Guido van Rossum9e896b32000-04-05 20:11:21 +00005129 if (Py_UNICODE_ISSPACE(ch)) {
5130 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005131 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005132 continue;
5133 }
5134 decimal = Py_UNICODE_TODECIMAL(ch);
5135 if (decimal >= 0) {
5136 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005137 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005138 continue;
5139 }
Guido van Rossumba477042000-04-06 18:18:10 +00005140 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00005141 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005142 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005143 continue;
5144 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005145 /* All other characters are considered unencodable */
5146 collstart = p;
5147 collend = p+1;
5148 while (collend < end) {
5149 if ((0 < *collend && *collend < 256) ||
5150 !Py_UNICODE_ISSPACE(*collend) ||
5151 Py_UNICODE_TODECIMAL(*collend))
5152 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005153 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005154 /* cache callback name lookup
5155 * (if not done yet, i.e. it's the first error) */
5156 if (known_errorHandler==-1) {
5157 if ((errors==NULL) || (!strcmp(errors, "strict")))
5158 known_errorHandler = 1;
5159 else if (!strcmp(errors, "replace"))
5160 known_errorHandler = 2;
5161 else if (!strcmp(errors, "ignore"))
5162 known_errorHandler = 3;
5163 else if (!strcmp(errors, "xmlcharrefreplace"))
5164 known_errorHandler = 4;
5165 else
5166 known_errorHandler = 0;
5167 }
5168 switch (known_errorHandler) {
5169 case 1: /* strict */
5170 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5171 goto onError;
5172 case 2: /* replace */
5173 for (p = collstart; p < collend; ++p)
5174 *output++ = '?';
5175 /* fall through */
5176 case 3: /* ignore */
5177 p = collend;
5178 break;
5179 case 4: /* xmlcharrefreplace */
5180 /* generate replacement (temporarily (mis)uses p) */
5181 for (p = collstart; p < collend; ++p)
5182 output += sprintf(output, "&#%d;", (int)*p);
5183 p = collend;
5184 break;
5185 default:
5186 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5187 encoding, reason, s, length, &exc,
5188 collstart-s, collend-s, &newpos);
5189 if (repunicode == NULL)
5190 goto onError;
5191 /* generate replacement */
5192 repsize = PyUnicode_GET_SIZE(repunicode);
5193 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5194 Py_UNICODE ch = *uni2;
5195 if (Py_UNICODE_ISSPACE(ch))
5196 *output++ = ' ';
5197 else {
5198 decimal = Py_UNICODE_TODECIMAL(ch);
5199 if (decimal >= 0)
5200 *output++ = '0' + decimal;
5201 else if (0 < ch && ch < 256)
5202 *output++ = (char)ch;
5203 else {
5204 Py_DECREF(repunicode);
5205 raise_encode_exception(&exc, encoding,
5206 s, length, collstart-s, collend-s, reason);
5207 goto onError;
5208 }
5209 }
5210 }
5211 p = s + newpos;
5212 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005213 }
5214 }
5215 /* 0-terminate the output string */
5216 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005217 Py_XDECREF(exc);
5218 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005219 return 0;
5220
5221 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005222 Py_XDECREF(exc);
5223 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005224 return -1;
5225}
5226
Guido van Rossumd57fd912000-03-10 22:53:23 +00005227/* --- Helpers ------------------------------------------------------------ */
5228
Eric Smith8c663262007-08-25 02:26:07 +00005229#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005230#include "stringlib/fastsearch.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005231#include "stringlib/count.h"
Christian Heimes9cd17752007-11-18 19:35:23 +00005232/* Include _ParseTupleFinds from find.h */
5233#define FROM_UNICODE
Thomas Wouters477c8d52006-05-27 19:21:47 +00005234#include "stringlib/find.h"
5235#include "stringlib/partition.h"
5236
/* Helper macro that normalizes the slice bounds held in the caller's
   local variables `start` and `end` against (obj)->length:
   a negative start is taken relative to the length and then floored
   at 0; an end beyond the length is cut back to the length, and a
   negative end is taken relative to the length and then floored at 0.
   Expands to plain statements, so use it as a statement of its own. */
#define FIX_START_END(obj)                      \
    if (start < 0) {                            \
        start += (obj)->length;                 \
        if (start < 0)                          \
            start = 0;                          \
    }                                           \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    if (end < 0) {                              \
        end += (obj)->length;                   \
        if (end < 0)                            \
            end = 0;                            \
    }
5249
Martin v. Löwis18e16552006-02-15 17:27:45 +00005250Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005251 PyObject *substr,
5252 Py_ssize_t start,
5253 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005254{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005255 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005256 PyUnicodeObject* str_obj;
5257 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005258
Thomas Wouters477c8d52006-05-27 19:21:47 +00005259 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5260 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005261 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005262 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5263 if (!sub_obj) {
5264 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005265 return -1;
5266 }
Tim Petersced69f82003-09-16 20:30:58 +00005267
Thomas Wouters477c8d52006-05-27 19:21:47 +00005268 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005269
Thomas Wouters477c8d52006-05-27 19:21:47 +00005270 result = stringlib_count(
5271 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5272 );
5273
5274 Py_DECREF(sub_obj);
5275 Py_DECREF(str_obj);
5276
Guido van Rossumd57fd912000-03-10 22:53:23 +00005277 return result;
5278}
5279
Martin v. Löwis18e16552006-02-15 17:27:45 +00005280Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005281 PyObject *sub,
5282 Py_ssize_t start,
5283 Py_ssize_t end,
5284 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005285{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005286 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005287
Guido van Rossumd57fd912000-03-10 22:53:23 +00005288 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005289 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005290 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005291 sub = PyUnicode_FromObject(sub);
5292 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00005293 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005294 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005295 }
Tim Petersced69f82003-09-16 20:30:58 +00005296
Thomas Wouters477c8d52006-05-27 19:21:47 +00005297 if (direction > 0)
5298 result = stringlib_find_slice(
5299 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5300 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5301 start, end
5302 );
5303 else
5304 result = stringlib_rfind_slice(
5305 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5306 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5307 start, end
5308 );
5309
Guido van Rossumd57fd912000-03-10 22:53:23 +00005310 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005311 Py_DECREF(sub);
5312
Guido van Rossumd57fd912000-03-10 22:53:23 +00005313 return result;
5314}
5315
Tim Petersced69f82003-09-16 20:30:58 +00005316static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005317int tailmatch(PyUnicodeObject *self,
5318 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005319 Py_ssize_t start,
5320 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005321 int direction)
5322{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005323 if (substring->length == 0)
5324 return 1;
5325
Thomas Wouters477c8d52006-05-27 19:21:47 +00005326 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005327
5328 end -= substring->length;
5329 if (end < start)
5330 return 0;
5331
5332 if (direction > 0) {
5333 if (Py_UNICODE_MATCH(self, end, substring))
5334 return 1;
5335 } else {
5336 if (Py_UNICODE_MATCH(self, start, substring))
5337 return 1;
5338 }
5339
5340 return 0;
5341}
5342
Martin v. Löwis18e16552006-02-15 17:27:45 +00005343Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005344 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005345 Py_ssize_t start,
5346 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005347 int direction)
5348{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005349 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005350
Guido van Rossumd57fd912000-03-10 22:53:23 +00005351 str = PyUnicode_FromObject(str);
5352 if (str == NULL)
5353 return -1;
5354 substr = PyUnicode_FromObject(substr);
5355 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005356 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005357 return -1;
5358 }
Tim Petersced69f82003-09-16 20:30:58 +00005359
Guido van Rossumd57fd912000-03-10 22:53:23 +00005360 result = tailmatch((PyUnicodeObject *)str,
5361 (PyUnicodeObject *)substr,
5362 start, end, direction);
5363 Py_DECREF(str);
5364 Py_DECREF(substr);
5365 return result;
5366}
5367
Guido van Rossumd57fd912000-03-10 22:53:23 +00005368/* Apply fixfct filter to the Unicode object self and return a
5369 reference to the modified object */
5370
Tim Petersced69f82003-09-16 20:30:58 +00005371static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005372PyObject *fixup(PyUnicodeObject *self,
5373 int (*fixfct)(PyUnicodeObject *s))
5374{
5375
5376 PyUnicodeObject *u;
5377
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005378 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005379 if (u == NULL)
5380 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005381
5382 Py_UNICODE_COPY(u->str, self->str, self->length);
5383
Tim Peters7a29bd52001-09-12 03:03:31 +00005384 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005385 /* fixfct should return TRUE if it modified the buffer. If
5386 FALSE, return a reference to the original buffer instead
5387 (to save space, not time) */
5388 Py_INCREF(self);
5389 Py_DECREF(u);
5390 return (PyObject*) self;
5391 }
5392 return (PyObject*) u;
5393}
5394
Tim Petersced69f82003-09-16 20:30:58 +00005395static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005396int fixupper(PyUnicodeObject *self)
5397{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005398 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005399 Py_UNICODE *s = self->str;
5400 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005401
Guido van Rossumd57fd912000-03-10 22:53:23 +00005402 while (len-- > 0) {
5403 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005404
Guido van Rossumd57fd912000-03-10 22:53:23 +00005405 ch = Py_UNICODE_TOUPPER(*s);
5406 if (ch != *s) {
5407 status = 1;
5408 *s = ch;
5409 }
5410 s++;
5411 }
5412
5413 return status;
5414}
5415
Tim Petersced69f82003-09-16 20:30:58 +00005416static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005417int fixlower(PyUnicodeObject *self)
5418{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005419 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005420 Py_UNICODE *s = self->str;
5421 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005422
Guido van Rossumd57fd912000-03-10 22:53:23 +00005423 while (len-- > 0) {
5424 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005425
Guido van Rossumd57fd912000-03-10 22:53:23 +00005426 ch = Py_UNICODE_TOLOWER(*s);
5427 if (ch != *s) {
5428 status = 1;
5429 *s = ch;
5430 }
5431 s++;
5432 }
5433
5434 return status;
5435}
5436
Tim Petersced69f82003-09-16 20:30:58 +00005437static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005438int fixswapcase(PyUnicodeObject *self)
5439{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005440 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005441 Py_UNICODE *s = self->str;
5442 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005443
Guido van Rossumd57fd912000-03-10 22:53:23 +00005444 while (len-- > 0) {
5445 if (Py_UNICODE_ISUPPER(*s)) {
5446 *s = Py_UNICODE_TOLOWER(*s);
5447 status = 1;
5448 } else if (Py_UNICODE_ISLOWER(*s)) {
5449 *s = Py_UNICODE_TOUPPER(*s);
5450 status = 1;
5451 }
5452 s++;
5453 }
5454
5455 return status;
5456}
5457
Tim Petersced69f82003-09-16 20:30:58 +00005458static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005459int fixcapitalize(PyUnicodeObject *self)
5460{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005461 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005462 Py_UNICODE *s = self->str;
5463 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005464
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005465 if (len == 0)
5466 return 0;
5467 if (Py_UNICODE_ISLOWER(*s)) {
5468 *s = Py_UNICODE_TOUPPER(*s);
5469 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005470 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005471 s++;
5472 while (--len > 0) {
5473 if (Py_UNICODE_ISUPPER(*s)) {
5474 *s = Py_UNICODE_TOLOWER(*s);
5475 status = 1;
5476 }
5477 s++;
5478 }
5479 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005480}
5481
5482static
5483int fixtitle(PyUnicodeObject *self)
5484{
5485 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5486 register Py_UNICODE *e;
5487 int previous_is_cased;
5488
5489 /* Shortcut for single character strings */
5490 if (PyUnicode_GET_SIZE(self) == 1) {
5491 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5492 if (*p != ch) {
5493 *p = ch;
5494 return 1;
5495 }
5496 else
5497 return 0;
5498 }
Tim Petersced69f82003-09-16 20:30:58 +00005499
Guido van Rossumd57fd912000-03-10 22:53:23 +00005500 e = p + PyUnicode_GET_SIZE(self);
5501 previous_is_cased = 0;
5502 for (; p < e; p++) {
5503 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005504
Guido van Rossumd57fd912000-03-10 22:53:23 +00005505 if (previous_is_cased)
5506 *p = Py_UNICODE_TOLOWER(ch);
5507 else
5508 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005509
5510 if (Py_UNICODE_ISLOWER(ch) ||
5511 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005512 Py_UNICODE_ISTITLE(ch))
5513 previous_is_cased = 1;
5514 else
5515 previous_is_cased = 0;
5516 }
5517 return 1;
5518}
5519
Tim Peters8ce9f162004-08-27 01:49:32 +00005520PyObject *
5521PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005522{
Tim Peters8ce9f162004-08-27 01:49:32 +00005523 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005524 const Py_UNICODE blank = ' ';
5525 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005526 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005527 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005528 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5529 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005530 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5531 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005532 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005533 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005534 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005535
Tim Peters05eba1f2004-08-27 21:32:02 +00005536 fseq = PySequence_Fast(seq, "");
5537 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005538 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005539 }
5540
Tim Peters91879ab2004-08-27 22:35:44 +00005541 /* Grrrr. A codec may be invoked to convert str objects to
5542 * Unicode, and so it's possible to call back into Python code
5543 * during PyUnicode_FromObject(), and so it's possible for a sick
5544 * codec to change the size of fseq (if seq is a list). Therefore
5545 * we have to keep refetching the size -- can't assume seqlen
5546 * is invariant.
5547 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005548 seqlen = PySequence_Fast_GET_SIZE(fseq);
5549 /* If empty sequence, return u"". */
5550 if (seqlen == 0) {
5551 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5552 goto Done;
5553 }
5554 /* If singleton sequence with an exact Unicode, return that. */
5555 if (seqlen == 1) {
5556 item = PySequence_Fast_GET_ITEM(fseq, 0);
5557 if (PyUnicode_CheckExact(item)) {
5558 Py_INCREF(item);
5559 res = (PyUnicodeObject *)item;
5560 goto Done;
5561 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005562 }
5563
Tim Peters05eba1f2004-08-27 21:32:02 +00005564 /* At least two items to join, or one that isn't exact Unicode. */
5565 if (seqlen > 1) {
5566 /* Set up sep and seplen -- they're needed. */
5567 if (separator == NULL) {
5568 sep = &blank;
5569 seplen = 1;
5570 }
5571 else {
5572 internal_separator = PyUnicode_FromObject(separator);
5573 if (internal_separator == NULL)
5574 goto onError;
5575 sep = PyUnicode_AS_UNICODE(internal_separator);
5576 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00005577 /* In case PyUnicode_FromObject() mutated seq. */
5578 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005579 }
5580 }
5581
5582 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005583 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005584 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005585 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005586 res_p = PyUnicode_AS_UNICODE(res);
5587 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005588
Tim Peters05eba1f2004-08-27 21:32:02 +00005589 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005590 Py_ssize_t itemlen;
5591 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005592
5593 item = PySequence_Fast_GET_ITEM(fseq, i);
5594 /* Convert item to Unicode. */
Guido van Rossum98297ee2007-11-06 21:34:58 +00005595 if (!PyUnicode_Check(item)) {
5596 PyErr_Format(PyExc_TypeError,
5597 "sequence item %zd: expected str instance,"
5598 " %.80s found",
Christian Heimes90aa7642007-12-19 02:45:37 +00005599 i, Py_TYPE(item)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005600 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005601 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005602 item = PyUnicode_FromObject(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005603 if (item == NULL)
5604 goto onError;
5605 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005606
Tim Peters91879ab2004-08-27 22:35:44 +00005607 /* In case PyUnicode_FromObject() mutated seq. */
5608 seqlen = PySequence_Fast_GET_SIZE(fseq);
5609
Tim Peters8ce9f162004-08-27 01:49:32 +00005610 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005611 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005612 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005613 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005614 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005615 if (i < seqlen - 1) {
5616 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005617 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00005618 goto Overflow;
5619 }
5620 if (new_res_used > res_alloc) {
5621 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00005622 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00005623 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005624 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005625 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005626 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005627 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005628 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005629 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005630 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005631 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005632 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005633
5634 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005635 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005636 res_p += itemlen;
5637 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005638 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005639 res_p += seplen;
5640 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005641 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005642 res_used = new_res_used;
5643 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005644
Tim Peters05eba1f2004-08-27 21:32:02 +00005645 /* Shrink res to match the used area; this probably can't fail,
5646 * but it's cheap to check.
5647 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005648 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005649 goto onError;
5650
5651 Done:
5652 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005653 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005654 return (PyObject *)res;
5655
Tim Peters8ce9f162004-08-27 01:49:32 +00005656 Overflow:
5657 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005658 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005659 Py_DECREF(item);
5660 /* fall through */
5661
Guido van Rossumd57fd912000-03-10 22:53:23 +00005662 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005663 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005664 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005665 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005666 return NULL;
5667}
5668
Tim Petersced69f82003-09-16 20:30:58 +00005669static
5670PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005671 Py_ssize_t left,
5672 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005673 Py_UNICODE fill)
5674{
5675 PyUnicodeObject *u;
5676
5677 if (left < 0)
5678 left = 0;
5679 if (right < 0)
5680 right = 0;
5681
Tim Peters7a29bd52001-09-12 03:03:31 +00005682 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005683 Py_INCREF(self);
5684 return self;
5685 }
5686
5687 u = _PyUnicode_New(left + self->length + right);
5688 if (u) {
5689 if (left)
5690 Py_UNICODE_FILL(u->str, fill, left);
5691 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5692 if (right)
5693 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5694 }
5695
5696 return u;
5697}
5698
/* Append data[left:right] to the list being built.

   Relies on names from the enclosing function: `str` (PyObject *
   scratch variable), `list` (the result list) and the label `onError`,
   which is jumped to if the substring cannot be created or appended.
   The temporary reference held in `str` is released in every case.
   Use as a statement: SPLIT_APPEND(buf, i, j); */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        int split_append_failed;                                        \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        split_append_failed = PyList_Append(list, str);                 \
        Py_DECREF(str);                                                 \
        if (split_append_failed)                                        \
            goto onError;                                               \
    } while (0)
5709
5710static
5711PyObject *split_whitespace(PyUnicodeObject *self,
5712 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005713 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005714{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005715 register Py_ssize_t i;
5716 register Py_ssize_t j;
5717 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005718 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005719 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005720
5721 for (i = j = 0; i < len; ) {
5722 /* find a token */
Christian Heimes190d79e2008-01-30 11:58:22 +00005723 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005724 i++;
5725 j = i;
Christian Heimes190d79e2008-01-30 11:58:22 +00005726 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005727 i++;
5728 if (j < i) {
5729 if (maxcount-- <= 0)
5730 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005731 SPLIT_APPEND(buf, j, i);
5732 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005733 i++;
5734 j = i;
5735 }
5736 }
5737 if (j < len) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005738 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005739 }
5740 return list;
5741
5742 onError:
5743 Py_DECREF(list);
5744 return NULL;
5745}
5746
5747PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005748 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005749{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005750 register Py_ssize_t i;
5751 register Py_ssize_t j;
5752 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005753 PyObject *list;
5754 PyObject *str;
5755 Py_UNICODE *data;
5756
5757 string = PyUnicode_FromObject(string);
5758 if (string == NULL)
5759 return NULL;
5760 data = PyUnicode_AS_UNICODE(string);
5761 len = PyUnicode_GET_SIZE(string);
5762
Guido van Rossumd57fd912000-03-10 22:53:23 +00005763 list = PyList_New(0);
5764 if (!list)
5765 goto onError;
5766
5767 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005768 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005769
Guido van Rossumd57fd912000-03-10 22:53:23 +00005770 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005771 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005772 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005773
5774 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005775 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005776 if (i < len) {
5777 if (data[i] == '\r' && i + 1 < len &&
5778 data[i+1] == '\n')
5779 i += 2;
5780 else
5781 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005782 if (keepends)
5783 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005784 }
Guido van Rossum86662912000-04-11 15:38:46 +00005785 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005786 j = i;
5787 }
5788 if (j < len) {
5789 SPLIT_APPEND(data, j, len);
5790 }
5791
5792 Py_DECREF(string);
5793 return list;
5794
5795 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005796 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005797 Py_DECREF(string);
5798 return NULL;
5799}
5800
Tim Petersced69f82003-09-16 20:30:58 +00005801static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005802PyObject *split_char(PyUnicodeObject *self,
5803 PyObject *list,
5804 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005805 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005806{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005807 register Py_ssize_t i;
5808 register Py_ssize_t j;
5809 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005810 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005811 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005812
5813 for (i = j = 0; i < len; ) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005814 if (buf[i] == ch) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005815 if (maxcount-- <= 0)
5816 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005817 SPLIT_APPEND(buf, j, i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005818 i = j = i + 1;
5819 } else
5820 i++;
5821 }
5822 if (j <= len) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005823 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005824 }
5825 return list;
5826
5827 onError:
5828 Py_DECREF(list);
5829 return NULL;
5830}
5831
Tim Petersced69f82003-09-16 20:30:58 +00005832static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005833PyObject *split_substring(PyUnicodeObject *self,
5834 PyObject *list,
5835 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005836 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005837{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005838 register Py_ssize_t i;
5839 register Py_ssize_t j;
5840 Py_ssize_t len = self->length;
5841 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005842 PyObject *str;
5843
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005844 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005845 if (Py_UNICODE_MATCH(self, i, substring)) {
5846 if (maxcount-- <= 0)
5847 break;
5848 SPLIT_APPEND(self->str, j, i);
5849 i = j = i + sublen;
5850 } else
5851 i++;
5852 }
5853 if (j <= len) {
5854 SPLIT_APPEND(self->str, j, len);
5855 }
5856 return list;
5857
5858 onError:
5859 Py_DECREF(list);
5860 return NULL;
5861}
5862
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005863static
5864PyObject *rsplit_whitespace(PyUnicodeObject *self,
5865 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005866 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005867{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005868 register Py_ssize_t i;
5869 register Py_ssize_t j;
5870 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005871 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005872 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005873
5874 for (i = j = len - 1; i >= 0; ) {
5875 /* find a token */
Christian Heimes190d79e2008-01-30 11:58:22 +00005876 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005877 i--;
5878 j = i;
Christian Heimes190d79e2008-01-30 11:58:22 +00005879 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005880 i--;
5881 if (j > i) {
5882 if (maxcount-- <= 0)
5883 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005884 SPLIT_APPEND(buf, i + 1, j + 1);
5885 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005886 i--;
5887 j = i;
5888 }
5889 }
5890 if (j >= 0) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005891 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005892 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005893 if (PyList_Reverse(list) < 0)
5894 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005895 return list;
5896
5897 onError:
5898 Py_DECREF(list);
5899 return NULL;
5900}
5901
5902static
5903PyObject *rsplit_char(PyUnicodeObject *self,
5904 PyObject *list,
5905 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005906 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005907{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005908 register Py_ssize_t i;
5909 register Py_ssize_t j;
5910 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005911 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005912 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005913
5914 for (i = j = len - 1; i >= 0; ) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005915 if (buf[i] == ch) {
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005916 if (maxcount-- <= 0)
5917 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005918 SPLIT_APPEND(buf, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005919 j = i = i - 1;
5920 } else
5921 i--;
5922 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005923 if (j >= -1) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005924 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005925 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005926 if (PyList_Reverse(list) < 0)
5927 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005928 return list;
5929
5930 onError:
5931 Py_DECREF(list);
5932 return NULL;
5933}
5934
5935static
5936PyObject *rsplit_substring(PyUnicodeObject *self,
5937 PyObject *list,
5938 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005939 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005940{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005941 register Py_ssize_t i;
5942 register Py_ssize_t j;
5943 Py_ssize_t len = self->length;
5944 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005945 PyObject *str;
5946
5947 for (i = len - sublen, j = len; i >= 0; ) {
5948 if (Py_UNICODE_MATCH(self, i, substring)) {
5949 if (maxcount-- <= 0)
5950 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005951 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005952 j = i;
5953 i -= sublen;
5954 } else
5955 i--;
5956 }
5957 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005958 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005959 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005960 if (PyList_Reverse(list) < 0)
5961 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005962 return list;
5963
5964 onError:
5965 Py_DECREF(list);
5966 return NULL;
5967}
5968
Guido van Rossumd57fd912000-03-10 22:53:23 +00005969#undef SPLIT_APPEND
5970
5971static
5972PyObject *split(PyUnicodeObject *self,
5973 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005974 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005975{
5976 PyObject *list;
5977
5978 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005979 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005980
5981 list = PyList_New(0);
5982 if (!list)
5983 return NULL;
5984
5985 if (substring == NULL)
5986 return split_whitespace(self,list,maxcount);
5987
5988 else if (substring->length == 1)
5989 return split_char(self,list,substring->str[0],maxcount);
5990
5991 else if (substring->length == 0) {
5992 Py_DECREF(list);
5993 PyErr_SetString(PyExc_ValueError, "empty separator");
5994 return NULL;
5995 }
5996 else
5997 return split_substring(self,list,substring,maxcount);
5998}
5999
Tim Petersced69f82003-09-16 20:30:58 +00006000static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006001PyObject *rsplit(PyUnicodeObject *self,
6002 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006003 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006004{
6005 PyObject *list;
6006
6007 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006008 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006009
6010 list = PyList_New(0);
6011 if (!list)
6012 return NULL;
6013
6014 if (substring == NULL)
6015 return rsplit_whitespace(self,list,maxcount);
6016
6017 else if (substring->length == 1)
6018 return rsplit_char(self,list,substring->str[0],maxcount);
6019
6020 else if (substring->length == 0) {
6021 Py_DECREF(list);
6022 PyErr_SetString(PyExc_ValueError, "empty separator");
6023 return NULL;
6024 }
6025 else
6026 return rsplit_substring(self,list,substring,maxcount);
6027}
6028
6029static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006030PyObject *replace(PyUnicodeObject *self,
6031 PyUnicodeObject *str1,
6032 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006033 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006034{
6035 PyUnicodeObject *u;
6036
6037 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006038 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006039
Thomas Wouters477c8d52006-05-27 19:21:47 +00006040 if (str1->length == str2->length) {
6041 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006042 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006043 if (str1->length == 1) {
6044 /* replace characters */
6045 Py_UNICODE u1, u2;
6046 if (!findchar(self->str, self->length, str1->str[0]))
6047 goto nothing;
6048 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6049 if (!u)
6050 return NULL;
6051 Py_UNICODE_COPY(u->str, self->str, self->length);
6052 u1 = str1->str[0];
6053 u2 = str2->str[0];
6054 for (i = 0; i < u->length; i++)
6055 if (u->str[i] == u1) {
6056 if (--maxcount < 0)
6057 break;
6058 u->str[i] = u2;
6059 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006060 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006061 i = fastsearch(
6062 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00006063 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006064 if (i < 0)
6065 goto nothing;
6066 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6067 if (!u)
6068 return NULL;
6069 Py_UNICODE_COPY(u->str, self->str, self->length);
6070 while (i <= self->length - str1->length)
6071 if (Py_UNICODE_MATCH(self, i, str1)) {
6072 if (--maxcount < 0)
6073 break;
6074 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6075 i += str1->length;
6076 } else
6077 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006078 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006079 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006080
6081 Py_ssize_t n, i, j, e;
6082 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006083 Py_UNICODE *p;
6084
6085 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006086 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006087 if (n > maxcount)
6088 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006089 if (n == 0)
6090 goto nothing;
6091 /* new_size = self->length + n * (str2->length - str1->length)); */
6092 delta = (str2->length - str1->length);
6093 if (delta == 0) {
6094 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006095 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006096 product = n * (str2->length - str1->length);
6097 if ((product / (str2->length - str1->length)) != n) {
6098 PyErr_SetString(PyExc_OverflowError,
6099 "replace string is too long");
6100 return NULL;
6101 }
6102 new_size = self->length + product;
6103 if (new_size < 0) {
6104 PyErr_SetString(PyExc_OverflowError,
6105 "replace string is too long");
6106 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006107 }
6108 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006109 u = _PyUnicode_New(new_size);
6110 if (!u)
6111 return NULL;
6112 i = 0;
6113 p = u->str;
6114 e = self->length - str1->length;
6115 if (str1->length > 0) {
6116 while (n-- > 0) {
6117 /* look for next match */
6118 j = i;
6119 while (j <= e) {
6120 if (Py_UNICODE_MATCH(self, j, str1))
6121 break;
6122 j++;
6123 }
6124 if (j > i) {
6125 if (j > e)
6126 break;
6127 /* copy unchanged part [i:j] */
6128 Py_UNICODE_COPY(p, self->str+i, j-i);
6129 p += j - i;
6130 }
6131 /* copy substitution string */
6132 if (str2->length > 0) {
6133 Py_UNICODE_COPY(p, str2->str, str2->length);
6134 p += str2->length;
6135 }
6136 i = j + str1->length;
6137 }
6138 if (i < self->length)
6139 /* copy tail [i:] */
6140 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6141 } else {
6142 /* interleave */
6143 while (n > 0) {
6144 Py_UNICODE_COPY(p, str2->str, str2->length);
6145 p += str2->length;
6146 if (--n <= 0)
6147 break;
6148 *p++ = self->str[i++];
6149 }
6150 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6151 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006152 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006153 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006154
6155nothing:
6156 /* nothing to replace; return original string (when possible) */
6157 if (PyUnicode_CheckExact(self)) {
6158 Py_INCREF(self);
6159 return (PyObject *) self;
6160 }
6161 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006162}
6163
6164/* --- Unicode Object Methods --------------------------------------------- */
6165
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006166PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006167"S.title() -> unicode\n\
6168\n\
6169Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006170characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006171
6172static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006173unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006174{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006175 return fixup(self, fixtitle);
6176}
6177
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006178PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006179"S.capitalize() -> unicode\n\
6180\n\
6181Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006182have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006183
6184static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006185unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006186{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006187 return fixup(self, fixcapitalize);
6188}
6189
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Compiled out by the surrounding #if 0.

   Splits self with split(self, NULL, -1), replaces every list item by the
   result of fixup(item, fixcapitalize), then joins the list again with
   PyUnicode_Join(NULL, list).  Returns NULL if any step fails. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined = NULL;
    Py_ssize_t pos;

    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        PyObject *old_word = PyList_GET_ITEM(words, pos);
        PyObject *new_word = fixup((PyUnicodeObject *)old_word,
                                   fixcapitalize);

        if (new_word == NULL)
            goto done;          /* joined is still NULL */
        /* Drop the list's reference to the old word before the slot is
           overwritten; PyList_SET_ITEM does not do that itself. */
        Py_DECREF(old_word);
        PyList_SET_ITEM(words, pos, new_word);
    }

    joined = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return joined;
}
#endif
6227
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006228/* Argument converter. Coerces to a single unicode character */
6229
6230static int
6231convert_uc(PyObject *obj, void *addr)
6232{
6233 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6234 PyObject *uniobj;
6235 Py_UNICODE *unistr;
6236
6237 uniobj = PyUnicode_FromObject(obj);
6238 if (uniobj == NULL) {
6239 PyErr_SetString(PyExc_TypeError,
6240 "The fill character cannot be converted to Unicode");
6241 return 0;
6242 }
6243 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6244 PyErr_SetString(PyExc_TypeError,
6245 "The fill character must be exactly one character long");
6246 Py_DECREF(uniobj);
6247 return 0;
6248 }
6249 unistr = PyUnicode_AS_UNICODE(uniobj);
6250 *fillcharloc = unistr[0];
6251 Py_DECREF(uniobj);
6252 return 1;
6253}
6254
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006255PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006256"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006257\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006258Return S centered in a Unicode string of length width. Padding is\n\
6259done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006260
6261static PyObject *
6262unicode_center(PyUnicodeObject *self, PyObject *args)
6263{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006264 Py_ssize_t marg, left;
6265 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006266 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006267
Thomas Woutersde017742006-02-16 19:34:37 +00006268 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006269 return NULL;
6270
Tim Peters7a29bd52001-09-12 03:03:31 +00006271 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006272 Py_INCREF(self);
6273 return (PyObject*) self;
6274 }
6275
6276 marg = width - self->length;
6277 left = marg / 2 + (marg & width & 1);
6278
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006279 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006280}
6281
Marc-André Lemburge5034372000-08-08 08:04:29 +00006282#if 0
6283
6284/* This code should go into some future Unicode collation support
6285 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006286 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006287
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006288/* speedy UTF-16 code point order comparison */
6289/* gleaned from: */
6290/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6291
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006292static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006293{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006294 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006295 0, 0, 0, 0, 0, 0, 0, 0,
6296 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006297 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006298};
6299
Guido van Rossumd57fd912000-03-10 22:53:23 +00006300static int
6301unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6302{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006303 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006304
Guido van Rossumd57fd912000-03-10 22:53:23 +00006305 Py_UNICODE *s1 = str1->str;
6306 Py_UNICODE *s2 = str2->str;
6307
6308 len1 = str1->length;
6309 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006310
Guido van Rossumd57fd912000-03-10 22:53:23 +00006311 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006312 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006313
6314 c1 = *s1++;
6315 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006316
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006317 if (c1 > (1<<11) * 26)
6318 c1 += utf16Fixup[c1>>11];
6319 if (c2 > (1<<11) * 26)
6320 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006321 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006322
6323 if (c1 != c2)
6324 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006325
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006326 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006327 }
6328
6329 return (len1 < len2) ? -1 : (len1 != len2);
6330}
6331
Marc-André Lemburge5034372000-08-08 08:04:29 +00006332#else
6333
6334static int
6335unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6336{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006337 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006338
6339 Py_UNICODE *s1 = str1->str;
6340 Py_UNICODE *s2 = str2->str;
6341
6342 len1 = str1->length;
6343 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006344
Marc-André Lemburge5034372000-08-08 08:04:29 +00006345 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006346 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006347
Fredrik Lundh45714e92001-06-26 16:39:36 +00006348 c1 = *s1++;
6349 c2 = *s2++;
6350
6351 if (c1 != c2)
6352 return (c1 < c2) ? -1 : 1;
6353
Marc-André Lemburge5034372000-08-08 08:04:29 +00006354 len1--; len2--;
6355 }
6356
6357 return (len1 < len2) ? -1 : (len1 != len2);
6358}
6359
6360#endif
6361
Guido van Rossumd57fd912000-03-10 22:53:23 +00006362int PyUnicode_Compare(PyObject *left,
6363 PyObject *right)
6364{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006365 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6366 return unicode_compare((PyUnicodeObject *)left,
6367 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006368 PyErr_Format(PyExc_TypeError,
6369 "Can't compare %.100s and %.100s",
6370 left->ob_type->tp_name,
6371 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006372 return -1;
6373}
6374
Martin v. Löwis5b222132007-06-10 09:51:05 +00006375int
6376PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6377{
6378 int i;
6379 Py_UNICODE *id;
6380 assert(PyUnicode_Check(uni));
6381 id = PyUnicode_AS_UNICODE(uni);
6382 /* Compare Unicode string and source character set string */
6383 for (i = 0; id[i] && str[i]; i++)
6384 if (id[i] != str[i])
6385 return ((int)id[i] < (int)str[i]) ? -1 : 1;
6386 if (id[i])
6387 return 1; /* uni is longer */
6388 if (str[i])
6389 return -1; /* str is longer */
6390 return 0;
6391}
6392
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006393PyObject *PyUnicode_RichCompare(PyObject *left,
6394 PyObject *right,
6395 int op)
6396{
6397 int result;
6398
6399 result = PyUnicode_Compare(left, right);
6400 if (result == -1 && PyErr_Occurred())
6401 goto onError;
6402
6403 /* Convert the return value to a Boolean */
6404 switch (op) {
6405 case Py_EQ:
6406 result = (result == 0);
6407 break;
6408 case Py_NE:
6409 result = (result != 0);
6410 break;
6411 case Py_LE:
6412 result = (result <= 0);
6413 break;
6414 case Py_GE:
6415 result = (result >= 0);
6416 break;
6417 case Py_LT:
6418 result = (result == -1);
6419 break;
6420 case Py_GT:
6421 result = (result == 1);
6422 break;
6423 }
6424 return PyBool_FromLong(result);
6425
6426 onError:
6427
6428 /* Standard case
6429
6430 Type errors mean that PyUnicode_FromObject() could not convert
6431 one of the arguments (usually the right hand side) to Unicode,
6432 ie. we can't handle the comparison request. However, it is
6433 possible that the other object knows a comparison method, which
6434 is why we return Py_NotImplemented to give the other object a
6435 chance.
6436
6437 */
6438 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6439 PyErr_Clear();
6440 Py_INCREF(Py_NotImplemented);
6441 return Py_NotImplemented;
6442 }
6443 if (op != Py_EQ && op != Py_NE)
6444 return NULL;
6445
6446 /* Equality comparison.
6447
6448 This is a special case: we silence any PyExc_UnicodeDecodeError
6449 and instead turn it into a PyErr_UnicodeWarning.
6450
6451 */
6452 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6453 return NULL;
6454 PyErr_Clear();
Skip Montanaro46fc3372007-08-12 11:44:53 +00006455 if (PyErr_WarnEx(PyExc_UnicodeWarning,
6456 (op == Py_EQ) ?
6457 "Unicode equal comparison "
6458 "failed to convert both arguments to Unicode - "
6459 "interpreting them as being unequal"
6460 :
6461 "Unicode unequal comparison "
6462 "failed to convert both arguments to Unicode - "
6463 "interpreting them as being unequal",
6464 1) < 0)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006465 return NULL;
6466 result = (op == Py_NE);
6467 return PyBool_FromLong(result);
6468}
6469
Guido van Rossum403d68b2000-03-13 15:55:09 +00006470int PyUnicode_Contains(PyObject *container,
6471 PyObject *element)
6472{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006473 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006474 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006475
6476 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006477 sub = PyUnicode_FromObject(element);
6478 if (!sub) {
Walter Dörwald26e0f512007-06-12 16:51:31 +00006479 PyErr_Format(PyExc_TypeError,
6480 "'in <string>' requires string as left operand, not %s",
6481 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006482 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006483 }
6484
Thomas Wouters477c8d52006-05-27 19:21:47 +00006485 str = PyUnicode_FromObject(container);
6486 if (!str) {
6487 Py_DECREF(sub);
6488 return -1;
6489 }
6490
6491 result = stringlib_contains_obj(str, sub);
6492
6493 Py_DECREF(str);
6494 Py_DECREF(sub);
6495
Guido van Rossum403d68b2000-03-13 15:55:09 +00006496 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006497}
6498
Guido van Rossumd57fd912000-03-10 22:53:23 +00006499/* Concat to string or Unicode object giving a new Unicode object. */
6500
6501PyObject *PyUnicode_Concat(PyObject *left,
6502 PyObject *right)
6503{
6504 PyUnicodeObject *u = NULL, *v = NULL, *w;
6505
6506 /* Coerce the two arguments */
6507 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6508 if (u == NULL)
6509 goto onError;
6510 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6511 if (v == NULL)
6512 goto onError;
6513
6514 /* Shortcuts */
6515 if (v == unicode_empty) {
6516 Py_DECREF(v);
6517 return (PyObject *)u;
6518 }
6519 if (u == unicode_empty) {
6520 Py_DECREF(u);
6521 return (PyObject *)v;
6522 }
6523
6524 /* Concat the two Unicode strings */
6525 w = _PyUnicode_New(u->length + v->length);
6526 if (w == NULL)
6527 goto onError;
6528 Py_UNICODE_COPY(w->str, u->str, u->length);
6529 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6530
6531 Py_DECREF(u);
6532 Py_DECREF(v);
6533 return (PyObject *)w;
6534
6535onError:
6536 Py_XDECREF(u);
6537 Py_XDECREF(v);
6538 return NULL;
6539}
6540
Walter Dörwald1ab83302007-05-18 17:15:44 +00006541void
6542PyUnicode_Append(PyObject **pleft, PyObject *right)
6543{
6544 PyObject *new;
6545 if (*pleft == NULL)
6546 return;
6547 if (right == NULL || !PyUnicode_Check(*pleft)) {
6548 Py_DECREF(*pleft);
6549 *pleft = NULL;
6550 return;
6551 }
6552 new = PyUnicode_Concat(*pleft, right);
6553 Py_DECREF(*pleft);
6554 *pleft = new;
6555}
6556
6557void
6558PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6559{
6560 PyUnicode_Append(pleft, right);
6561 Py_XDECREF(right);
6562}
6563
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006564PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006565"S.count(sub[, start[, end]]) -> int\n\
6566\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006567Return the number of non-overlapping occurrences of substring sub in\n\
6568Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006569interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006570
6571static PyObject *
6572unicode_count(PyUnicodeObject *self, PyObject *args)
6573{
6574 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006575 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006576 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006577 PyObject *result;
6578
Guido van Rossumb8872e62000-05-09 14:14:27 +00006579 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6580 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006581 return NULL;
6582
6583 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006584 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006585 if (substring == NULL)
6586 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006587
Thomas Wouters477c8d52006-05-27 19:21:47 +00006588 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006589
Christian Heimes217cfd12007-12-02 14:31:20 +00006590 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006591 stringlib_count(self->str + start, end - start,
6592 substring->str, substring->length)
6593 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006594
6595 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006596
Guido van Rossumd57fd912000-03-10 22:53:23 +00006597 return result;
6598}
6599
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006600PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006601"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006602\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006603Encodes S using the codec registered for encoding. encoding defaults\n\
6604to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006605handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006606a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6607'xmlcharrefreplace' as well as any other name registered with\n\
6608codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006609
6610static PyObject *
6611unicode_encode(PyUnicodeObject *self, PyObject *args)
6612{
6613 char *encoding = NULL;
6614 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006615 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00006616
Guido van Rossumd57fd912000-03-10 22:53:23 +00006617 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6618 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006619 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006620 if (v == NULL)
6621 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00006622 if (!PyString_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006623 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006624 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006625 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00006626 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006627 Py_DECREF(v);
6628 return NULL;
6629 }
6630 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006631
6632 onError:
6633 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006634}
6635
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006636PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006637"S.expandtabs([tabsize]) -> unicode\n\
6638\n\
6639Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006640If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006641
6642static PyObject*
6643unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6644{
6645 Py_UNICODE *e;
6646 Py_UNICODE *p;
6647 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006648 Py_UNICODE *qe;
6649 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006650 PyUnicodeObject *u;
6651 int tabsize = 8;
6652
6653 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6654 return NULL;
6655
Thomas Wouters7e474022000-07-16 12:04:32 +00006656 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006657 i = 0; /* chars up to and including most recent \n or \r */
6658 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6659 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006660 for (p = self->str; p < e; p++)
6661 if (*p == '\t') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006662 if (tabsize > 0) {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006663 incr = tabsize - (j % tabsize); /* cannot overflow */
6664 if (j > PY_SSIZE_T_MAX - incr)
6665 goto overflow1;
6666 j += incr;
6667 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006668 }
6669 else {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006670 if (j > PY_SSIZE_T_MAX - 1)
6671 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006672 j++;
6673 if (*p == '\n' || *p == '\r') {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006674 if (i > PY_SSIZE_T_MAX - j)
6675 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006676 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006677 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006678 }
6679 }
6680
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006681 if (i > PY_SSIZE_T_MAX - j)
6682 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006683
Guido van Rossumd57fd912000-03-10 22:53:23 +00006684 /* Second pass: create output string and fill it */
6685 u = _PyUnicode_New(i + j);
6686 if (!u)
6687 return NULL;
6688
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006689 j = 0; /* same as in first pass */
6690 q = u->str; /* next output char */
6691 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006692
6693 for (p = self->str; p < e; p++)
6694 if (*p == '\t') {
6695 if (tabsize > 0) {
6696 i = tabsize - (j % tabsize);
6697 j += i;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006698 while (i--) {
6699 if (q >= qe)
6700 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006701 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006702 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006703 }
6704 }
6705 else {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006706 if (q >= qe)
6707 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006708 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006709 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006710 if (*p == '\n' || *p == '\r')
6711 j = 0;
6712 }
6713
6714 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006715
6716 overflow2:
6717 Py_DECREF(u);
6718 overflow1:
6719 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6720 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006721}
6722
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006723PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006724"S.find(sub [,start [,end]]) -> int\n\
6725\n\
6726Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00006727such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006728arguments start and end are interpreted as in slice notation.\n\
6729\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006730Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006731
6732static PyObject *
6733unicode_find(PyUnicodeObject *self, PyObject *args)
6734{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006735 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00006736 Py_ssize_t start;
6737 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006738 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006739
Christian Heimes9cd17752007-11-18 19:35:23 +00006740 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006741 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006742
Thomas Wouters477c8d52006-05-27 19:21:47 +00006743 result = stringlib_find_slice(
6744 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6745 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6746 start, end
6747 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006748
6749 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006750
Christian Heimes217cfd12007-12-02 14:31:20 +00006751 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006752}
6753
6754static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006755unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006756{
6757 if (index < 0 || index >= self->length) {
6758 PyErr_SetString(PyExc_IndexError, "string index out of range");
6759 return NULL;
6760 }
6761
6762 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6763}
6764
Guido van Rossumc2504932007-09-18 19:42:40 +00006765/* Believe it or not, this produces the same value for ASCII strings
6766 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006767static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00006768unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006769{
Guido van Rossumc2504932007-09-18 19:42:40 +00006770 Py_ssize_t len;
6771 Py_UNICODE *p;
6772 long x;
6773
6774 if (self->hash != -1)
6775 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00006776 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00006777 p = self->str;
6778 x = *p << 7;
6779 while (--len >= 0)
6780 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00006781 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00006782 if (x == -1)
6783 x = -2;
6784 self->hash = x;
6785 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006786}
6787
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006788PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006789"S.index(sub [,start [,end]]) -> int\n\
6790\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006791Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006792
6793static PyObject *
6794unicode_index(PyUnicodeObject *self, PyObject *args)
6795{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006796 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006797 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00006798 Py_ssize_t start;
6799 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006800
Christian Heimes9cd17752007-11-18 19:35:23 +00006801 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006802 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006803
Thomas Wouters477c8d52006-05-27 19:21:47 +00006804 result = stringlib_find_slice(
6805 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6806 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6807 start, end
6808 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006809
6810 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006811
Guido van Rossumd57fd912000-03-10 22:53:23 +00006812 if (result < 0) {
6813 PyErr_SetString(PyExc_ValueError, "substring not found");
6814 return NULL;
6815 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006816
Christian Heimes217cfd12007-12-02 14:31:20 +00006817 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006818}
6819
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006820PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006821"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006822\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006823Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006824at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006825
6826static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006827unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006828{
6829 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6830 register const Py_UNICODE *e;
6831 int cased;
6832
Guido van Rossumd57fd912000-03-10 22:53:23 +00006833 /* Shortcut for single character strings */
6834 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006835 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006836
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006837 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006838 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006839 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006840
Guido van Rossumd57fd912000-03-10 22:53:23 +00006841 e = p + PyUnicode_GET_SIZE(self);
6842 cased = 0;
6843 for (; p < e; p++) {
6844 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006845
Guido van Rossumd57fd912000-03-10 22:53:23 +00006846 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006847 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006848 else if (!cased && Py_UNICODE_ISLOWER(ch))
6849 cased = 1;
6850 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006851 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006852}
6853
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006854PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006855"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006856\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006857Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006858at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006859
6860static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006861unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006862{
6863 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6864 register const Py_UNICODE *e;
6865 int cased;
6866
Guido van Rossumd57fd912000-03-10 22:53:23 +00006867 /* Shortcut for single character strings */
6868 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006869 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006870
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006871 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006872 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006873 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006874
Guido van Rossumd57fd912000-03-10 22:53:23 +00006875 e = p + PyUnicode_GET_SIZE(self);
6876 cased = 0;
6877 for (; p < e; p++) {
6878 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006879
Guido van Rossumd57fd912000-03-10 22:53:23 +00006880 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006881 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006882 else if (!cased && Py_UNICODE_ISUPPER(ch))
6883 cased = 1;
6884 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006885 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006886}
6887
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006888PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006889"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006890\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006891Return True if S is a titlecased string and there is at least one\n\
6892character in S, i.e. upper- and titlecase characters may only\n\
6893follow uncased characters and lowercase characters only cased ones.\n\
6894Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006895
6896static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006897unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006898{
6899 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6900 register const Py_UNICODE *e;
6901 int cased, previous_is_cased;
6902
Guido van Rossumd57fd912000-03-10 22:53:23 +00006903 /* Shortcut for single character strings */
6904 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006905 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6906 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006907
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006908 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006909 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006910 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006911
Guido van Rossumd57fd912000-03-10 22:53:23 +00006912 e = p + PyUnicode_GET_SIZE(self);
6913 cased = 0;
6914 previous_is_cased = 0;
6915 for (; p < e; p++) {
6916 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006917
Guido van Rossumd57fd912000-03-10 22:53:23 +00006918 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6919 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006920 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006921 previous_is_cased = 1;
6922 cased = 1;
6923 }
6924 else if (Py_UNICODE_ISLOWER(ch)) {
6925 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006926 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006927 previous_is_cased = 1;
6928 cased = 1;
6929 }
6930 else
6931 previous_is_cased = 0;
6932 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006933 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006934}
6935
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006936PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006937"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006938\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006939Return True if all characters in S are whitespace\n\
6940and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006941
6942static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006943unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006944{
6945 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6946 register const Py_UNICODE *e;
6947
Guido van Rossumd57fd912000-03-10 22:53:23 +00006948 /* Shortcut for single character strings */
6949 if (PyUnicode_GET_SIZE(self) == 1 &&
6950 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006951 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006952
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006953 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006954 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006955 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006956
Guido van Rossumd57fd912000-03-10 22:53:23 +00006957 e = p + PyUnicode_GET_SIZE(self);
6958 for (; p < e; p++) {
6959 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006960 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006961 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006962 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006963}
6964
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006965PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006966"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006967\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006968Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006969and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006970
6971static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006972unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006973{
6974 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6975 register const Py_UNICODE *e;
6976
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006977 /* Shortcut for single character strings */
6978 if (PyUnicode_GET_SIZE(self) == 1 &&
6979 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006980 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006981
6982 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006983 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006984 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006985
6986 e = p + PyUnicode_GET_SIZE(self);
6987 for (; p < e; p++) {
6988 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006989 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006990 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006991 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006992}
6993
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006994PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006995"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006996\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006997Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006998and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006999
7000static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007001unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007002{
7003 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7004 register const Py_UNICODE *e;
7005
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007006 /* Shortcut for single character strings */
7007 if (PyUnicode_GET_SIZE(self) == 1 &&
7008 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007009 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007010
7011 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007012 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007013 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007014
7015 e = p + PyUnicode_GET_SIZE(self);
7016 for (; p < e; p++) {
7017 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007018 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007019 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007020 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007021}
7022
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007023PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007024"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007025\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007026Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007027False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007028
7029static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007030unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007031{
7032 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7033 register const Py_UNICODE *e;
7034
Guido van Rossumd57fd912000-03-10 22:53:23 +00007035 /* Shortcut for single character strings */
7036 if (PyUnicode_GET_SIZE(self) == 1 &&
7037 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007038 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007039
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007040 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007041 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007042 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007043
Guido van Rossumd57fd912000-03-10 22:53:23 +00007044 e = p + PyUnicode_GET_SIZE(self);
7045 for (; p < e; p++) {
7046 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007047 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007048 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007049 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007050}
7051
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007052PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007053"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007054\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007055Return True if all characters in S are digits\n\
7056and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007057
7058static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007059unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007060{
7061 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7062 register const Py_UNICODE *e;
7063
Guido van Rossumd57fd912000-03-10 22:53:23 +00007064 /* Shortcut for single character strings */
7065 if (PyUnicode_GET_SIZE(self) == 1 &&
7066 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007067 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007068
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007069 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007070 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007071 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007072
Guido van Rossumd57fd912000-03-10 22:53:23 +00007073 e = p + PyUnicode_GET_SIZE(self);
7074 for (; p < e; p++) {
7075 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007076 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007077 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007078 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007079}
7080
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007081PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007082"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007083\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007084Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007085False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007086
7087static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007088unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007089{
7090 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7091 register const Py_UNICODE *e;
7092
Guido van Rossumd57fd912000-03-10 22:53:23 +00007093 /* Shortcut for single character strings */
7094 if (PyUnicode_GET_SIZE(self) == 1 &&
7095 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007096 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007097
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007098 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007099 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007100 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007101
Guido van Rossumd57fd912000-03-10 22:53:23 +00007102 e = p + PyUnicode_GET_SIZE(self);
7103 for (; p < e; p++) {
7104 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007105 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007106 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007107 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007108}
7109
Martin v. Löwis47383402007-08-15 07:32:56 +00007110int
7111PyUnicode_IsIdentifier(PyObject *self)
7112{
7113 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7114 register const Py_UNICODE *e;
7115
7116 /* Special case for empty strings */
7117 if (PyUnicode_GET_SIZE(self) == 0)
7118 return 0;
7119
7120 /* PEP 3131 says that the first character must be in
7121 XID_Start and subsequent characters in XID_Continue,
7122 and for the ASCII range, the 2.x rules apply (i.e
7123 start with letters and underscore, continue with
7124 letters, digits, underscore). However, given the current
7125 definition of XID_Start and XID_Continue, it is sufficient
7126 to check just for these, except that _ must be allowed
7127 as starting an identifier. */
7128 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7129 return 0;
7130
7131 e = p + PyUnicode_GET_SIZE(self);
7132 for (p++; p < e; p++) {
7133 if (!_PyUnicode_IsXidContinue(*p))
7134 return 0;
7135 }
7136 return 1;
7137}
7138
7139PyDoc_STRVAR(isidentifier__doc__,
7140"S.isidentifier() -> bool\n\
7141\n\
7142Return True if S is a valid identifier according\n\
7143to the language definition.");
7144
7145static PyObject*
7146unicode_isidentifier(PyObject *self)
7147{
7148 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7149}
7150
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007151PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007152"S.join(sequence) -> unicode\n\
7153\n\
7154Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007155sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007156
7157static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007158unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007159{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007160 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007161}
7162
Martin v. Löwis18e16552006-02-15 17:27:45 +00007163static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007164unicode_length(PyUnicodeObject *self)
7165{
7166 return self->length;
7167}
7168
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007169PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00007170"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007171\n\
7172Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007173done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007174
7175static PyObject *
7176unicode_ljust(PyUnicodeObject *self, PyObject *args)
7177{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007178 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007179 Py_UNICODE fillchar = ' ';
7180
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007181 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007182 return NULL;
7183
Tim Peters7a29bd52001-09-12 03:03:31 +00007184 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007185 Py_INCREF(self);
7186 return (PyObject*) self;
7187 }
7188
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007189 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007190}
7191
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007192PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007193"S.lower() -> unicode\n\
7194\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007195Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007196
7197static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007198unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007199{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007200 return fixup(self, fixlower);
7201}
7202
/* Strip modes accepted by the strip helpers. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the strip modes above */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: its format string without the leading
   three characters ("|O:"). */
#define STRIPNAME(i) (stripformat[i]+3)
7211
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007212/* externally visible for str.strip(unicode) */
7213PyObject *
7214_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7215{
7216 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007217 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007218 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007219 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7220 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007221
Thomas Wouters477c8d52006-05-27 19:21:47 +00007222 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7223
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007224 i = 0;
7225 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007226 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7227 i++;
7228 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007229 }
7230
7231 j = len;
7232 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007233 do {
7234 j--;
7235 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7236 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007237 }
7238
7239 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007240 Py_INCREF(self);
7241 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007242 }
7243 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00007244 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007245}
7246
Guido van Rossumd57fd912000-03-10 22:53:23 +00007247
7248static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007249do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007250{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007251 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007252 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007253
7254 i = 0;
7255 if (striptype != RIGHTSTRIP) {
7256 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7257 i++;
7258 }
7259 }
7260
7261 j = len;
7262 if (striptype != LEFTSTRIP) {
7263 do {
7264 j--;
7265 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7266 j++;
7267 }
7268
7269 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7270 Py_INCREF(self);
7271 return (PyObject*)self;
7272 }
7273 else
7274 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007275}
7276
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007277
7278static PyObject *
7279do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7280{
7281 PyObject *sep = NULL;
7282
7283 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7284 return NULL;
7285
7286 if (sep != NULL && sep != Py_None) {
7287 if (PyUnicode_Check(sep))
7288 return _PyUnicode_XStrip(self, striptype, sep);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007289 else {
7290 PyErr_Format(PyExc_TypeError,
7291 "%s arg must be None, unicode or str",
7292 STRIPNAME(striptype));
7293 return NULL;
7294 }
7295 }
7296
7297 return do_strip(self, striptype);
7298}
7299
7300
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007301PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007302"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007303\n\
7304Return a copy of the string S with leading and trailing\n\
7305whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007306If chars is given and not None, remove characters in chars instead.\n\
7307If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007308
7309static PyObject *
7310unicode_strip(PyUnicodeObject *self, PyObject *args)
7311{
7312 if (PyTuple_GET_SIZE(args) == 0)
7313 return do_strip(self, BOTHSTRIP); /* Common case */
7314 else
7315 return do_argstrip(self, BOTHSTRIP, args);
7316}
7317
7318
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007319PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007320"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007321\n\
7322Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007323If chars is given and not None, remove characters in chars instead.\n\
7324If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007325
7326static PyObject *
7327unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7328{
7329 if (PyTuple_GET_SIZE(args) == 0)
7330 return do_strip(self, LEFTSTRIP); /* Common case */
7331 else
7332 return do_argstrip(self, LEFTSTRIP, args);
7333}
7334
7335
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007336PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007337"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007338\n\
7339Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007340If chars is given and not None, remove characters in chars instead.\n\
7341If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007342
7343static PyObject *
7344unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7345{
7346 if (PyTuple_GET_SIZE(args) == 0)
7347 return do_strip(self, RIGHTSTRIP); /* Common case */
7348 else
7349 return do_argstrip(self, RIGHTSTRIP, args);
7350}
7351
7352
Guido van Rossumd57fd912000-03-10 22:53:23 +00007353static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007354unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007355{
7356 PyUnicodeObject *u;
7357 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007358 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007359 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007360
7361 if (len < 0)
7362 len = 0;
7363
Tim Peters7a29bd52001-09-12 03:03:31 +00007364 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007365 /* no repeat, return original string */
7366 Py_INCREF(str);
7367 return (PyObject*) str;
7368 }
Tim Peters8f422462000-09-09 06:13:41 +00007369
7370 /* ensure # of chars needed doesn't overflow int and # of bytes
7371 * needed doesn't overflow size_t
7372 */
7373 nchars = len * str->length;
7374 if (len && nchars / len != str->length) {
7375 PyErr_SetString(PyExc_OverflowError,
7376 "repeated string is too long");
7377 return NULL;
7378 }
7379 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7380 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7381 PyErr_SetString(PyExc_OverflowError,
7382 "repeated string is too long");
7383 return NULL;
7384 }
7385 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007386 if (!u)
7387 return NULL;
7388
7389 p = u->str;
7390
Thomas Wouters477c8d52006-05-27 19:21:47 +00007391 if (str->length == 1 && len > 0) {
7392 Py_UNICODE_FILL(p, str->str[0], len);
7393 } else {
7394 Py_ssize_t done = 0; /* number of characters copied this far */
7395 if (done < nchars) {
7396 Py_UNICODE_COPY(p, str->str, str->length);
7397 done = str->length;
7398 }
7399 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00007400 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007401 Py_UNICODE_COPY(p+done, p, n);
7402 done += n;
7403 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007404 }
7405
7406 return (PyObject*) u;
7407}
7408
7409PyObject *PyUnicode_Replace(PyObject *obj,
7410 PyObject *subobj,
7411 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007412 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007413{
7414 PyObject *self;
7415 PyObject *str1;
7416 PyObject *str2;
7417 PyObject *result;
7418
7419 self = PyUnicode_FromObject(obj);
7420 if (self == NULL)
7421 return NULL;
7422 str1 = PyUnicode_FromObject(subobj);
7423 if (str1 == NULL) {
7424 Py_DECREF(self);
7425 return NULL;
7426 }
7427 str2 = PyUnicode_FromObject(replobj);
7428 if (str2 == NULL) {
7429 Py_DECREF(self);
7430 Py_DECREF(str1);
7431 return NULL;
7432 }
Tim Petersced69f82003-09-16 20:30:58 +00007433 result = replace((PyUnicodeObject *)self,
7434 (PyUnicodeObject *)str1,
7435 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007436 maxcount);
7437 Py_DECREF(self);
7438 Py_DECREF(str1);
7439 Py_DECREF(str2);
7440 return result;
7441}
7442
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007443PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007444"S.replace (old, new[, maxsplit]) -> unicode\n\
7445\n\
7446Return a copy of S with all occurrences of substring\n\
7447old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007448given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007449
7450static PyObject*
7451unicode_replace(PyUnicodeObject *self, PyObject *args)
7452{
7453 PyUnicodeObject *str1;
7454 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007455 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007456 PyObject *result;
7457
Martin v. Löwis18e16552006-02-15 17:27:45 +00007458 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007459 return NULL;
7460 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7461 if (str1 == NULL)
7462 return NULL;
7463 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007464 if (str2 == NULL) {
7465 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007466 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007467 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007468
7469 result = replace(self, str1, str2, maxcount);
7470
7471 Py_DECREF(str1);
7472 Py_DECREF(str2);
7473 return result;
7474}
7475
7476static
7477PyObject *unicode_repr(PyObject *unicode)
7478{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007479 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007480 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007481 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7482 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7483
7484 /* XXX(nnorwitz): rather than over-allocating, it would be
7485 better to choose a different scheme. Perhaps scan the
7486 first N-chars of the string and allocate based on that size.
7487 */
7488 /* Initial allocation is based on the longest-possible unichr
7489 escape.
7490
7491 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7492 unichr, so in this case it's the longest unichr escape. In
7493 narrow (UTF-16) builds this is five chars per source unichr
7494 since there are two unichrs in the surrogate pair, so in narrow
7495 (UTF-16) builds it's not the longest unichr escape.
7496
7497 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7498 so in the narrow (UTF-16) build case it's the longest unichr
7499 escape.
7500 */
7501
Walter Dörwald1ab83302007-05-18 17:15:44 +00007502 repr = PyUnicode_FromUnicode(NULL,
Walter Dörwald79e913e2007-05-12 11:08:06 +00007503 2 /* quotes */
7504#ifdef Py_UNICODE_WIDE
7505 + 10*size
7506#else
7507 + 6*size
7508#endif
7509 + 1);
7510 if (repr == NULL)
7511 return NULL;
7512
Walter Dörwald1ab83302007-05-18 17:15:44 +00007513 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007514
7515 /* Add quote */
7516 *p++ = (findchar(s, size, '\'') &&
7517 !findchar(s, size, '"')) ? '"' : '\'';
7518 while (size-- > 0) {
7519 Py_UNICODE ch = *s++;
7520
7521 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007522 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007523 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007524 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007525 continue;
7526 }
7527
7528#ifdef Py_UNICODE_WIDE
7529 /* Map 21-bit characters to '\U00xxxxxx' */
7530 else if (ch >= 0x10000) {
7531 *p++ = '\\';
7532 *p++ = 'U';
7533 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
7534 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
7535 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
7536 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
7537 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
7538 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
7539 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
7540 *p++ = hexdigits[ch & 0x0000000F];
7541 continue;
7542 }
7543#else
7544 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
7545 else if (ch >= 0xD800 && ch < 0xDC00) {
7546 Py_UNICODE ch2;
7547 Py_UCS4 ucs;
7548
7549 ch2 = *s++;
7550 size--;
7551 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
7552 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
7553 *p++ = '\\';
7554 *p++ = 'U';
7555 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7556 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7557 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7558 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7559 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7560 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7561 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7562 *p++ = hexdigits[ucs & 0x0000000F];
7563 continue;
7564 }
7565 /* Fall through: isolated surrogates are copied as-is */
7566 s--;
7567 size++;
7568 }
7569#endif
7570
7571 /* Map 16-bit characters to '\uxxxx' */
7572 if (ch >= 256) {
7573 *p++ = '\\';
7574 *p++ = 'u';
7575 *p++ = hexdigits[(ch >> 12) & 0x000F];
7576 *p++ = hexdigits[(ch >> 8) & 0x000F];
7577 *p++ = hexdigits[(ch >> 4) & 0x000F];
7578 *p++ = hexdigits[ch & 0x000F];
7579 }
7580
7581 /* Map special whitespace to '\t', \n', '\r' */
7582 else if (ch == '\t') {
7583 *p++ = '\\';
7584 *p++ = 't';
7585 }
7586 else if (ch == '\n') {
7587 *p++ = '\\';
7588 *p++ = 'n';
7589 }
7590 else if (ch == '\r') {
7591 *p++ = '\\';
7592 *p++ = 'r';
7593 }
7594
7595 /* Map non-printable US ASCII to '\xhh' */
7596 else if (ch < ' ' || ch >= 0x7F) {
7597 *p++ = '\\';
7598 *p++ = 'x';
7599 *p++ = hexdigits[(ch >> 4) & 0x000F];
7600 *p++ = hexdigits[ch & 0x000F];
7601 }
7602
7603 /* Copy everything else as-is */
7604 else
7605 *p++ = (char) ch;
7606 }
7607 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007608 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00007609
7610 *p = '\0';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007611 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00007612 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007613}
7614
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007615PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007616"S.rfind(sub [,start [,end]]) -> int\n\
7617\n\
7618Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007619such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007620arguments start and end are interpreted as in slice notation.\n\
7621\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007622Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007623
7624static PyObject *
7625unicode_rfind(PyUnicodeObject *self, PyObject *args)
7626{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007627 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007628 Py_ssize_t start;
7629 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007630 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007631
Christian Heimes9cd17752007-11-18 19:35:23 +00007632 if (!_ParseTupleFinds(args, &substring, &start, &end))
7633 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007634
Thomas Wouters477c8d52006-05-27 19:21:47 +00007635 result = stringlib_rfind_slice(
7636 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7637 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7638 start, end
7639 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007640
7641 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007642
Christian Heimes217cfd12007-12-02 14:31:20 +00007643 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007644}
7645
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007646PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007647"S.rindex(sub [,start [,end]]) -> int\n\
7648\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007649Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007650
7651static PyObject *
7652unicode_rindex(PyUnicodeObject *self, PyObject *args)
7653{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007654 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007655 Py_ssize_t start;
7656 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007657 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007658
Christian Heimes9cd17752007-11-18 19:35:23 +00007659 if (!_ParseTupleFinds(args, &substring, &start, &end))
7660 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007661
Thomas Wouters477c8d52006-05-27 19:21:47 +00007662 result = stringlib_rfind_slice(
7663 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7664 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7665 start, end
7666 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007667
7668 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007669
Guido van Rossumd57fd912000-03-10 22:53:23 +00007670 if (result < 0) {
7671 PyErr_SetString(PyExc_ValueError, "substring not found");
7672 return NULL;
7673 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007674 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007675}
7676
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007677PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007678"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007679\n\
7680Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007681done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007682
7683static PyObject *
7684unicode_rjust(PyUnicodeObject *self, PyObject *args)
7685{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007686 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007687 Py_UNICODE fillchar = ' ';
7688
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007689 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007690 return NULL;
7691
Tim Peters7a29bd52001-09-12 03:03:31 +00007692 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007693 Py_INCREF(self);
7694 return (PyObject*) self;
7695 }
7696
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007697 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007698}
7699
Guido van Rossumd57fd912000-03-10 22:53:23 +00007700PyObject *PyUnicode_Split(PyObject *s,
7701 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007702 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007703{
7704 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007705
Guido van Rossumd57fd912000-03-10 22:53:23 +00007706 s = PyUnicode_FromObject(s);
7707 if (s == NULL)
7708 return NULL;
7709 if (sep != NULL) {
7710 sep = PyUnicode_FromObject(sep);
7711 if (sep == NULL) {
7712 Py_DECREF(s);
7713 return NULL;
7714 }
7715 }
7716
7717 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7718
7719 Py_DECREF(s);
7720 Py_XDECREF(sep);
7721 return result;
7722}
7723
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007724PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007725"S.split([sep [,maxsplit]]) -> list of strings\n\
7726\n\
7727Return a list of the words in S, using sep as the\n\
7728delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00007729splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00007730any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007731
7732static PyObject*
7733unicode_split(PyUnicodeObject *self, PyObject *args)
7734{
7735 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007736 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007737
Martin v. Löwis18e16552006-02-15 17:27:45 +00007738 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007739 return NULL;
7740
7741 if (substring == Py_None)
7742 return split(self, NULL, maxcount);
7743 else if (PyUnicode_Check(substring))
7744 return split(self, (PyUnicodeObject *)substring, maxcount);
7745 else
7746 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7747}
7748
Thomas Wouters477c8d52006-05-27 19:21:47 +00007749PyObject *
7750PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7751{
7752 PyObject* str_obj;
7753 PyObject* sep_obj;
7754 PyObject* out;
7755
7756 str_obj = PyUnicode_FromObject(str_in);
7757 if (!str_obj)
7758 return NULL;
7759 sep_obj = PyUnicode_FromObject(sep_in);
7760 if (!sep_obj) {
7761 Py_DECREF(str_obj);
7762 return NULL;
7763 }
7764
7765 out = stringlib_partition(
7766 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7767 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7768 );
7769
7770 Py_DECREF(sep_obj);
7771 Py_DECREF(str_obj);
7772
7773 return out;
7774}
7775
7776
7777PyObject *
7778PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7779{
7780 PyObject* str_obj;
7781 PyObject* sep_obj;
7782 PyObject* out;
7783
7784 str_obj = PyUnicode_FromObject(str_in);
7785 if (!str_obj)
7786 return NULL;
7787 sep_obj = PyUnicode_FromObject(sep_in);
7788 if (!sep_obj) {
7789 Py_DECREF(str_obj);
7790 return NULL;
7791 }
7792
7793 out = stringlib_rpartition(
7794 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7795 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7796 );
7797
7798 Py_DECREF(sep_obj);
7799 Py_DECREF(str_obj);
7800
7801 return out;
7802}
7803
7804PyDoc_STRVAR(partition__doc__,
7805"S.partition(sep) -> (head, sep, tail)\n\
7806\n\
7807Searches for the separator sep in S, and returns the part before it,\n\
7808the separator itself, and the part after it. If the separator is not\n\
7809found, returns S and two empty strings.");
7810
7811static PyObject*
7812unicode_partition(PyUnicodeObject *self, PyObject *separator)
7813{
7814 return PyUnicode_Partition((PyObject *)self, separator);
7815}
7816
7817PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007818"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007819\n\
7820Searches for the separator sep in S, starting at the end of S, and returns\n\
7821the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00007822separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007823
7824static PyObject*
7825unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7826{
7827 return PyUnicode_RPartition((PyObject *)self, separator);
7828}
7829
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007830PyObject *PyUnicode_RSplit(PyObject *s,
7831 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007832 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007833{
7834 PyObject *result;
7835
7836 s = PyUnicode_FromObject(s);
7837 if (s == NULL)
7838 return NULL;
7839 if (sep != NULL) {
7840 sep = PyUnicode_FromObject(sep);
7841 if (sep == NULL) {
7842 Py_DECREF(s);
7843 return NULL;
7844 }
7845 }
7846
7847 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7848
7849 Py_DECREF(s);
7850 Py_XDECREF(sep);
7851 return result;
7852}
7853
7854PyDoc_STRVAR(rsplit__doc__,
7855"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7856\n\
7857Return a list of the words in S, using sep as the\n\
7858delimiter string, starting at the end of the string and\n\
7859working to the front. If maxsplit is given, at most maxsplit\n\
7860splits are done. If sep is not specified, any whitespace string\n\
7861is a separator.");
7862
7863static PyObject*
7864unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7865{
7866 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007867 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007868
Martin v. Löwis18e16552006-02-15 17:27:45 +00007869 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007870 return NULL;
7871
7872 if (substring == Py_None)
7873 return rsplit(self, NULL, maxcount);
7874 else if (PyUnicode_Check(substring))
7875 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7876 else
7877 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7878}
7879
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007880PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007881"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007882\n\
7883Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007884Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007885is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007886
7887static PyObject*
7888unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7889{
Guido van Rossum86662912000-04-11 15:38:46 +00007890 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007891
Guido van Rossum86662912000-04-11 15:38:46 +00007892 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007893 return NULL;
7894
Guido van Rossum86662912000-04-11 15:38:46 +00007895 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007896}
7897
7898static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007899PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007900{
Walter Dörwald346737f2007-05-31 10:44:43 +00007901 if (PyUnicode_CheckExact(self)) {
7902 Py_INCREF(self);
7903 return self;
7904 } else
7905 /* Subtype -- return genuine unicode string with the same value. */
7906 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
7907 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007908}
7909
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007910PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007911"S.swapcase() -> unicode\n\
7912\n\
7913Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007914and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007915
7916static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007917unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007918{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007919 return fixup(self, fixswapcase);
7920}
7921
Georg Brandlceee0772007-11-27 23:48:05 +00007922PyDoc_STRVAR(maketrans__doc__,
7923"str.maketrans(x[, y[, z]]) -> dict (static method)\n\
7924\n\
7925Return a translation table usable for str.translate().\n\
7926If there is only one argument, it must be a dictionary mapping Unicode\n\
7927ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
7928Character keys will then be converted to ordinals.\n\
7929If there are two arguments, they must be strings of equal length, and\n\
7930in the resulting dictionary, each character in x will be mapped to the\n\
7931character at the same position in y. If there is a third argument, it\n\
7932must be a string, whose characters will be mapped to None in the result.");
7933
7934static PyObject*
7935unicode_maketrans(PyUnicodeObject *null, PyObject *args)
7936{
7937 PyObject *x, *y = NULL, *z = NULL;
7938 PyObject *new = NULL, *key, *value;
7939 Py_ssize_t i = 0;
7940 int res;
7941
7942 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
7943 return NULL;
7944 new = PyDict_New();
7945 if (!new)
7946 return NULL;
7947 if (y != NULL) {
7948 /* x must be a string too, of equal length */
7949 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
7950 if (!PyUnicode_Check(x)) {
7951 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
7952 "be a string if there is a second argument");
7953 goto err;
7954 }
7955 if (PyUnicode_GET_SIZE(x) != ylen) {
7956 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
7957 "arguments must have equal length");
7958 goto err;
7959 }
7960 /* create entries for translating chars in x to those in y */
7961 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00007962 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
7963 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00007964 if (!key || !value)
7965 goto err;
7966 res = PyDict_SetItem(new, key, value);
7967 Py_DECREF(key);
7968 Py_DECREF(value);
7969 if (res < 0)
7970 goto err;
7971 }
7972 /* create entries for deleting chars in z */
7973 if (z != NULL) {
7974 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00007975 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00007976 if (!key)
7977 goto err;
7978 res = PyDict_SetItem(new, key, Py_None);
7979 Py_DECREF(key);
7980 if (res < 0)
7981 goto err;
7982 }
7983 }
7984 } else {
7985 /* x must be a dict */
7986 if (!PyDict_Check(x)) {
7987 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
7988 "to maketrans it must be a dict");
7989 goto err;
7990 }
7991 /* copy entries into the new dict, converting string keys to int keys */
7992 while (PyDict_Next(x, &i, &key, &value)) {
7993 if (PyUnicode_Check(key)) {
7994 /* convert string keys to integer keys */
7995 PyObject *newkey;
7996 if (PyUnicode_GET_SIZE(key) != 1) {
7997 PyErr_SetString(PyExc_ValueError, "string keys in translate "
7998 "table must be of length 1");
7999 goto err;
8000 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008001 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008002 if (!newkey)
8003 goto err;
8004 res = PyDict_SetItem(new, newkey, value);
8005 Py_DECREF(newkey);
8006 if (res < 0)
8007 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008008 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008009 /* just keep integer keys */
8010 if (PyDict_SetItem(new, key, value) < 0)
8011 goto err;
8012 } else {
8013 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8014 "be strings or integers");
8015 goto err;
8016 }
8017 }
8018 }
8019 return new;
8020 err:
8021 Py_DECREF(new);
8022 return NULL;
8023}
8024
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008025PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008026"S.translate(table) -> unicode\n\
8027\n\
8028Return a copy of the string S, where all characters have been mapped\n\
8029through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008030Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
8031Unmapped characters are left untouched. Characters mapped to None\n\
8032are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008033
8034static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008035unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008036{
Georg Brandlceee0772007-11-27 23:48:05 +00008037 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008038}
8039
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008040PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008041"S.upper() -> unicode\n\
8042\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008043Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008044
8045static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008046unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008047{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008048 return fixup(self, fixupper);
8049}
8050
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008051PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008052"S.zfill(width) -> unicode\n\
8053\n\
8054Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008055of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008056
8057static PyObject *
8058unicode_zfill(PyUnicodeObject *self, PyObject *args)
8059{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008060 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008061 PyUnicodeObject *u;
8062
Martin v. Löwis18e16552006-02-15 17:27:45 +00008063 Py_ssize_t width;
8064 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008065 return NULL;
8066
8067 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008068 if (PyUnicode_CheckExact(self)) {
8069 Py_INCREF(self);
8070 return (PyObject*) self;
8071 }
8072 else
8073 return PyUnicode_FromUnicode(
8074 PyUnicode_AS_UNICODE(self),
8075 PyUnicode_GET_SIZE(self)
8076 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008077 }
8078
8079 fill = width - self->length;
8080
8081 u = pad(self, fill, 0, '0');
8082
Walter Dörwald068325e2002-04-15 13:36:47 +00008083 if (u == NULL)
8084 return NULL;
8085
Guido van Rossumd57fd912000-03-10 22:53:23 +00008086 if (u->str[fill] == '+' || u->str[fill] == '-') {
8087 /* move sign to beginning of string */
8088 u->str[0] = u->str[fill];
8089 u->str[fill] = '0';
8090 }
8091
8092 return (PyObject*) u;
8093}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008094
#if 0
/* Debugging helper, currently compiled out: returns the value of the
   numfree counter (defined elsewhere in this file) as an int object. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    PyObject *count = PyLong_FromLong(numfree);

    return count;
}
#endif
8102
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008103PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00008104"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008105\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008106Return True if S starts with the specified prefix, False otherwise.\n\
8107With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008108With optional end, stop comparing S at that position.\n\
8109prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008110
8111static PyObject *
8112unicode_startswith(PyUnicodeObject *self,
8113 PyObject *args)
8114{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008115 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008116 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008117 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008118 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008119 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008120
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008121 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00008122 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008123 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008124 if (PyTuple_Check(subobj)) {
8125 Py_ssize_t i;
8126 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8127 substring = (PyUnicodeObject *)PyUnicode_FromObject(
8128 PyTuple_GET_ITEM(subobj, i));
8129 if (substring == NULL)
8130 return NULL;
8131 result = tailmatch(self, substring, start, end, -1);
8132 Py_DECREF(substring);
8133 if (result) {
8134 Py_RETURN_TRUE;
8135 }
8136 }
8137 /* nothing matched */
8138 Py_RETURN_FALSE;
8139 }
8140 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008141 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008142 return NULL;
8143 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008144 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008145 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008146}
8147
8148
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008149PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00008150"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008151\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008152Return True if S ends with the specified suffix, False otherwise.\n\
8153With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008154With optional end, stop comparing S at that position.\n\
8155suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008156
8157static PyObject *
8158unicode_endswith(PyUnicodeObject *self,
8159 PyObject *args)
8160{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008161 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008162 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008163 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008164 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008165 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008166
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008167 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
8168 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008169 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008170 if (PyTuple_Check(subobj)) {
8171 Py_ssize_t i;
8172 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8173 substring = (PyUnicodeObject *)PyUnicode_FromObject(
8174 PyTuple_GET_ITEM(subobj, i));
8175 if (substring == NULL)
8176 return NULL;
8177 result = tailmatch(self, substring, start, end, +1);
8178 Py_DECREF(substring);
8179 if (result) {
8180 Py_RETURN_TRUE;
8181 }
8182 }
8183 Py_RETURN_FALSE;
8184 }
8185 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008186 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008187 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008188
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008189 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008190 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008191 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008192}
8193
Eric Smith8c663262007-08-25 02:26:07 +00008194#include "stringlib/string_format.h"
8195
8196PyDoc_STRVAR(format__doc__,
8197"S.format(*args, **kwargs) -> unicode\n\
8198\n\
8199");
8200
Eric Smith8c663262007-08-25 02:26:07 +00008201PyDoc_STRVAR(p_format__doc__,
8202"S.__format__(format_spec) -> unicode\n\
8203\n\
8204");
8205
8206static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008207unicode_getnewargs(PyUnicodeObject *v)
8208{
8209 return Py_BuildValue("(u#)", v->str, v->length);
8210}
8211
8212
Guido van Rossumd57fd912000-03-10 22:53:23 +00008213static PyMethodDef unicode_methods[] = {
8214
8215 /* Order is according to common usage: often used methods should
8216 appear first, since lookup is done sequentially. */
8217
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008218 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
8219 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8220 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008221 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008222 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8223 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8224 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8225 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8226 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8227 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8228 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008229 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008230 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8231 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8232 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008233 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008234 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8235 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8236 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008237 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008238 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008239 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008240 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008241 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8242 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8243 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8244 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8245 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8246 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8247 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8248 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8249 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8250 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8251 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8252 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8253 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8254 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008255 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008256 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008257 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
8258 {"__format__", (PyCFunction) unicode_unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008259 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8260 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Georg Brandlceee0772007-11-27 23:48:05 +00008261 {"maketrans", (PyCFunction) unicode_maketrans,
8262 METH_VARARGS | METH_STATIC, maketrans__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008263#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008264 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008265#endif
8266
8267#if 0
8268 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008269 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008270#endif
8271
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008272 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008273 {NULL, NULL}
8274};
8275
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008276static PyObject *
8277unicode_mod(PyObject *v, PyObject *w)
8278{
8279 if (!PyUnicode_Check(v)) {
8280 Py_INCREF(Py_NotImplemented);
8281 return Py_NotImplemented;
8282 }
8283 return PyUnicode_Format(v, w);
8284}
8285
8286static PyNumberMethods unicode_as_number = {
8287 0, /*nb_add*/
8288 0, /*nb_subtract*/
8289 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008290 unicode_mod, /*nb_remainder*/
8291};
8292
Guido van Rossumd57fd912000-03-10 22:53:23 +00008293static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008294 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008295 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008296 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8297 (ssizeargfunc) unicode_getitem, /* sq_item */
Thomas Woutersd2cf20e2007-08-30 22:57:53 +00008298 0, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008299 0, /* sq_ass_item */
8300 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008301 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008302};
8303
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008304static PyObject*
8305unicode_subscript(PyUnicodeObject* self, PyObject* item)
8306{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008307 if (PyIndex_Check(item)) {
8308 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008309 if (i == -1 && PyErr_Occurred())
8310 return NULL;
8311 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008312 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008313 return unicode_getitem(self, i);
8314 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008315 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008316 Py_UNICODE* source_buf;
8317 Py_UNICODE* result_buf;
8318 PyObject* result;
8319
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008320 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008321 &start, &stop, &step, &slicelength) < 0) {
8322 return NULL;
8323 }
8324
8325 if (slicelength <= 0) {
8326 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008327 } else if (start == 0 && step == 1 && slicelength == self->length &&
8328 PyUnicode_CheckExact(self)) {
8329 Py_INCREF(self);
8330 return (PyObject *)self;
8331 } else if (step == 1) {
8332 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008333 } else {
8334 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00008335 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8336 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008337
8338 if (result_buf == NULL)
8339 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008340
8341 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8342 result_buf[i] = source_buf[cur];
8343 }
Tim Petersced69f82003-09-16 20:30:58 +00008344
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008345 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00008346 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008347 return result;
8348 }
8349 } else {
8350 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8351 return NULL;
8352 }
8353}
8354
8355static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008356 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008357 (binaryfunc)unicode_subscript, /* mp_subscript */
8358 (objobjargproc)0, /* mp_ass_subscript */
8359};
8360
Guido van Rossumd57fd912000-03-10 22:53:23 +00008361
Guido van Rossumd57fd912000-03-10 22:53:23 +00008362/* Helpers for PyUnicode_Format() */
8363
8364static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008365getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008366{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008367 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008368 if (argidx < arglen) {
8369 (*p_argidx)++;
8370 if (arglen < 0)
8371 return args;
8372 else
8373 return PyTuple_GetItem(args, argidx);
8374 }
8375 PyErr_SetString(PyExc_TypeError,
8376 "not enough arguments for format string");
8377 return NULL;
8378}
8379
Martin v. Löwis18e16552006-02-15 17:27:45 +00008380static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008381strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008382{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008383 register Py_ssize_t i;
8384 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008385 for (i = len - 1; i >= 0; i--)
8386 buffer[i] = (Py_UNICODE) charbuffer[i];
8387
Guido van Rossumd57fd912000-03-10 22:53:23 +00008388 return len;
8389}
8390
Neal Norwitzfc76d632006-01-10 06:03:13 +00008391static int
8392doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8393{
Tim Peters15231542006-02-16 01:08:01 +00008394 Py_ssize_t result;
8395
Neal Norwitzfc76d632006-01-10 06:03:13 +00008396 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008397 result = strtounicode(buffer, (char *)buffer);
8398 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008399}
8400
#if 0
/* Disabled: counterpart of doubletounicode() for a C long.  The value is
   formatted as bytes into buffer's storage with PyOS_snprintf() and then
   widened in place; the character count is returned. */
static int
longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
{
    char *bytes = (char *)buffer;
    Py_ssize_t nchars;

    PyOS_snprintf(bytes, len, format, x);
    nchars = strtounicode(buffer, bytes);
    return Py_SAFE_DOWNCAST(nchars, Py_ssize_t, int);
}
#endif
Neal Norwitzfc76d632006-01-10 06:03:13 +00008412
Guido van Rossum078151d2002-08-11 04:24:12 +00008413/* XXX To save some code duplication, formatfloat/long/int could have been
8414 shared with stringobject.c, converting from 8-bit to Unicode after the
8415 formatting is done. */
8416
Guido van Rossumd57fd912000-03-10 22:53:23 +00008417static int
8418formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008419 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008420 int flags,
8421 int prec,
8422 int type,
8423 PyObject *v)
8424{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008425 /* fmt = '%#.' + `prec` + `type`
8426 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00008427 char fmt[20];
8428 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008429
Guido van Rossumd57fd912000-03-10 22:53:23 +00008430 x = PyFloat_AsDouble(v);
8431 if (x == -1.0 && PyErr_Occurred())
8432 return -1;
8433 if (prec < 0)
8434 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008435 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8436 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008437 /* Worst case length calc to ensure no buffer overrun:
8438
8439 'g' formats:
8440 fmt = %#.<prec>g
8441 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8442 for any double rep.)
8443 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8444
8445 'f' formats:
8446 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8447 len = 1 + 50 + 1 + prec = 52 + prec
8448
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008449 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008450 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008451
8452 */
Guido van Rossumb5a755e2007-07-18 18:15:48 +00008453 if (((type == 'g' || type == 'G') &&
8454 buflen <= (size_t)10 + (size_t)prec) ||
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008455 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008456 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008457 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008458 return -1;
8459 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008460 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8461 (flags&F_ALT) ? "#" : "",
8462 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008463 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008464}
8465
Tim Peters38fd5b62000-09-21 05:43:11 +00008466static PyObject*
8467formatlong(PyObject *val, int flags, int prec, int type)
8468{
8469 char *buf;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008470 int len;
Tim Peters38fd5b62000-09-21 05:43:11 +00008471 PyObject *str; /* temporary string object. */
Walter Dörwald63a28be2007-06-20 15:11:12 +00008472 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008473
8474 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8475 if (!str)
8476 return NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008477 result = PyUnicode_FromStringAndSize(buf, len);
Tim Peters38fd5b62000-09-21 05:43:11 +00008478 Py_DECREF(str);
Walter Dörwald63a28be2007-06-20 15:11:12 +00008479 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008480}
8481
#if 0
/* Disabled: format v as a C long into buf using a printf-style format
   built from flags, prec and type.  Returns the number of characters
   written, or -1 with an exception set. */
static int
formatint(Py_UNICODE *buf,
          size_t buflen,
          int flags,
          int prec,
          int type,
          PyObject *v)
{
    /* fmt = '%#.' + `prec` + 'l' + `type`
     * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
     * + 1 + 1
     * = 24
     */
    char fmt[64];       /* plenty big enough! */
    char *sign;
    long x;
    int has_radix;      /* conversion is one of x, X, o */

    x = PyLong_AsLong(v);
    if (x == -1 && PyErr_Occurred())
        return -1;

    /* A negative value under %u is printed as signed decimal. */
    if (x < 0 && type == 'u')
        type = 'd';
    has_radix = (type == 'x' || type == 'X' || type == 'o');

    /* For x/X/o the sign is emitted by us and the magnitude is printed. */
    sign = (x < 0 && has_radix) ? "-" : "";
    if (prec < 0)
        prec = 1;

    /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
     * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
     */
    if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
        PyErr_SetString(PyExc_OverflowError,
                        "formatted integer is too long (precision too large?)");
        return -1;
    }

    if ((flags & F_ALT) && has_radix) {
        /* When converting under %#o, %#x or %#X, there are a number
         * of issues that cause pain:
         * - for %#o, we want a different base marker than C
         * - when 0 is being converted, the C standard leaves off
         *   the '0x' or '0X', which is inconsistent with other
         *   %#x/%#X conversions and inconsistent with Python's
         *   hex() function
         * - there are platforms that violate the standard and
         *   convert 0 with the '0x' or '0X'
         *   (Metrowerks, Compaq Tru64)
         * - there are platforms that give '0x' when converting
         *   under %#X, but convert 0 in accordance with the
         *   standard (OS/2 EMX)
         *
         * We can achieve the desired consistency by inserting our
         * own '0x' or '0X' prefix, and substituting %x/%X in place
         * of %#x/%#X.
         *
         * Note that this is the same approach as used in
         * formatint() in stringobject.c
         */
        PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
                      sign, type, prec, type);
    }
    else {
        PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
                      sign, (flags&F_ALT) ? "#" : "",
                      prec, type);
    }

    return longtounicode(buf, buflen, fmt, sign[0] ? -x : x);
}
#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008559
8560static int
8561formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008562 size_t buflen,
8563 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008564{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008565 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008566 if (PyUnicode_Check(v)) {
8567 if (PyUnicode_GET_SIZE(v) != 1)
8568 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008569 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008570 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008571 else {
8572 /* Integer input truncated to a character */
8573 long x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008574 x = PyLong_AsLong(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008575 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008576 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008577#ifdef Py_UNICODE_WIDE
8578 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008579 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008580 "%c arg not in range(0x110000) "
8581 "(wide Python build)");
8582 return -1;
8583 }
8584#else
8585 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008586 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008587 "%c arg not in range(0x10000) "
8588 "(narrow Python build)");
8589 return -1;
8590 }
8591#endif
8592 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008593 }
8594 buf[1] = '\0';
8595 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008596
8597 onError:
8598 PyErr_SetString(PyExc_TypeError,
8599 "%c requires int or char");
8600 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008601}
8602
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is parenthesized so that the macro behaves as a single
   operand wherever it is used (e.g. `sizeof FORMATBUFLEN`).
*/
#define FORMATBUFLEN ((size_t)120)
8612
Guido van Rossumd57fd912000-03-10 22:53:23 +00008613PyObject *PyUnicode_Format(PyObject *format,
8614 PyObject *args)
8615{
8616 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008617 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008618 int args_owned = 0;
8619 PyUnicodeObject *result = NULL;
8620 PyObject *dict = NULL;
8621 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008622
Guido van Rossumd57fd912000-03-10 22:53:23 +00008623 if (format == NULL || args == NULL) {
8624 PyErr_BadInternalCall();
8625 return NULL;
8626 }
8627 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008628 if (uformat == NULL)
8629 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008630 fmt = PyUnicode_AS_UNICODE(uformat);
8631 fmtcnt = PyUnicode_GET_SIZE(uformat);
8632
8633 reslen = rescnt = fmtcnt + 100;
8634 result = _PyUnicode_New(reslen);
8635 if (result == NULL)
8636 goto onError;
8637 res = PyUnicode_AS_UNICODE(result);
8638
8639 if (PyTuple_Check(args)) {
8640 arglen = PyTuple_Size(args);
8641 argidx = 0;
8642 }
8643 else {
8644 arglen = -1;
8645 argidx = -2;
8646 }
Christian Heimes90aa7642007-12-19 02:45:37 +00008647 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00008648 !PyUnicode_Check(args))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008649 dict = args;
8650
8651 while (--fmtcnt >= 0) {
8652 if (*fmt != '%') {
8653 if (--rescnt < 0) {
8654 rescnt = fmtcnt + 100;
8655 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008656 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008657 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008658 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8659 --rescnt;
8660 }
8661 *res++ = *fmt++;
8662 }
8663 else {
8664 /* Got a format specifier */
8665 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008666 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008667 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008668 Py_UNICODE c = '\0';
8669 Py_UNICODE fill;
Christian Heimesa612dc02008-02-24 13:08:18 +00008670 int isnumok;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008671 PyObject *v = NULL;
8672 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008673 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008674 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008675 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008676 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008677
8678 fmt++;
8679 if (*fmt == '(') {
8680 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008681 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008682 PyObject *key;
8683 int pcount = 1;
8684
8685 if (dict == NULL) {
8686 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008687 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008688 goto onError;
8689 }
8690 ++fmt;
8691 --fmtcnt;
8692 keystart = fmt;
8693 /* Skip over balanced parentheses */
8694 while (pcount > 0 && --fmtcnt >= 0) {
8695 if (*fmt == ')')
8696 --pcount;
8697 else if (*fmt == '(')
8698 ++pcount;
8699 fmt++;
8700 }
8701 keylen = fmt - keystart - 1;
8702 if (fmtcnt < 0 || pcount > 0) {
8703 PyErr_SetString(PyExc_ValueError,
8704 "incomplete format key");
8705 goto onError;
8706 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008707#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008708 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008709 then looked up since Python uses strings to hold
8710 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008711 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008712 key = PyUnicode_EncodeUTF8(keystart,
8713 keylen,
8714 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008715#else
8716 key = PyUnicode_FromUnicode(keystart, keylen);
8717#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008718 if (key == NULL)
8719 goto onError;
8720 if (args_owned) {
8721 Py_DECREF(args);
8722 args_owned = 0;
8723 }
8724 args = PyObject_GetItem(dict, key);
8725 Py_DECREF(key);
8726 if (args == NULL) {
8727 goto onError;
8728 }
8729 args_owned = 1;
8730 arglen = -1;
8731 argidx = -2;
8732 }
8733 while (--fmtcnt >= 0) {
8734 switch (c = *fmt++) {
8735 case '-': flags |= F_LJUST; continue;
8736 case '+': flags |= F_SIGN; continue;
8737 case ' ': flags |= F_BLANK; continue;
8738 case '#': flags |= F_ALT; continue;
8739 case '0': flags |= F_ZERO; continue;
8740 }
8741 break;
8742 }
8743 if (c == '*') {
8744 v = getnextarg(args, arglen, &argidx);
8745 if (v == NULL)
8746 goto onError;
Christian Heimes217cfd12007-12-02 14:31:20 +00008747 if (!PyLong_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008748 PyErr_SetString(PyExc_TypeError,
8749 "* wants int");
8750 goto onError;
8751 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008752 width = PyLong_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008753 if (width == -1 && PyErr_Occurred())
8754 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008755 if (width < 0) {
8756 flags |= F_LJUST;
8757 width = -width;
8758 }
8759 if (--fmtcnt >= 0)
8760 c = *fmt++;
8761 }
8762 else if (c >= '0' && c <= '9') {
8763 width = c - '0';
8764 while (--fmtcnt >= 0) {
8765 c = *fmt++;
8766 if (c < '0' || c > '9')
8767 break;
8768 if ((width*10) / 10 != width) {
8769 PyErr_SetString(PyExc_ValueError,
8770 "width too big");
8771 goto onError;
8772 }
8773 width = width*10 + (c - '0');
8774 }
8775 }
8776 if (c == '.') {
8777 prec = 0;
8778 if (--fmtcnt >= 0)
8779 c = *fmt++;
8780 if (c == '*') {
8781 v = getnextarg(args, arglen, &argidx);
8782 if (v == NULL)
8783 goto onError;
Christian Heimes217cfd12007-12-02 14:31:20 +00008784 if (!PyLong_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008785 PyErr_SetString(PyExc_TypeError,
8786 "* wants int");
8787 goto onError;
8788 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008789 prec = PyLong_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008790 if (prec == -1 && PyErr_Occurred())
8791 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008792 if (prec < 0)
8793 prec = 0;
8794 if (--fmtcnt >= 0)
8795 c = *fmt++;
8796 }
8797 else if (c >= '0' && c <= '9') {
8798 prec = c - '0';
8799 while (--fmtcnt >= 0) {
8800 c = Py_CHARMASK(*fmt++);
8801 if (c < '0' || c > '9')
8802 break;
8803 if ((prec*10) / 10 != prec) {
8804 PyErr_SetString(PyExc_ValueError,
8805 "prec too big");
8806 goto onError;
8807 }
8808 prec = prec*10 + (c - '0');
8809 }
8810 }
8811 } /* prec */
8812 if (fmtcnt >= 0) {
8813 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008814 if (--fmtcnt >= 0)
8815 c = *fmt++;
8816 }
8817 }
8818 if (fmtcnt < 0) {
8819 PyErr_SetString(PyExc_ValueError,
8820 "incomplete format");
8821 goto onError;
8822 }
8823 if (c != '%') {
8824 v = getnextarg(args, arglen, &argidx);
8825 if (v == NULL)
8826 goto onError;
8827 }
8828 sign = 0;
8829 fill = ' ';
8830 switch (c) {
8831
8832 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008833 pbuf = formatbuf;
8834 /* presume that buffer length is at least 1 */
8835 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008836 len = 1;
8837 break;
8838
8839 case 's':
8840 case 'r':
8841 if (PyUnicode_Check(v) && c == 's') {
8842 temp = v;
8843 Py_INCREF(temp);
8844 }
8845 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008846 if (c == 's')
Thomas Heller519a0422007-11-15 20:48:54 +00008847 temp = PyObject_Str(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008848 else
8849 temp = PyObject_Repr(v);
8850 if (temp == NULL)
8851 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008852 if (PyUnicode_Check(temp))
8853 /* nothing to do */;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008854 else {
8855 Py_DECREF(temp);
8856 PyErr_SetString(PyExc_TypeError,
8857 "%s argument has non-string str()");
8858 goto onError;
8859 }
8860 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008861 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008862 len = PyUnicode_GET_SIZE(temp);
8863 if (prec >= 0 && len > prec)
8864 len = prec;
8865 break;
8866
8867 case 'i':
8868 case 'd':
8869 case 'u':
8870 case 'o':
8871 case 'x':
8872 case 'X':
8873 if (c == 'i')
8874 c = 'd';
Christian Heimesa612dc02008-02-24 13:08:18 +00008875 isnumok = 0;
8876 if (PyNumber_Check(v)) {
8877 PyObject *iobj=NULL;
8878
8879 if (PyLong_Check(v)) {
8880 iobj = v;
8881 Py_INCREF(iobj);
8882 }
8883 else {
8884 iobj = PyNumber_Long(v);
8885 }
8886 if (iobj!=NULL) {
8887 if (PyLong_Check(iobj)) {
8888 isnumok = 1;
8889 temp = formatlong(iobj, flags, prec, c);
8890 Py_DECREF(iobj);
8891 if (!temp)
8892 goto onError;
8893 pbuf = PyUnicode_AS_UNICODE(temp);
8894 len = PyUnicode_GET_SIZE(temp);
8895 sign = 1;
8896 }
8897 else {
8898 Py_DECREF(iobj);
8899 }
8900 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008901 }
Christian Heimesa612dc02008-02-24 13:08:18 +00008902 if (!isnumok) {
8903 PyErr_Format(PyExc_TypeError,
8904 "%%%c format: a number is required, "
Martin v. Löwis5a6f4582008-04-07 03:22:07 +00008905 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
Tim Peters38fd5b62000-09-21 05:43:11 +00008906 goto onError;
Tim Peters38fd5b62000-09-21 05:43:11 +00008907 }
8908 if (flags & F_ZERO)
8909 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008910 break;
8911
8912 case 'e':
8913 case 'E':
8914 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008915 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008916 case 'g':
8917 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008918 if (c == 'F')
8919 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008920 pbuf = formatbuf;
8921 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8922 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008923 if (len < 0)
8924 goto onError;
8925 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008926 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008927 fill = '0';
8928 break;
8929
8930 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008931 pbuf = formatbuf;
8932 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008933 if (len < 0)
8934 goto onError;
8935 break;
8936
8937 default:
8938 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008939 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00008940 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008941 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008942 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00008943 (Py_ssize_t)(fmt - 1 -
8944 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008945 goto onError;
8946 }
8947 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008948 if (*pbuf == '-' || *pbuf == '+') {
8949 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008950 len--;
8951 }
8952 else if (flags & F_SIGN)
8953 sign = '+';
8954 else if (flags & F_BLANK)
8955 sign = ' ';
8956 else
8957 sign = 0;
8958 }
8959 if (width < len)
8960 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008961 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008962 reslen -= rescnt;
8963 rescnt = width + fmtcnt + 100;
8964 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008965 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008966 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008967 PyErr_NoMemory();
8968 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008969 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008970 if (_PyUnicode_Resize(&result, reslen) < 0) {
8971 Py_XDECREF(temp);
8972 goto onError;
8973 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008974 res = PyUnicode_AS_UNICODE(result)
8975 + reslen - rescnt;
8976 }
8977 if (sign) {
8978 if (fill != ' ')
8979 *res++ = sign;
8980 rescnt--;
8981 if (width > len)
8982 width--;
8983 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008984 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008985 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008986 assert(pbuf[1] == c);
8987 if (fill != ' ') {
8988 *res++ = *pbuf++;
8989 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008990 }
Tim Petersfff53252001-04-12 18:38:48 +00008991 rescnt -= 2;
8992 width -= 2;
8993 if (width < 0)
8994 width = 0;
8995 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008996 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008997 if (width > len && !(flags & F_LJUST)) {
8998 do {
8999 --rescnt;
9000 *res++ = fill;
9001 } while (--width > len);
9002 }
Tim Peters38fd5b62000-09-21 05:43:11 +00009003 if (fill == ' ') {
9004 if (sign)
9005 *res++ = sign;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00009006 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00009007 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00009008 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00009009 *res++ = *pbuf++;
9010 *res++ = *pbuf++;
9011 }
9012 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009013 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009014 res += len;
9015 rescnt -= len;
9016 while (--width >= len) {
9017 --rescnt;
9018 *res++ = ' ';
9019 }
9020 if (dict && (argidx < arglen) && c != '%') {
9021 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00009022 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009023 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009024 goto onError;
9025 }
9026 Py_XDECREF(temp);
9027 } /* '%' */
9028 } /* until end */
9029 if (argidx < arglen && !dict) {
9030 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00009031 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009032 goto onError;
9033 }
9034
Thomas Woutersa96affe2006-03-12 00:29:36 +00009035 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
9036 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009037 if (args_owned) {
9038 Py_DECREF(args);
9039 }
9040 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009041 return (PyObject *)result;
9042
9043 onError:
9044 Py_XDECREF(result);
9045 Py_DECREF(uformat);
9046 if (args_owned) {
9047 Py_DECREF(args);
9048 }
9049 return NULL;
9050}
9051
Jeremy Hylton938ace62002-07-17 16:30:39 +00009052static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009053unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9054
Tim Peters6d6c1a32001-08-02 04:15:00 +00009055static PyObject *
9056unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9057{
9058 PyObject *x = NULL;
Guido van Rossum55b4a7b2007-07-11 09:28:11 +00009059 static char *kwlist[] = {"object", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00009060 char *encoding = NULL;
9061 char *errors = NULL;
9062
Guido van Rossume023fe02001-08-30 03:12:59 +00009063 if (type != &PyUnicode_Type)
9064 return unicode_subtype_new(type, args, kwds);
Alexandre Vassalotti999679a2008-05-03 04:42:16 +00009065 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Tim Peters6d6c1a32001-08-02 04:15:00 +00009066 kwlist, &x, &encoding, &errors))
9067 return NULL;
9068 if (x == NULL)
9069 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00009070 if (encoding == NULL && errors == NULL)
Thomas Heller519a0422007-11-15 20:48:54 +00009071 return PyObject_Str(x);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00009072 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00009073 return PyUnicode_FromEncodedObject(x, encoding, errors);
9074}
9075
Guido van Rossume023fe02001-08-30 03:12:59 +00009076static PyObject *
9077unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9078{
Tim Petersaf90b3e2001-09-12 05:18:58 +00009079 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009080 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009081
9082 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9083 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9084 if (tmp == NULL)
9085 return NULL;
9086 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00009087 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00009088 if (pnew == NULL) {
9089 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00009090 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00009091 }
Christian Heimesb186d002008-03-18 15:15:01 +00009092 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
Tim Petersaf90b3e2001-09-12 05:18:58 +00009093 if (pnew->str == NULL) {
9094 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00009095 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00009096 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00009097 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00009098 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00009099 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9100 pnew->length = n;
9101 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00009102 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00009103 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009104}
9105
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009106PyDoc_STRVAR(unicode_doc,
Collin Winterd474ce82007-08-07 19:42:11 +00009107"str(string [, encoding[, errors]]) -> object\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009108\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009109Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009110encoding defaults to the current default string encoding.\n\
9111errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009112
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009113static PyObject *unicode_iter(PyObject *seq);
9114
Guido van Rossumd57fd912000-03-10 22:53:23 +00009115PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009116 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossum84fc66d2007-05-03 17:18:26 +00009117 "str", /* tp_name */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009118 sizeof(PyUnicodeObject), /* tp_size */
9119 0, /* tp_itemsize */
9120 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00009121 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009122 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009123 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009124 0, /* tp_setattr */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009125 0, /* tp_compare */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009126 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009127 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009128 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009129 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009130 (hashfunc) unicode_hash, /* tp_hash*/
9131 0, /* tp_call*/
9132 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009133 PyObject_GenericGetAttr, /* tp_getattro */
9134 0, /* tp_setattro */
Alexandre Vassalotti70a23712007-10-14 02:05:51 +00009135 0, /* tp_as_buffer */
Thomas Wouters27d517b2007-02-25 20:39:11 +00009136 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
9137 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009138 unicode_doc, /* tp_doc */
9139 0, /* tp_traverse */
9140 0, /* tp_clear */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009141 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009142 0, /* tp_weaklistoffset */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009143 unicode_iter, /* tp_iter */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009144 0, /* tp_iternext */
9145 unicode_methods, /* tp_methods */
9146 0, /* tp_members */
9147 0, /* tp_getset */
Guido van Rossum3172c5d2007-10-16 18:12:55 +00009148 &PyBaseObject_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009149 0, /* tp_dict */
9150 0, /* tp_descr_get */
9151 0, /* tp_descr_set */
9152 0, /* tp_dictoffset */
9153 0, /* tp_init */
9154 0, /* tp_alloc */
9155 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00009156 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009157};
9158
9159/* Initialize the Unicode implementation */
9160
Thomas Wouters78890102000-07-22 19:25:51 +00009161void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009162{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009163 int i;
9164
Thomas Wouters477c8d52006-05-27 19:21:47 +00009165 /* XXX - move this array to unicodectype.c ? */
9166 Py_UNICODE linebreak[] = {
9167 0x000A, /* LINE FEED */
9168 0x000D, /* CARRIAGE RETURN */
9169 0x001C, /* FILE SEPARATOR */
9170 0x001D, /* GROUP SEPARATOR */
9171 0x001E, /* RECORD SEPARATOR */
9172 0x0085, /* NEXT LINE */
9173 0x2028, /* LINE SEPARATOR */
9174 0x2029, /* PARAGRAPH SEPARATOR */
9175 };
9176
Fred Drakee4315f52000-05-09 19:53:39 +00009177 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009178 free_list = NULL;
9179 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009180 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009181 if (!unicode_empty)
9182 return;
9183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009184 for (i = 0; i < 256; i++)
9185 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009186 if (PyType_Ready(&PyUnicode_Type) < 0)
9187 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009188
9189 /* initialize the linebreak bloom filter */
9190 bloom_linebreak = make_bloom_mask(
9191 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9192 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009193
9194 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009195}
9196
9197/* Finalize the Unicode implementation */
9198
Christian Heimesa156e092008-02-16 07:38:31 +00009199int
9200PyUnicode_ClearFreeList(void)
9201{
9202 int freelist_size = numfree;
9203 PyUnicodeObject *u;
9204
9205 for (u = free_list; u != NULL;) {
9206 PyUnicodeObject *v = u;
9207 u = *(PyUnicodeObject **)u;
9208 if (v->str)
Christian Heimesb186d002008-03-18 15:15:01 +00009209 PyObject_DEL(v->str);
Christian Heimesa156e092008-02-16 07:38:31 +00009210 Py_XDECREF(v->defenc);
9211 PyObject_Del(v);
9212 numfree--;
9213 }
9214 free_list = NULL;
9215 assert(numfree == 0);
9216 return freelist_size;
9217}
9218
Guido van Rossumd57fd912000-03-10 22:53:23 +00009219void
Thomas Wouters78890102000-07-22 19:25:51 +00009220_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009221{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009222 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009223
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009224 Py_XDECREF(unicode_empty);
9225 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009226
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009227 for (i = 0; i < 256; i++) {
9228 if (unicode_latin1[i]) {
9229 Py_DECREF(unicode_latin1[i]);
9230 unicode_latin1[i] = NULL;
9231 }
9232 }
Christian Heimesa156e092008-02-16 07:38:31 +00009233 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009234}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009235
Walter Dörwald16807132007-05-25 13:52:07 +00009236void
9237PyUnicode_InternInPlace(PyObject **p)
9238{
9239 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9240 PyObject *t;
9241 if (s == NULL || !PyUnicode_Check(s))
9242 Py_FatalError(
9243 "PyUnicode_InternInPlace: unicode strings only please!");
9244 /* If it's a subclass, we don't really know what putting
9245 it in the interned dict might do. */
9246 if (!PyUnicode_CheckExact(s))
9247 return;
9248 if (PyUnicode_CHECK_INTERNED(s))
9249 return;
9250 if (interned == NULL) {
9251 interned = PyDict_New();
9252 if (interned == NULL) {
9253 PyErr_Clear(); /* Don't leave an exception */
9254 return;
9255 }
9256 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009257 /* It might be that the GetItem call fails even
9258 though the key is present in the dictionary,
9259 namely when this happens during a stack overflow. */
9260 Py_ALLOW_RECURSION
Walter Dörwald16807132007-05-25 13:52:07 +00009261 t = PyDict_GetItem(interned, (PyObject *)s);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009262 Py_END_ALLOW_RECURSION
9263
Walter Dörwald16807132007-05-25 13:52:07 +00009264 if (t) {
9265 Py_INCREF(t);
9266 Py_DECREF(*p);
9267 *p = t;
9268 return;
9269 }
9270
Martin v. Löwis5b222132007-06-10 09:51:05 +00009271 PyThreadState_GET()->recursion_critical = 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009272 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9273 PyErr_Clear();
Martin v. Löwis5b222132007-06-10 09:51:05 +00009274 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009275 return;
9276 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009277 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009278 /* The two references in interned are not counted by refcnt.
9279 The deallocator will take care of this */
Christian Heimes90aa7642007-12-19 02:45:37 +00009280 Py_REFCNT(s) -= 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009281 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
9282}
9283
9284void
9285PyUnicode_InternImmortal(PyObject **p)
9286{
9287 PyUnicode_InternInPlace(p);
9288 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9289 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9290 Py_INCREF(*p);
9291 }
9292}
9293
9294PyObject *
9295PyUnicode_InternFromString(const char *cp)
9296{
9297 PyObject *s = PyUnicode_FromString(cp);
9298 if (s == NULL)
9299 return NULL;
9300 PyUnicode_InternInPlace(&s);
9301 return s;
9302}
9303
9304void _Py_ReleaseInternedUnicodeStrings(void)
9305{
9306 PyObject *keys;
9307 PyUnicodeObject *s;
9308 Py_ssize_t i, n;
9309 Py_ssize_t immortal_size = 0, mortal_size = 0;
9310
9311 if (interned == NULL || !PyDict_Check(interned))
9312 return;
9313 keys = PyDict_Keys(interned);
9314 if (keys == NULL || !PyList_Check(keys)) {
9315 PyErr_Clear();
9316 return;
9317 }
9318
9319 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9320 detector, interned unicode strings are not forcibly deallocated;
9321 rather, we give them their stolen references back, and then clear
9322 and DECREF the interned dict. */
9323
9324 n = PyList_GET_SIZE(keys);
9325 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
9326 n);
9327 for (i = 0; i < n; i++) {
9328 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9329 switch (s->state) {
9330 case SSTATE_NOT_INTERNED:
9331 /* XXX Shouldn't happen */
9332 break;
9333 case SSTATE_INTERNED_IMMORTAL:
Christian Heimes90aa7642007-12-19 02:45:37 +00009334 Py_REFCNT(s) += 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009335 immortal_size += s->length;
9336 break;
9337 case SSTATE_INTERNED_MORTAL:
Christian Heimes90aa7642007-12-19 02:45:37 +00009338 Py_REFCNT(s) += 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009339 mortal_size += s->length;
9340 break;
9341 default:
9342 Py_FatalError("Inconsistent interned string state.");
9343 }
9344 s->state = SSTATE_NOT_INTERNED;
9345 }
9346 fprintf(stderr, "total size of all interned strings: "
9347 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9348 "mortal/immortal\n", mortal_size, immortal_size);
9349 Py_DECREF(keys);
9350 PyDict_Clear(interned);
9351 Py_DECREF(interned);
9352 interned = NULL;
9353}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009354
9355
9356/********************* Unicode Iterator **************************/
9357
9358typedef struct {
9359 PyObject_HEAD
Guido van Rossum49d6b072006-08-17 21:11:47 +00009360 Py_ssize_t it_index;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009361 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
9362} unicodeiterobject;
9363
9364static void
9365unicodeiter_dealloc(unicodeiterobject *it)
9366{
9367 _PyObject_GC_UNTRACK(it);
9368 Py_XDECREF(it->it_seq);
9369 PyObject_GC_Del(it);
9370}
9371
9372static int
9373unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9374{
9375 Py_VISIT(it->it_seq);
9376 return 0;
9377}
9378
9379static PyObject *
9380unicodeiter_next(unicodeiterobject *it)
9381{
9382 PyUnicodeObject *seq;
9383 PyObject *item;
9384
9385 assert(it != NULL);
9386 seq = it->it_seq;
9387 if (seq == NULL)
9388 return NULL;
9389 assert(PyUnicode_Check(seq));
9390
9391 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009392 item = PyUnicode_FromUnicode(
9393 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009394 if (item != NULL)
9395 ++it->it_index;
9396 return item;
9397 }
9398
9399 Py_DECREF(seq);
9400 it->it_seq = NULL;
9401 return NULL;
9402}
9403
9404static PyObject *
9405unicodeiter_len(unicodeiterobject *it)
9406{
9407 Py_ssize_t len = 0;
9408 if (it->it_seq)
9409 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
Christian Heimes217cfd12007-12-02 14:31:20 +00009410 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009411}
9412
9413PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9414
9415static PyMethodDef unicodeiter_methods[] = {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009416 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
9417 length_hint_doc},
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009418 {NULL, NULL} /* sentinel */
9419};
9420
9421PyTypeObject PyUnicodeIter_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009422 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Christian Heimesf83be4e2007-11-28 09:44:38 +00009423 "str_iterator", /* tp_name */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009424 sizeof(unicodeiterobject), /* tp_basicsize */
9425 0, /* tp_itemsize */
9426 /* methods */
Guido van Rossum49d6b072006-08-17 21:11:47 +00009427 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009428 0, /* tp_print */
9429 0, /* tp_getattr */
9430 0, /* tp_setattr */
9431 0, /* tp_compare */
9432 0, /* tp_repr */
9433 0, /* tp_as_number */
9434 0, /* tp_as_sequence */
9435 0, /* tp_as_mapping */
9436 0, /* tp_hash */
9437 0, /* tp_call */
9438 0, /* tp_str */
9439 PyObject_GenericGetAttr, /* tp_getattro */
9440 0, /* tp_setattro */
9441 0, /* tp_as_buffer */
9442 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9443 0, /* tp_doc */
9444 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9445 0, /* tp_clear */
9446 0, /* tp_richcompare */
9447 0, /* tp_weaklistoffset */
9448 PyObject_SelfIter, /* tp_iter */
9449 (iternextfunc)unicodeiter_next, /* tp_iternext */
9450 unicodeiter_methods, /* tp_methods */
9451 0,
9452};
9453
9454static PyObject *
9455unicode_iter(PyObject *seq)
9456{
9457 unicodeiterobject *it;
9458
9459 if (!PyUnicode_Check(seq)) {
9460 PyErr_BadInternalCall();
9461 return NULL;
9462 }
9463 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9464 if (it == NULL)
9465 return NULL;
9466 it->it_index = 0;
9467 Py_INCREF(seq);
9468 it->it_seq = (PyUnicodeObject *)seq;
9469 _PyObject_GC_TRACK(it);
9470 return (PyObject *)it;
9471}
9472
Martin v. Löwis5b222132007-06-10 09:51:05 +00009473size_t
9474Py_UNICODE_strlen(const Py_UNICODE *u)
9475{
9476 int res = 0;
9477 while(*u++)
9478 res++;
9479 return res;
9480}
9481
9482Py_UNICODE*
9483Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9484{
9485 Py_UNICODE *u = s1;
9486 while ((*u++ = *s2++));
9487 return s1;
9488}
9489
9490Py_UNICODE*
9491Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9492{
9493 Py_UNICODE *u = s1;
9494 while ((*u++ = *s2++))
9495 if (n-- == 0)
9496 break;
9497 return s1;
9498}
9499
9500int
9501Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9502{
9503 while (*s1 && *s2 && *s1 == *s2)
9504 s1++, s2++;
9505 if (*s1 && *s2)
9506 return (*s1 < *s2) ? -1 : +1;
9507 if (*s1)
9508 return 1;
9509 if (*s2)
9510 return -1;
9511 return 0;
9512}
9513
9514Py_UNICODE*
9515Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9516{
9517 const Py_UNICODE *p;
9518 for (p = s; *p; p++)
9519 if (*p == c)
9520 return (Py_UNICODE*)p;
9521 return NULL;
9522}
9523
9524
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009525#ifdef __cplusplus
9526}
9527#endif
9528
9529
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009530/*
9531Local variables:
9532c-basic-offset: 4
9533indent-tabs-mode: nil
9534End:
9535*/