blob: 5bf0fa2885537642a601a9ac0006f811bcbe03e7 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000049#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000050#include <windows.h>
51#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000052
Guido van Rossumd57fd912000-03-10 22:53:23 +000053/* Limit for the Unicode object free list */
54
Christian Heimes2202f872008-02-06 14:31:34 +000055#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000056
/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list whose length (in Py_UNICODE units) is
   less than this limit. This reduces malloc() overhead for small
   Unicode objects.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) +
    KEEPALIVE_SIZE_LIMIT * sizeof(Py_UNICODE) +
    malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature! If you get core dumps when
   using Unicode objects, turn this feature off.

*/
73
Guido van Rossumfd4b9572000-04-10 13:51:10 +000074#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000075
76/* Endianness switches; defaults to little endian */
77
78#ifdef WORDS_BIGENDIAN
79# define BYTEORDER_IS_BIG_ENDIAN
80#else
81# define BYTEORDER_IS_LITTLE_ENDIAN
82#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
104static PyObject *interned;
105
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000107static PyUnicodeObject *free_list;
108static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000109
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000110/* The empty Unicode object is shared to improve performance. */
111static PyUnicodeObject *unicode_empty;
112
113/* Single character Unicode strings in the Latin-1 range are being
114 shared as well. */
115static PyUnicodeObject *unicode_latin1[256];
116
Fred Drakee4315f52000-05-09 19:53:39 +0000117/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000118 parameter; it is fixed to "utf-8". Always use the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000119 PyUnicode_GetDefaultEncoding() API to access this global.
120
Alexandre Vassalotti3d2fd7f2007-10-16 00:26:33 +0000121 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000122 hard coded default!
123*/
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000124static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000125
Christian Heimes190d79e2008-01-30 11:58:22 +0000126/* Fast detection of the most frequent whitespace characters */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: HORIZONTAL TABULATION (0x09), LINE FEED (0x0A),
       VERTICAL TABULATION (0x0B), FORM FEED (0x0C),
       CARRIAGE RETURN (0x0D) */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: FILE SEPARATOR (0x1C), GROUP SEPARATOR (0x1D),
       RECORD SEPARATOR (0x1E), UNIT SEPARATOR (0x1F) */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: SPACE (0x20) */
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
156
157/* Same for linebreaks */
static unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: LINE FEED (0x0A), CARRIAGE RETURN (0x0D) */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 0, 0, 1, 0, 0,
    /* 0x10-0x1F: FILE SEPARATOR (0x1C), GROUP SEPARATOR (0x1D),
       RECORD SEPARATOR (0x1E) */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x2F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
182
183
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000184Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000185PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000186{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000187#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000188 return 0x10FFFF;
189#else
190 /* This is actually an illegal character, so it should
191 not be passed to unichr. */
192 return 0xFFFF;
193#endif
194}
195
Thomas Wouters477c8d52006-05-27 19:21:47 +0000196/* --- Bloom Filters ----------------------------------------------------- */
197
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the 5 least
   significant bits of each Unicode character as the bit index. */

/* The linebreak mask is set up by _PyUnicode_Init() below. */
203
/* Integer type that holds a bloom bitmask (only the low 32 bits are used,
   because the bit index is taken from the low 5 bits of a character). */
#define BLOOM_MASK unsigned long

/* Bloom mask for line break characters; it is populated elsewhere in this
   file before BLOOM_LINEBREAK() is used. */
static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by the low 5 bits of `ch` is set in `mask`.
   The shifted constant is 1UL rather than 1: shifting a signed int left by
   31 is undefined behaviour and, where it "works", yields a negative value
   that sign-extends when widened to unsigned long.  `mask` is parenthesized
   so that an expression argument is not re-associated by `&`. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Non-zero if `ch` is a line break: table lookup for ASCII, bloom pre-filter
   followed by the exact Py_UNICODE_ISLINEBREAK() test for everything else. */
#define BLOOM_LINEBREAK(ch) \
    ((ch) < 128U ? ascii_linebreak[(ch)] : \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000213
214Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
215{
216 /* calculate simple bloom-style bitmask for a given unicode string */
217
218 long mask;
219 Py_ssize_t i;
220
221 mask = 0;
222 for (i = 0; i < len; i++)
223 mask |= (1 << (ptr[i] & 0x1F));
224
225 return mask;
226}
227
228Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
229{
230 Py_ssize_t i;
231
232 for (i = 0; i < setlen; i++)
233 if (set[i] == chr)
234 return 1;
235
236 return 0;
237}
238
/* Non-zero if `chr` is one of the `setlen` characters in `set`.  The bloom
   mask is consulted first so that most non-members are rejected without
   scanning the set.  The whole expansion is parenthesized so the macro can
   be combined safely with `!`, `||` and other operators at the use site. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
241
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242/* --- Unicode Object ----------------------------------------------------- */
243
244static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000245int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000246 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247{
248 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000249
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000250 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000251 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000252 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000254 /* Resizing shared object (unicode_empty or single character
255 objects) in-place is not allowed. Use PyUnicode_Resize()
256 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000257
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000258 if (unicode == unicode_empty ||
259 (unicode->length == 1 &&
260 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000261 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000263 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264 return -1;
265 }
266
Thomas Wouters477c8d52006-05-27 19:21:47 +0000267 /* We allocate one more byte to make sure the string is Ux0000 terminated.
268 The overallocation is also used by fastsearch, which assumes that it's
269 safe to look at str[length] (without making any assumptions about what
270 it contains). */
271
Guido van Rossumd57fd912000-03-10 22:53:23 +0000272 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000273 unicode->str = PyObject_REALLOC(unicode->str,
274 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000275 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000276 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 PyErr_NoMemory();
278 return -1;
279 }
280 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000281 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000282
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000283 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000285 if (unicode->defenc) {
286 Py_DECREF(unicode->defenc);
287 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 }
289 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000290
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291 return 0;
292}
293
/* We allocate one extra Py_UNICODE element to make sure the string is
   U+0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
302
303static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000304PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305{
306 register PyUnicodeObject *unicode;
307
Thomas Wouters477c8d52006-05-27 19:21:47 +0000308 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000309 if (length == 0 && unicode_empty != NULL) {
310 Py_INCREF(unicode_empty);
311 return unicode_empty;
312 }
313
314 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000315 if (free_list) {
316 unicode = free_list;
317 free_list = *(PyUnicodeObject **)unicode;
318 numfree--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000319 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000320 /* Keep-Alive optimization: we only upsize the buffer,
321 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000322 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000323 unicode_resize(unicode, length) < 0) {
Christian Heimesb186d002008-03-18 15:15:01 +0000324 PyObject_DEL(unicode->str);
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000325 unicode->str = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000326 }
327 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000328 else {
Christian Heimesb186d002008-03-18 15:15:01 +0000329 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
330 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000331 }
332 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000333 }
334 else {
Christian Heimesb186d002008-03-18 15:15:01 +0000335 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000336 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000337 if (unicode == NULL)
338 return NULL;
Christian Heimesb186d002008-03-18 15:15:01 +0000339 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
340 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000341 }
342
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000343 if (!unicode->str) {
344 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000345 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000346 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000347 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000348 * the caller fails before initializing str -- unicode_resize()
349 * reads str[0], and the Keep-Alive optimization can keep memory
350 * allocated for str alive across a call to unicode_dealloc(unicode).
351 * We don't want unicode_resize to read uninitialized memory in
352 * that case.
353 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000354 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000355 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000356 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000357 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000358 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000359 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000361
362 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000363 /* XXX UNREF/NEWREF interface should be more symmetrical */
364 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000365 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000366 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000367 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000368}
369
370static
Guido van Rossum9475a232001-10-05 20:51:39 +0000371void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000372{
Walter Dörwald16807132007-05-25 13:52:07 +0000373 switch (PyUnicode_CHECK_INTERNED(unicode)) {
374 case SSTATE_NOT_INTERNED:
375 break;
376
377 case SSTATE_INTERNED_MORTAL:
378 /* revive dead object temporarily for DelItem */
Christian Heimes90aa7642007-12-19 02:45:37 +0000379 Py_REFCNT(unicode) = 3;
Walter Dörwald16807132007-05-25 13:52:07 +0000380 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
381 Py_FatalError(
Benjamin Peterson142957c2008-07-04 19:55:29 +0000382 "deletion of interned string failed");
Walter Dörwald16807132007-05-25 13:52:07 +0000383 break;
384
385 case SSTATE_INTERNED_IMMORTAL:
Benjamin Peterson142957c2008-07-04 19:55:29 +0000386 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000387
388 default:
Benjamin Peterson142957c2008-07-04 19:55:29 +0000389 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000390 }
391
Guido van Rossum604ddf82001-12-06 20:03:56 +0000392 if (PyUnicode_CheckExact(unicode) &&
Christian Heimes2202f872008-02-06 14:31:34 +0000393 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000394 /* Keep-Alive optimization */
395 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Christian Heimesb186d002008-03-18 15:15:01 +0000396 PyObject_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000397 unicode->str = NULL;
398 unicode->length = 0;
399 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000400 if (unicode->defenc) {
401 Py_DECREF(unicode->defenc);
402 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000403 }
404 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000405 *(PyUnicodeObject **)unicode = free_list;
406 free_list = unicode;
407 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000408 }
409 else {
Christian Heimesb186d002008-03-18 15:15:01 +0000410 PyObject_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000411 Py_XDECREF(unicode->defenc);
Christian Heimes90aa7642007-12-19 02:45:37 +0000412 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000413 }
414}
415
Martin v. Löwis18e16552006-02-15 17:27:45 +0000416int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000417{
418 register PyUnicodeObject *v;
419
420 /* Argument checks */
421 if (unicode == NULL) {
422 PyErr_BadInternalCall();
423 return -1;
424 }
425 v = (PyUnicodeObject *)*unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000426 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000427 PyErr_BadInternalCall();
428 return -1;
429 }
430
431 /* Resizing unicode_empty and single character objects is not
432 possible since these are being shared. We simply return a fresh
433 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000434 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000435 (v == unicode_empty || v->length == 1)) {
436 PyUnicodeObject *w = _PyUnicode_New(length);
437 if (w == NULL)
438 return -1;
439 Py_UNICODE_COPY(w->str, v->str,
440 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000441 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000442 *unicode = (PyObject *)w;
443 return 0;
444 }
445
446 /* Note that we don't have to modify *unicode for unshared Unicode
447 objects, since we can modify them in-place. */
448 return unicode_resize(v, length);
449}
450
451/* Internal API for use in unicodeobject.c only ! */
452#define _PyUnicode_Resize(unicodevar, length) \
453 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
454
Guido van Rossumd57fd912000-03-10 22:53:23 +0000455PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000456 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000457{
458 PyUnicodeObject *unicode;
459
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000460 /* If the Unicode data is known at construction time, we can apply
461 some optimizations which share commonly used objects. */
462 if (u != NULL) {
463
464 /* Optimization for empty strings */
465 if (size == 0 && unicode_empty != NULL) {
466 Py_INCREF(unicode_empty);
467 return (PyObject *)unicode_empty;
468 }
469
470 /* Single character Unicode objects in the Latin-1 range are
471 shared when using this constructor */
472 if (size == 1 && *u < 256) {
473 unicode = unicode_latin1[*u];
474 if (!unicode) {
475 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000476 if (!unicode)
477 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000478 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000479 unicode_latin1[*u] = unicode;
480 }
481 Py_INCREF(unicode);
482 return (PyObject *)unicode;
483 }
484 }
Tim Petersced69f82003-09-16 20:30:58 +0000485
Guido van Rossumd57fd912000-03-10 22:53:23 +0000486 unicode = _PyUnicode_New(size);
487 if (!unicode)
488 return NULL;
489
490 /* Copy the Unicode data into the new object */
491 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000492 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000493
494 return (PyObject *)unicode;
495}
496
Walter Dörwaldd2034312007-05-18 16:29:38 +0000497PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000498{
499 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000500
501 if (size < 0) {
502 PyErr_SetString(PyExc_SystemError,
503 "Negative size passed to PyUnicode_FromStringAndSize");
504 return NULL;
505 }
506
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000507 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000508 some optimizations which share commonly used objects.
509 Also, this means the input must be UTF-8, so fall back to the
510 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000511 if (u != NULL) {
512
513 /* Optimization for empty strings */
514 if (size == 0 && unicode_empty != NULL) {
515 Py_INCREF(unicode_empty);
516 return (PyObject *)unicode_empty;
517 }
518
Martin v. Löwis9c121062007-08-05 20:26:11 +0000519 /* Single characters are shared when using this constructor.
520 Restrict to ASCII, since the input must be UTF-8. */
521 if (size == 1 && Py_CHARMASK(*u) < 128) {
Christian Heimesbbe741d2008-03-28 10:53:29 +0000522 unicode = unicode_latin1[Py_CHARMASK(*u)];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000523 if (!unicode) {
524 unicode = _PyUnicode_New(1);
525 if (!unicode)
526 return NULL;
Guido van Rossum00058aa2007-07-19 18:21:28 +0000527 unicode->str[0] = Py_CHARMASK(*u);
Christian Heimesbbe741d2008-03-28 10:53:29 +0000528 unicode_latin1[Py_CHARMASK(*u)] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000529 }
530 Py_INCREF(unicode);
531 return (PyObject *)unicode;
532 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000533
534 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000535 }
536
Walter Dörwald55507312007-05-18 13:12:10 +0000537 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000538 if (!unicode)
539 return NULL;
540
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000541 return (PyObject *)unicode;
542}
543
Walter Dörwaldd2034312007-05-18 16:29:38 +0000544PyObject *PyUnicode_FromString(const char *u)
545{
546 size_t size = strlen(u);
547 if (size > PY_SSIZE_T_MAX) {
548 PyErr_SetString(PyExc_OverflowError, "input too long");
549 return NULL;
550 }
551
552 return PyUnicode_FromStringAndSize(u, size);
553}
554
Guido van Rossumd57fd912000-03-10 22:53:23 +0000555#ifdef HAVE_WCHAR_H
556
557PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000558 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000559{
560 PyUnicodeObject *unicode;
561
562 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000563 if (size == 0)
564 return PyUnicode_FromStringAndSize(NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000565 PyErr_BadInternalCall();
566 return NULL;
567 }
568
Martin v. Löwis790465f2008-04-05 20:41:37 +0000569 if (size == -1) {
570 size = wcslen(w);
571 }
572
Guido van Rossumd57fd912000-03-10 22:53:23 +0000573 unicode = _PyUnicode_New(size);
574 if (!unicode)
575 return NULL;
576
577 /* Copy the wchar_t data into the new object */
578#ifdef HAVE_USABLE_WCHAR_T
579 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000580#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000581 {
582 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000583 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000584 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000585 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000586 *u++ = *w++;
587 }
588#endif
589
590 return (PyObject *)unicode;
591}
592
Walter Dörwald346737f2007-05-31 10:44:43 +0000593static void
594makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
595{
596 *fmt++ = '%';
597 if (width) {
598 if (zeropad)
599 *fmt++ = '0';
600 fmt += sprintf(fmt, "%d", width);
601 }
602 if (precision)
603 fmt += sprintf(fmt, ".%d", precision);
604 if (longflag)
605 *fmt++ = 'l';
606 else if (size_tflag) {
607 char *f = PY_FORMAT_SIZE_T;
608 while (*f)
609 *fmt++ = *f++;
610 }
611 *fmt++ = c;
612 *fmt = '\0';
613}
614
/* Append the NUL-terminated C string `string` to the output, one character
   per element, advancing the output cursor.  Relies on two variables at the
   expansion site: `copy` (scratch read pointer) and `s` (output cursor).
   The argument is parenthesized so that any expression can be passed. */
#define appendstring(string) {for (copy = (string); *copy;) *s++ = *copy++;}
616
617PyObject *
618PyUnicode_FromFormatV(const char *format, va_list vargs)
619{
620 va_list count;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000621 Py_ssize_t callcount = 0;
622 PyObject **callresults = NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +0000623 PyObject **callresult = NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000624 Py_ssize_t n = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000625 int width = 0;
626 int precision = 0;
627 int zeropad;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000628 const char* f;
629 Py_UNICODE *s;
630 PyObject *string;
631 /* used by sprintf */
632 char buffer[21];
Walter Dörwald346737f2007-05-31 10:44:43 +0000633 /* use abuffer instead of buffer, if we need more space
634 * (which can happen if there's a format specifier with width). */
635 char *abuffer = NULL;
636 char *realbuffer;
637 Py_ssize_t abuffersize = 0;
638 char fmt[60]; /* should be enough for %0width.precisionld */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000639 const char *copy;
640
641#ifdef VA_LIST_IS_ARRAY
642 Py_MEMCPY(count, vargs, sizeof(va_list));
643#else
644#ifdef __va_copy
645 __va_copy(count, vargs);
646#else
647 count = vargs;
648#endif
649#endif
Georg Brandl559e5d72008-06-11 18:37:52 +0000650 /* step 1: count the number of %S/%R/%A format specifications
651 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII() for
652 * these objects once during step 3 and put the result in
653 an array) */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000654 for (f = format; *f; f++) {
Georg Brandl559e5d72008-06-11 18:37:52 +0000655 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A'))
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000656 ++callcount;
657 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000658 /* step 2: allocate memory for the results of
Thomas Heller519a0422007-11-15 20:48:54 +0000659 * PyObject_Str()/PyObject_Repr() calls */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000660 if (callcount) {
Christian Heimesb186d002008-03-18 15:15:01 +0000661 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000662 if (!callresults) {
663 PyErr_NoMemory();
664 return NULL;
665 }
666 callresult = callresults;
667 }
668 /* step 3: figure out how large a buffer we need */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000669 for (f = format; *f; f++) {
670 if (*f == '%') {
671 const char* p = f;
Walter Dörwald346737f2007-05-31 10:44:43 +0000672 width = 0;
Christian Heimesfe337bf2008-03-23 21:54:12 +0000673 while (ISDIGIT((unsigned)*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000674 width = (width*10) + *f++ - '0';
Christian Heimesfe337bf2008-03-23 21:54:12 +0000675 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
Walter Dörwaldd2034312007-05-18 16:29:38 +0000676 ;
677
678 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
679 * they don't affect the amount of space we reserve.
680 */
681 if ((*f == 'l' || *f == 'z') &&
682 (f[1] == 'd' || f[1] == 'u'))
Eric Smithddd25822007-08-27 11:33:42 +0000683 ++f;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000684
685 switch (*f) {
686 case 'c':
687 (void)va_arg(count, int);
688 /* fall through... */
689 case '%':
690 n++;
691 break;
692 case 'd': case 'u': case 'i': case 'x':
693 (void) va_arg(count, int);
694 /* 20 bytes is enough to hold a 64-bit
695 integer. Decimal takes the most space.
Walter Dörwald346737f2007-05-31 10:44:43 +0000696 This isn't enough for octal.
697 If a width is specified we need more
698 (which we allocate later). */
699 if (width < 20)
700 width = 20;
701 n += width;
702 if (abuffersize < width)
703 abuffersize = width;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000704 break;
705 case 's':
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000706 {
707 /* UTF-8 */
708 unsigned char*s;
709 s = va_arg(count, unsigned char*);
710 while (*s) {
711 if (*s < 128) {
712 n++; s++;
713 } else if (*s < 0xc0) {
714 /* invalid UTF-8 */
715 n++; s++;
716 } else if (*s < 0xc0) {
717 n++;
718 s++; if(!*s)break;
719 s++;
720 } else if (*s < 0xe0) {
721 n++;
722 s++; if(!*s)break;
723 s++; if(!*s)break;
724 s++;
725 } else {
726 #ifdef Py_UNICODE_WIDE
727 n++;
728 #else
729 n+=2;
730 #endif
731 s++; if(!*s)break;
732 s++; if(!*s)break;
733 s++; if(!*s)break;
734 s++;
735 }
736 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000737 break;
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000738 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000739 case 'U':
740 {
741 PyObject *obj = va_arg(count, PyObject *);
742 assert(obj && PyUnicode_Check(obj));
743 n += PyUnicode_GET_SIZE(obj);
744 break;
745 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000746 case 'V':
747 {
748 PyObject *obj = va_arg(count, PyObject *);
749 const char *str = va_arg(count, const char *);
750 assert(obj || str);
751 assert(!obj || PyUnicode_Check(obj));
752 if (obj)
753 n += PyUnicode_GET_SIZE(obj);
754 else
755 n += strlen(str);
756 break;
757 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000758 case 'S':
759 {
760 PyObject *obj = va_arg(count, PyObject *);
761 PyObject *str;
762 assert(obj);
Thomas Heller519a0422007-11-15 20:48:54 +0000763 str = PyObject_Str(obj);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000764 if (!str)
765 goto fail;
766 n += PyUnicode_GET_SIZE(str);
767 /* Remember the str and switch to the next slot */
768 *callresult++ = str;
769 break;
770 }
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000771 case 'R':
772 {
773 PyObject *obj = va_arg(count, PyObject *);
774 PyObject *repr;
775 assert(obj);
776 repr = PyObject_Repr(obj);
777 if (!repr)
778 goto fail;
779 n += PyUnicode_GET_SIZE(repr);
780 /* Remember the repr and switch to the next slot */
781 *callresult++ = repr;
782 break;
783 }
Georg Brandl559e5d72008-06-11 18:37:52 +0000784 case 'A':
785 {
786 PyObject *obj = va_arg(count, PyObject *);
787 PyObject *ascii;
788 assert(obj);
789 ascii = PyObject_ASCII(obj);
790 if (!ascii)
791 goto fail;
792 n += PyUnicode_GET_SIZE(ascii);
793 /* Remember the repr and switch to the next slot */
794 *callresult++ = ascii;
795 break;
796 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000797 case 'p':
798 (void) va_arg(count, int);
799 /* maximum 64-bit pointer representation:
800 * 0xffffffffffffffff
801 * so 19 characters is enough.
802 * XXX I count 18 -- what's the extra for?
803 */
804 n += 19;
805 break;
806 default:
807 /* if we stumble upon an unknown
808 formatting code, copy the rest of
809 the format string to the output
810 string. (we cannot just skip the
811 code, since there's no way to know
812 what's in the argument list) */
813 n += strlen(p);
814 goto expand;
815 }
816 } else
817 n++;
818 }
819 expand:
Walter Dörwald346737f2007-05-31 10:44:43 +0000820 if (abuffersize > 20) {
Christian Heimesb186d002008-03-18 15:15:01 +0000821 abuffer = PyObject_Malloc(abuffersize);
Walter Dörwald346737f2007-05-31 10:44:43 +0000822 if (!abuffer) {
823 PyErr_NoMemory();
824 goto fail;
825 }
826 realbuffer = abuffer;
827 }
828 else
829 realbuffer = buffer;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000830 /* step 4: fill the buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000831 /* Since we've analyzed how much space we need for the worst case,
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000832 we don't have to resize the string.
833 There can be no errors beyond this point. */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000834 string = PyUnicode_FromUnicode(NULL, n);
835 if (!string)
Walter Dörwald346737f2007-05-31 10:44:43 +0000836 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000837
838 s = PyUnicode_AS_UNICODE(string);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000839 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000840
841 for (f = format; *f; f++) {
842 if (*f == '%') {
843 const char* p = f++;
844 int longflag = 0;
845 int size_tflag = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000846 zeropad = (*f == '0');
847 /* parse the width.precision part */
848 width = 0;
Christian Heimesfe337bf2008-03-23 21:54:12 +0000849 while (ISDIGIT((unsigned)*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000850 width = (width*10) + *f++ - '0';
851 precision = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000852 if (*f == '.') {
853 f++;
Christian Heimesfe337bf2008-03-23 21:54:12 +0000854 while (ISDIGIT((unsigned)*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000855 precision = (precision*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000856 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000857 /* handle the long flag, but only for %ld and %lu.
858 others can be added when necessary. */
859 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
860 longflag = 1;
861 ++f;
862 }
863 /* handle the size_t flag. */
864 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
865 size_tflag = 1;
866 ++f;
867 }
868
869 switch (*f) {
870 case 'c':
871 *s++ = va_arg(vargs, int);
872 break;
873 case 'd':
Walter Dörwald346737f2007-05-31 10:44:43 +0000874 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000875 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000876 sprintf(realbuffer, fmt, va_arg(vargs, long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000877 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000878 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000879 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000880 sprintf(realbuffer, fmt, va_arg(vargs, int));
881 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000882 break;
883 case 'u':
Walter Dörwald346737f2007-05-31 10:44:43 +0000884 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000885 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000886 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000887 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000888 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000889 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000890 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
891 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000892 break;
893 case 'i':
Walter Dörwald346737f2007-05-31 10:44:43 +0000894 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
895 sprintf(realbuffer, fmt, va_arg(vargs, int));
896 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000897 break;
898 case 'x':
Walter Dörwald346737f2007-05-31 10:44:43 +0000899 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
900 sprintf(realbuffer, fmt, va_arg(vargs, int));
901 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000902 break;
903 case 's':
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000904 {
905 /* Parameter must be UTF-8 encoded.
906 In case of encoding errors, use
907 the replacement character. */
908 PyObject *u;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000909 p = va_arg(vargs, char*);
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000910 u = PyUnicode_DecodeUTF8(p, strlen(p),
911 "replace");
912 if (!u)
913 goto fail;
914 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
915 PyUnicode_GET_SIZE(u));
916 s += PyUnicode_GET_SIZE(u);
917 Py_DECREF(u);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000918 break;
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000919 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000920 case 'U':
921 {
922 PyObject *obj = va_arg(vargs, PyObject *);
Walter Dörwald5c2fab62007-05-24 19:51:02 +0000923 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
924 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
925 s += size;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000926 break;
927 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000928 case 'V':
929 {
930 PyObject *obj = va_arg(vargs, PyObject *);
931 const char *str = va_arg(vargs, const char *);
932 if (obj) {
933 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
934 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
935 s += size;
936 } else {
937 appendstring(str);
938 }
939 break;
940 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000941 case 'S':
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000942 case 'R':
943 {
Guido van Rossum755114a2007-06-13 01:04:27 +0000944 Py_UNICODE *ucopy;
945 Py_ssize_t usize;
946 Py_ssize_t upos;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000947 /* unused, since we already have the result */
948 (void) va_arg(vargs, PyObject *);
Guido van Rossum755114a2007-06-13 01:04:27 +0000949 ucopy = PyUnicode_AS_UNICODE(*callresult);
950 usize = PyUnicode_GET_SIZE(*callresult);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000951 for (upos = 0; upos<usize;)
952 *s++ = ucopy[upos++];
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000953 /* We're done with the unicode()/repr() => forget it */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000954 Py_DECREF(*callresult);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000955 /* switch to next unicode()/repr() result */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000956 ++callresult;
957 break;
958 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000959 case 'p':
960 sprintf(buffer, "%p", va_arg(vargs, void*));
961 /* %p is ill-defined: ensure leading 0x. */
962 if (buffer[1] == 'X')
963 buffer[1] = 'x';
964 else if (buffer[1] != 'x') {
965 memmove(buffer+2, buffer, strlen(buffer)+1);
966 buffer[0] = '0';
967 buffer[1] = 'x';
968 }
969 appendstring(buffer);
970 break;
971 case '%':
972 *s++ = '%';
973 break;
974 default:
975 appendstring(p);
976 goto end;
977 }
978 } else
979 *s++ = *f;
980 }
981
982 end:
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000983 if (callresults)
Christian Heimesb186d002008-03-18 15:15:01 +0000984 PyObject_Free(callresults);
Walter Dörwald346737f2007-05-31 10:44:43 +0000985 if (abuffer)
Christian Heimesb186d002008-03-18 15:15:01 +0000986 PyObject_Free(abuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000987 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
988 return string;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000989 fail:
990 if (callresults) {
991 PyObject **callresult2 = callresults;
Guido van Rossum307fa8c2007-07-16 20:46:27 +0000992 while (callresult2 < callresult) {
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000993 Py_DECREF(*callresult2);
994 ++callresult2;
995 }
Christian Heimesb186d002008-03-18 15:15:01 +0000996 PyObject_Free(callresults);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000997 }
Walter Dörwald346737f2007-05-31 10:44:43 +0000998 if (abuffer)
Christian Heimesb186d002008-03-18 15:15:01 +0000999 PyObject_Free(abuffer);
Walter Dörwald7569dfe2007-05-19 21:49:49 +00001000 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001001}
1002
1003#undef appendstring
1004
1005PyObject *
1006PyUnicode_FromFormat(const char *format, ...)
1007{
1008 PyObject* ret;
1009 va_list vargs;
1010
1011#ifdef HAVE_STDARG_PROTOTYPES
1012 va_start(vargs, format);
1013#else
1014 va_start(vargs);
1015#endif
1016 ret = PyUnicode_FromFormatV(format, vargs);
1017 va_end(vargs);
1018 return ret;
1019}
1020
Martin v. Löwis18e16552006-02-15 17:27:45 +00001021Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
1022 wchar_t *w,
1023 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001024{
1025 if (unicode == NULL) {
1026 PyErr_BadInternalCall();
1027 return -1;
1028 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001029
1030 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001031 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001032 size = PyUnicode_GET_SIZE(unicode) + 1;
1033
Guido van Rossumd57fd912000-03-10 22:53:23 +00001034#ifdef HAVE_USABLE_WCHAR_T
1035 memcpy(w, unicode->str, size * sizeof(wchar_t));
1036#else
1037 {
1038 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001039 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001040 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +00001041 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001042 *w++ = *u++;
1043 }
1044#endif
1045
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001046 if (size > PyUnicode_GET_SIZE(unicode))
1047 return PyUnicode_GET_SIZE(unicode);
1048 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00001049 return size;
1050}
1051
1052#endif
1053
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001054PyObject *PyUnicode_FromOrdinal(int ordinal)
1055{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001056 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001057
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001058 if (ordinal < 0 || ordinal > 0x10ffff) {
1059 PyErr_SetString(PyExc_ValueError,
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001060 "chr() arg not in range(0x110000)");
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001061 return NULL;
1062 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001063
1064#ifndef Py_UNICODE_WIDE
1065 if (ordinal > 0xffff) {
1066 ordinal -= 0x10000;
1067 s[0] = 0xD800 | (ordinal >> 10);
1068 s[1] = 0xDC00 | (ordinal & 0x3FF);
1069 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001070 }
1071#endif
1072
Hye-Shik Chang40574832004-04-06 07:24:51 +00001073 s[0] = (Py_UNICODE)ordinal;
1074 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001075}
1076
Guido van Rossumd57fd912000-03-10 22:53:23 +00001077PyObject *PyUnicode_FromObject(register PyObject *obj)
1078{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001079 /* XXX Perhaps we should make this API an alias of
Thomas Heller519a0422007-11-15 20:48:54 +00001080 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001081 if (PyUnicode_CheckExact(obj)) {
1082 Py_INCREF(obj);
1083 return obj;
1084 }
1085 if (PyUnicode_Check(obj)) {
1086 /* For a Unicode subtype that's not a Unicode object,
1087 return a true Unicode object with the same data. */
1088 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1089 PyUnicode_GET_SIZE(obj));
1090 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001091 PyErr_Format(PyExc_TypeError,
1092 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001093 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001094 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001095}
1096
1097PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1098 const char *encoding,
1099 const char *errors)
1100{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001101 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001102 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001103 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001104
Guido van Rossumd57fd912000-03-10 22:53:23 +00001105 if (obj == NULL) {
1106 PyErr_BadInternalCall();
1107 return NULL;
1108 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001109
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001110 if (PyUnicode_Check(obj)) {
1111 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001112 "decoding str is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001113 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001114 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001115
1116 /* Coerce object */
Christian Heimes72b710a2008-05-26 13:28:38 +00001117 if (PyBytes_Check(obj)) {
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001118 s = PyBytes_AS_STRING(obj);
1119 len = PyBytes_GET_SIZE(obj);
1120 }
1121 else if (PyByteArray_Check(obj)) {
1122 s = PyByteArray_AS_STRING(obj);
1123 len = PyByteArray_GET_SIZE(obj);
1124 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001125 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1126 /* Overwrite the error message with something more useful in
1127 case of a TypeError. */
1128 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001129 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001130 "coercing to str: need string or buffer, "
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001131 "%.80s found",
Christian Heimes90aa7642007-12-19 02:45:37 +00001132 Py_TYPE(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001133 goto onError;
1134 }
Tim Petersced69f82003-09-16 20:30:58 +00001135
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001136 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001137 if (len == 0) {
1138 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001139 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001140 }
Tim Petersced69f82003-09-16 20:30:58 +00001141 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001142 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001143
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001144 return v;
1145
1146 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001147 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001148}
1149
1150PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001151 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001152 const char *encoding,
1153 const char *errors)
1154{
1155 PyObject *buffer = NULL, *unicode;
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001156 Py_buffer info;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001157 char lower[20]; /* Enough for any encoding name we recognize */
1158 char *l;
1159 const char *e;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001160
1161 if (encoding == NULL)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001162 encoding = PyUnicode_GetDefaultEncoding();
1163
1164 /* Convert encoding to lower case and replace '_' with '-' in order to
1165 catch e.g. UTF_8 */
1166 e = encoding;
1167 l = lower;
1168 while (*e && l < &lower[(sizeof lower) - 2]) {
1169 if (ISUPPER(*e)) {
1170 *l++ = TOLOWER(*e++);
1171 }
1172 else if (*e == '_') {
1173 *l++ = '-';
1174 e++;
1175 }
1176 else {
1177 *l++ = *e++;
1178 }
1179 }
1180 *l = '\0';
Fred Drakee4315f52000-05-09 19:53:39 +00001181
1182 /* Shortcuts for common default encodings */
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001183 if (strcmp(lower, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001184 return PyUnicode_DecodeUTF8(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001185 else if ((strcmp(lower, "latin-1") == 0) ||
1186 (strcmp(lower, "iso-8859-1") == 0))
Fred Drakee4315f52000-05-09 19:53:39 +00001187 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001188#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001189 else if (strcmp(lower, "mbcs") == 0)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001190 return PyUnicode_DecodeMBCS(s, size, errors);
1191#endif
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001192 else if (strcmp(lower, "ascii") == 0)
Fred Drakee4315f52000-05-09 19:53:39 +00001193 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001194 else if (strcmp(lower, "utf-16") == 0)
1195 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1196 else if (strcmp(lower, "utf-32") == 0)
1197 return PyUnicode_DecodeUTF32(s, size, errors, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001198
1199 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001200 buffer = NULL;
1201 if (PyBuffer_FillInfo(&info, (void *)s, size, 1, PyBUF_SIMPLE) < 0)
1202 goto onError;
1203 buffer = PyMemoryView_FromMemory(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001204 if (buffer == NULL)
1205 goto onError;
1206 unicode = PyCodec_Decode(buffer, encoding, errors);
1207 if (unicode == NULL)
1208 goto onError;
1209 if (!PyUnicode_Check(unicode)) {
1210 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001211 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001212 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001213 Py_DECREF(unicode);
1214 goto onError;
1215 }
1216 Py_DECREF(buffer);
1217 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001218
Guido van Rossumd57fd912000-03-10 22:53:23 +00001219 onError:
1220 Py_XDECREF(buffer);
1221 return NULL;
1222}
1223
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001224PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1225 const char *encoding,
1226 const char *errors)
1227{
1228 PyObject *v;
1229
1230 if (!PyUnicode_Check(unicode)) {
1231 PyErr_BadArgument();
1232 goto onError;
1233 }
1234
1235 if (encoding == NULL)
1236 encoding = PyUnicode_GetDefaultEncoding();
1237
1238 /* Decode via the codec registry */
1239 v = PyCodec_Decode(unicode, encoding, errors);
1240 if (v == NULL)
1241 goto onError;
1242 return v;
1243
1244 onError:
1245 return NULL;
1246}
1247
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001248PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1249 const char *encoding,
1250 const char *errors)
1251{
1252 PyObject *v;
1253
1254 if (!PyUnicode_Check(unicode)) {
1255 PyErr_BadArgument();
1256 goto onError;
1257 }
1258
1259 if (encoding == NULL)
1260 encoding = PyUnicode_GetDefaultEncoding();
1261
1262 /* Decode via the codec registry */
1263 v = PyCodec_Decode(unicode, encoding, errors);
1264 if (v == NULL)
1265 goto onError;
1266 if (!PyUnicode_Check(v)) {
1267 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001268 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001269 Py_TYPE(v)->tp_name);
1270 Py_DECREF(v);
1271 goto onError;
1272 }
1273 return v;
1274
1275 onError:
1276 return NULL;
1277}
1278
Guido van Rossumd57fd912000-03-10 22:53:23 +00001279PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001280 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001281 const char *encoding,
1282 const char *errors)
1283{
1284 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001285
Guido van Rossumd57fd912000-03-10 22:53:23 +00001286 unicode = PyUnicode_FromUnicode(s, size);
1287 if (unicode == NULL)
1288 return NULL;
1289 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1290 Py_DECREF(unicode);
1291 return v;
1292}
1293
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001294PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1295 const char *encoding,
1296 const char *errors)
1297{
1298 PyObject *v;
1299
1300 if (!PyUnicode_Check(unicode)) {
1301 PyErr_BadArgument();
1302 goto onError;
1303 }
1304
1305 if (encoding == NULL)
1306 encoding = PyUnicode_GetDefaultEncoding();
1307
1308 /* Encode via the codec registry */
1309 v = PyCodec_Encode(unicode, encoding, errors);
1310 if (v == NULL)
1311 goto onError;
1312 return v;
1313
1314 onError:
1315 return NULL;
1316}
1317
Guido van Rossumd57fd912000-03-10 22:53:23 +00001318PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1319 const char *encoding,
1320 const char *errors)
1321{
1322 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001323
Guido van Rossumd57fd912000-03-10 22:53:23 +00001324 if (!PyUnicode_Check(unicode)) {
1325 PyErr_BadArgument();
1326 goto onError;
1327 }
Fred Drakee4315f52000-05-09 19:53:39 +00001328
Tim Petersced69f82003-09-16 20:30:58 +00001329 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001330 encoding = PyUnicode_GetDefaultEncoding();
1331
1332 /* Shortcuts for common default encodings */
1333 if (errors == NULL) {
1334 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001335 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001336 else if (strcmp(encoding, "latin-1") == 0)
1337 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001338#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1339 else if (strcmp(encoding, "mbcs") == 0)
1340 return PyUnicode_AsMBCSString(unicode);
1341#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001342 else if (strcmp(encoding, "ascii") == 0)
1343 return PyUnicode_AsASCIIString(unicode);
1344 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001345
1346 /* Encode via the codec registry */
1347 v = PyCodec_Encode(unicode, encoding, errors);
1348 if (v == NULL)
1349 goto onError;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001350 if (PyByteArray_Check(v)) {
1351 char msg[100];
1352 PyOS_snprintf(msg, sizeof(msg),
1353 "encoder %s returned buffer instead of bytes",
1354 encoding);
1355 if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
1356 v = NULL;
1357 goto onError;
1358 }
1359 v = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1360 }
1361 else if (!PyBytes_Check(v)) {
1362 PyErr_Format(PyExc_TypeError,
1363 "encoder did not return a bytes object (type=%.400s)",
1364 Py_TYPE(v)->tp_name);
1365 v = NULL;
1366 }
1367 return v;
1368
1369 onError:
1370 return NULL;
1371}
1372
1373PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1374 const char *encoding,
1375 const char *errors)
1376{
1377 PyObject *v;
1378
1379 if (!PyUnicode_Check(unicode)) {
1380 PyErr_BadArgument();
1381 goto onError;
1382 }
1383
1384 if (encoding == NULL)
1385 encoding = PyUnicode_GetDefaultEncoding();
1386
1387 /* Encode via the codec registry */
1388 v = PyCodec_Encode(unicode, encoding, errors);
1389 if (v == NULL)
1390 goto onError;
1391 if (!PyUnicode_Check(v)) {
1392 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001393 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001394 Py_TYPE(v)->tp_name);
1395 Py_DECREF(v);
1396 goto onError;
1397 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001398 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001399
Guido van Rossumd57fd912000-03-10 22:53:23 +00001400 onError:
1401 return NULL;
1402}
1403
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001404PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1405 const char *errors)
1406{
1407 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001408 if (v)
1409 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001410 if (errors != NULL)
1411 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001412 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001413 PyUnicode_GET_SIZE(unicode),
1414 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001415 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001416 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001417 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001418 return v;
1419}
1420
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001421PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001422PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001423 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001424 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1425}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001426
Christian Heimes5894ba72007-11-04 11:43:14 +00001427PyObject*
1428PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1429{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001430 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1431 can be undefined. If it is case, decode using UTF-8. The following assumes
1432 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1433 bootstrapping process where the codecs aren't ready yet.
1434 */
1435 if (Py_FileSystemDefaultEncoding) {
1436#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001437 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001438 return PyUnicode_DecodeMBCS(s, size, "replace");
1439 }
1440#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001441 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001442 return PyUnicode_DecodeUTF8(s, size, "replace");
1443 }
1444#endif
1445 return PyUnicode_Decode(s, size,
1446 Py_FileSystemDefaultEncoding,
1447 "replace");
1448 }
1449 else {
1450 return PyUnicode_DecodeUTF8(s, size, "replace");
1451 }
1452}
1453
Martin v. Löwis5b222132007-06-10 09:51:05 +00001454char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001455_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001456{
Christian Heimesf3863112007-11-22 07:46:41 +00001457 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001458 if (!PyUnicode_Check(unicode)) {
1459 PyErr_BadArgument();
1460 return NULL;
1461 }
Christian Heimesf3863112007-11-22 07:46:41 +00001462 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1463 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001464 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001465 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001466 *psize = PyBytes_GET_SIZE(bytes);
1467 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001468}
1469
1470char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001471_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001472{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001473 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001474}
1475
Guido van Rossumd57fd912000-03-10 22:53:23 +00001476Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1477{
1478 if (!PyUnicode_Check(unicode)) {
1479 PyErr_BadArgument();
1480 goto onError;
1481 }
1482 return PyUnicode_AS_UNICODE(unicode);
1483
1484 onError:
1485 return NULL;
1486}
1487
Martin v. Löwis18e16552006-02-15 17:27:45 +00001488Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001489{
1490 if (!PyUnicode_Check(unicode)) {
1491 PyErr_BadArgument();
1492 goto onError;
1493 }
1494 return PyUnicode_GET_SIZE(unicode);
1495
1496 onError:
1497 return -1;
1498}
1499
Thomas Wouters78890102000-07-22 19:25:51 +00001500const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001501{
1502 return unicode_default_encoding;
1503}
1504
1505int PyUnicode_SetDefaultEncoding(const char *encoding)
1506{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001507 if (strcmp(encoding, unicode_default_encoding) != 0) {
1508 PyErr_Format(PyExc_ValueError,
1509 "Can only set default encoding to %s",
1510 unicode_default_encoding);
1511 return -1;
1512 }
Fred Drakee4315f52000-05-09 19:53:39 +00001513 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001514}
1515
/* error handling callback helper:
   build the arguments, call the callback and check its result;
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.
   return 0 on success, -1 on error
*/
1522
1523static
1524int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1525 const char *encoding, const char *reason,
Walter Dörwalda651d3d2007-08-30 15:29:21 +00001526 const char **input, const char **inend, Py_ssize_t *startinpos,
1527 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001528 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001529{
Benjamin Peterson142957c2008-07-04 19:55:29 +00001530 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001531
1532 PyObject *restuple = NULL;
1533 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001534 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001535 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001536 Py_ssize_t requiredsize;
1537 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001538 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001539 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001540 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001541 int res = -1;
1542
1543 if (*errorHandler == NULL) {
1544 *errorHandler = PyCodec_LookupError(errors);
1545 if (*errorHandler == NULL)
1546 goto onError;
1547 }
1548
1549 if (*exceptionObject == NULL) {
1550 *exceptionObject = PyUnicodeDecodeError_Create(
Walter Dörwalde78178e2007-07-30 13:31:40 +00001551 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001552 if (*exceptionObject == NULL)
1553 goto onError;
1554 }
1555 else {
1556 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1557 goto onError;
1558 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1559 goto onError;
1560 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1561 goto onError;
1562 }
1563
1564 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1565 if (restuple == NULL)
1566 goto onError;
1567 if (!PyTuple_Check(restuple)) {
1568 PyErr_Format(PyExc_TypeError, &argparse[4]);
1569 goto onError;
1570 }
1571 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1572 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001573
1574 /* Copy back the bytes variables, which might have been modified by the
1575 callback */
1576 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1577 if (!inputobj)
1578 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00001579 if (!PyBytes_Check(inputobj)) {
Walter Dörwalde78178e2007-07-30 13:31:40 +00001580 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
1581 }
Christian Heimes72b710a2008-05-26 13:28:38 +00001582 *input = PyBytes_AS_STRING(inputobj);
1583 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001584 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001585 /* we can DECREF safely, as the exception has another reference,
1586 so the object won't go away. */
1587 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001588
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001589 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001590 newpos = insize+newpos;
1591 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001592 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001593 goto onError;
1594 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001595
1596 /* need more space? (at least enough for what we
1597 have+the replacement+the rest of the string (starting
1598 at the new input position), so we won't have to check space
1599 when there are no errors in the rest of the string) */
1600 repptr = PyUnicode_AS_UNICODE(repunicode);
1601 repsize = PyUnicode_GET_SIZE(repunicode);
1602 requiredsize = *outpos + repsize + insize-newpos;
1603 if (requiredsize > outsize) {
1604 if (requiredsize<2*outsize)
1605 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001606 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001607 goto onError;
1608 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1609 }
1610 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001611 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001612 Py_UNICODE_COPY(*outptr, repptr, repsize);
1613 *outptr += repsize;
1614 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001615
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001616 /* we made it! */
1617 res = 0;
1618
1619 onError:
1620 Py_XDECREF(restuple);
1621 return res;
1622}
1623
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001624/* --- UTF-7 Codec -------------------------------------------------------- */
1625
1626/* see RFC2152 for details */
1627
static
char utf7_special[128] = {
    /* Classification of each ASCII code for the UTF-7 codec, i.e. whether
       the character can be written directly or must be base64-encoded:
         0 - not special (written directly)
         1 - special (always encoded)
         2 - whitespace (encoding is optional)
         3 - RFC2152 Set O (encoding is optional)
       NOTE(review): '/' (0x2F) is classed as 1 here although RFC 2152
       appears to list it among the directly encodable characters --
       confirm whether that is intended. */

    /* 0x00-0x0F: control characters; TAB, LF and CR count as whitespace */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10-0x1F: control characters */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20-0x2F: space and punctuation */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30-0x3F: digits and punctuation */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40-0x4F: '@', 'A'-'O' */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F: 'P'-'Z' and punctuation */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60-0x6F: '`', 'a'-'o' */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F: 'p'-'z', punctuation and DEL */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
1646
/* Note: the (c) <= 0 test is a trick to work around gcc warnings about
   a comparison that is always false; since utf7_special[0] is 1, making
   that one comparison true up front does not change the result. */

/* True if c must be base64-encoded: outside 1..127, always-special, or in
   an optional class (whitespace / Set O) whose flag is set. */
#define SPECIAL(c, encodeO, encodeWS)                           \
    ((c) > 127 || (c) <= 0 ||                                   \
     utf7_special[(c)] == 1 ||                                  \
     ((encodeWS) && utf7_special[(c)] == 2) ||                  \
     ((encodeO) && utf7_special[(c)] == 3))

/* Base64 digit for the low six bits of n. */
#define B64(n)                                                  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"                               \
     "abcdefghijklmnopqrstuvwxyz"                               \
     "0123456789+/"[(n) & 0x3f])

/* True if c is one of the 64 base64 digits. */
#define B64CHAR(c)                                              \
    (ISALNUM(c) || (c) == '+' || (c) == '/')

/* Value (0..63) of base64 digit c; only meaningful when B64CHAR(c). */
#define UB64(c)                                                 \
    ((c) == '+' ? 62 :                                          \
     (c) == '/' ? 63 :                                          \
     (c) >= 'a' ? (c) - 'a' + 26 :                              \
     (c) >= 'A' ? (c) - 'A' :                                   \
     (c) - '0' + 52)

/* Flush every complete 6-bit group held in ch (bits = number of valid low
   bits) to out as base64 digits, most significant group first. */
#define ENCODE(out, ch, bits)                                   \
    do {                                                        \
        while ((bits) >= 6) {                                   \
            *(out)++ = B64((ch) >> ((bits) - 6));               \
            (bits) -= 6;                                        \
        }                                                       \
    } while (0)

/* Emit every complete 16-bit unit held in ch (bits = number of valid low
   bits) to out.  Relies on the caller providing the `errmsg` variable and
   the `utf7Error` label. */
#define DECODE(out, ch, bits, surrogate)                                    \
    do {                                                                    \
        while ((bits) >= 16) {                                              \
            Py_UNICODE outCh =                                              \
                (Py_UNICODE) (((ch) >> ((bits) - 16)) & 0xffff);            \
            (bits) -= 16;                                                   \
            if (surrogate) {                                                \
                /* An error was already generated for the previous */       \
                /* unit, so this one is dropped without checking it. */     \
                (surrogate) = 0;                                            \
            }                                                               \
            else if (outCh < 0xDC00 || outCh > 0xDFFF) {                    \
                *(out)++ = outCh;                                           \
            }                                                               \
            else {                                                          \
                /* This is a surrogate pair.  Unfortunately we can't */     \
                /* represent it in a 16-bit character. */                   \
                (surrogate) = 1;                                            \
                errmsg = "code pairs are not supported";                    \
                goto utf7Error;                                             \
            }                                                               \
        }                                                                   \
    } while (0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001689
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001690PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001691 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001692 const char *errors)
1693{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001694 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1695}
1696
1697PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
1698 Py_ssize_t size,
1699 const char *errors,
1700 Py_ssize_t *consumed)
1701{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001702 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001703 Py_ssize_t startinpos;
1704 Py_ssize_t endinpos;
1705 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001706 const char *e;
1707 PyUnicodeObject *unicode;
1708 Py_UNICODE *p;
1709 const char *errmsg = "";
1710 int inShift = 0;
1711 unsigned int bitsleft = 0;
1712 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001713 int surrogate = 0;
1714 PyObject *errorHandler = NULL;
1715 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001716
1717 unicode = _PyUnicode_New(size);
1718 if (!unicode)
1719 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001720 if (size == 0) {
1721 if (consumed)
1722 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001723 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001724 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001725
1726 p = unicode->str;
1727 e = s + size;
1728
1729 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001730 Py_UNICODE ch;
1731 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00001732 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001733
1734 if (inShift) {
1735 if ((ch == '-') || !B64CHAR(ch)) {
1736 inShift = 0;
1737 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001738
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001739 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1740 if (bitsleft >= 6) {
1741 /* The shift sequence has a partial character in it. If
1742 bitsleft < 6 then we could just classify it as padding
1743 but that is not the case here */
1744
1745 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001746 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001747 }
1748 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001749 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001750 here so indicate the potential of a misencoded character. */
1751
1752 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1753 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1754 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001755 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001756 }
1757
1758 if (ch == '-') {
1759 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001760 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001761 inShift = 1;
1762 }
1763 } else if (SPECIAL(ch,0,0)) {
1764 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001765 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001766 } else {
1767 *p++ = ch;
1768 }
1769 } else {
1770 charsleft = (charsleft << 6) | UB64(ch);
1771 bitsleft += 6;
1772 s++;
1773 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1774 }
1775 }
1776 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001777 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001778 s++;
1779 if (s < e && *s == '-') {
1780 s++;
1781 *p++ = '+';
1782 } else
1783 {
1784 inShift = 1;
1785 bitsleft = 0;
1786 }
1787 }
1788 else if (SPECIAL(ch,0,0)) {
Walter Dörwald2b65c752007-08-30 15:35:26 +00001789 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001790 errmsg = "unexpected special character";
1791 s++;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001792 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001793 }
1794 else {
1795 *p++ = ch;
1796 s++;
1797 }
1798 continue;
1799 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001800 outpos = p-PyUnicode_AS_UNICODE(unicode);
1801 endinpos = s-starts;
1802 if (unicode_decode_call_errorhandler(
1803 errors, &errorHandler,
1804 "utf7", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001805 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001806 (PyObject **)&unicode, &outpos, &p))
1807 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001808 }
1809
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001810 if (inShift && !consumed) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001811 outpos = p-PyUnicode_AS_UNICODE(unicode);
1812 endinpos = size;
1813 if (unicode_decode_call_errorhandler(
1814 errors, &errorHandler,
1815 "utf7", "unterminated shift sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00001816 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001817 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001818 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001819 if (s < e)
1820 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001821 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001822 if (consumed) {
1823 if(inShift)
1824 *consumed = startinpos;
1825 else
1826 *consumed = s-starts;
1827 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001828
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001829 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001830 goto onError;
1831
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001832 Py_XDECREF(errorHandler);
1833 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001834 return (PyObject *)unicode;
1835
1836onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001837 Py_XDECREF(errorHandler);
1838 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001839 Py_DECREF(unicode);
1840 return NULL;
1841}
1842
1843
1844PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001845 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001846 int encodeSetO,
1847 int encodeWhiteSpace,
1848 const char *errors)
1849{
Guido van Rossum98297ee2007-11-06 21:34:58 +00001850 PyObject *v, *result;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001851 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001852 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001853 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001854 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001855 unsigned int bitsleft = 0;
1856 unsigned long charsleft = 0;
1857 char * out;
1858 char * start;
1859
1860 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00001861 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001862
Christian Heimes9c4756e2008-05-26 13:22:05 +00001863 v = PyByteArray_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001864 if (v == NULL)
1865 return NULL;
1866
Christian Heimes9c4756e2008-05-26 13:22:05 +00001867 start = out = PyByteArray_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001868 for (;i < size; ++i) {
1869 Py_UNICODE ch = s[i];
1870
1871 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001872 if (ch == '+') {
1873 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001874 *out++ = '-';
1875 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1876 charsleft = ch;
1877 bitsleft = 16;
1878 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001879 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001880 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001881 } else {
1882 *out++ = (char) ch;
1883 }
1884 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001885 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1886 *out++ = B64(charsleft << (6-bitsleft));
1887 charsleft = 0;
1888 bitsleft = 0;
1889 /* Characters not in the BASE64 set implicitly unshift the sequence
1890 so no '-' is required, except if the character is itself a '-' */
1891 if (B64CHAR(ch) || ch == '-') {
1892 *out++ = '-';
1893 }
1894 inShift = 0;
1895 *out++ = (char) ch;
1896 } else {
1897 bitsleft += 16;
1898 charsleft = (charsleft << 16) | ch;
1899 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1900
1901 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001902 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001903 or '-' then the shift sequence will be terminated implicitly and we
1904 don't have to insert a '-'. */
1905
1906 if (bitsleft == 0) {
1907 if (i + 1 < size) {
1908 Py_UNICODE ch2 = s[i+1];
1909
1910 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001911
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001912 } else if (B64CHAR(ch2) || ch2 == '-') {
1913 *out++ = '-';
1914 inShift = 0;
1915 } else {
1916 inShift = 0;
1917 }
1918
1919 }
1920 else {
1921 *out++ = '-';
1922 inShift = 0;
1923 }
1924 }
Tim Petersced69f82003-09-16 20:30:58 +00001925 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001926 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001927 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001928 if (bitsleft) {
1929 *out++= B64(charsleft << (6-bitsleft) );
1930 *out++ = '-';
1931 }
1932
Christian Heimes72b710a2008-05-26 13:28:38 +00001933 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), out - start);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001934 Py_DECREF(v);
1935 return result;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001936}
1937
1938#undef SPECIAL
1939#undef B64
1940#undef B64CHAR
1941#undef UB64
1942#undef ENCODE
1943#undef DECODE
1944
Guido van Rossumd57fd912000-03-10 22:53:23 +00001945/* --- UTF-8 Codec -------------------------------------------------------- */
1946
static
char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 2279 for details. */

    /* 0x00-0x7F: single-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xBF: not valid as the first byte of a sequence */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xFF: four-, five- and six-byte prefixes; 0xFE/0xFF illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1968
Guido van Rossumd57fd912000-03-10 22:53:23 +00001969PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001970 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001971 const char *errors)
1972{
Walter Dörwald69652032004-09-07 20:24:22 +00001973 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1974}
1975
1976PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001977 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001978 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001979 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001980{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001981 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001982 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001983 Py_ssize_t startinpos;
1984 Py_ssize_t endinpos;
1985 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001986 const char *e;
1987 PyUnicodeObject *unicode;
1988 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001989 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001990 PyObject *errorHandler = NULL;
1991 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001992
1993 /* Note: size will always be longer than the resulting Unicode
1994 character count */
1995 unicode = _PyUnicode_New(size);
1996 if (!unicode)
1997 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001998 if (size == 0) {
1999 if (consumed)
2000 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002001 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002002 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002003
2004 /* Unpack UTF-8 encoded data */
2005 p = unicode->str;
2006 e = s + size;
2007
2008 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002009 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002010
2011 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002012 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002013 s++;
2014 continue;
2015 }
2016
2017 n = utf8_code_length[ch];
2018
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002019 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00002020 if (consumed)
2021 break;
2022 else {
2023 errmsg = "unexpected end of data";
2024 startinpos = s-starts;
2025 endinpos = size;
2026 goto utf8Error;
2027 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002028 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002029
2030 switch (n) {
2031
2032 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002033 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002034 startinpos = s-starts;
2035 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002036 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002037
2038 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002039 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002040 startinpos = s-starts;
2041 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002042 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002043
2044 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002045 if ((s[1] & 0xc0) != 0x80) {
2046 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002047 startinpos = s-starts;
2048 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002049 goto utf8Error;
2050 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002051 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002052 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002053 startinpos = s-starts;
2054 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002055 errmsg = "illegal encoding";
2056 goto utf8Error;
2057 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002058 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002059 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002060 break;
2061
2062 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00002063 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002064 (s[2] & 0xc0) != 0x80) {
2065 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002066 startinpos = s-starts;
2067 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002068 goto utf8Error;
2069 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002070 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002071 if (ch < 0x0800) {
2072 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00002073 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002074
2075 XXX For wide builds (UCS-4) we should probably try
2076 to recombine the surrogates into a single code
2077 unit.
2078 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002079 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002080 startinpos = s-starts;
2081 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002082 goto utf8Error;
2083 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002084 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002085 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002086 break;
2087
2088 case 4:
2089 if ((s[1] & 0xc0) != 0x80 ||
2090 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002091 (s[3] & 0xc0) != 0x80) {
2092 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002093 startinpos = s-starts;
2094 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002095 goto utf8Error;
2096 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002097 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2098 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2099 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002100 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002101 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002102 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002103 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002104 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002105 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002106 startinpos = s-starts;
2107 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002108 goto utf8Error;
2109 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00002110#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002111 *p++ = (Py_UNICODE)ch;
2112#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002113 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002114
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002115 /* translate from 10000..10FFFF to 0..FFFF */
2116 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002117
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002118 /* high surrogate = top 10 bits added to D800 */
2119 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002120
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002121 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002122 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002123#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002124 break;
2125
2126 default:
2127 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002128 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002129 startinpos = s-starts;
2130 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002131 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002132 }
2133 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002134 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002135
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002136 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002137 outpos = p-PyUnicode_AS_UNICODE(unicode);
2138 if (unicode_decode_call_errorhandler(
2139 errors, &errorHandler,
2140 "utf8", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002141 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002142 (PyObject **)&unicode, &outpos, &p))
2143 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002144 }
Walter Dörwald69652032004-09-07 20:24:22 +00002145 if (consumed)
2146 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002147
2148 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002149 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002150 goto onError;
2151
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002152 Py_XDECREF(errorHandler);
2153 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002154 return (PyObject *)unicode;
2155
2156onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002157 Py_XDECREF(errorHandler);
2158 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002159 Py_DECREF(unicode);
2160 return NULL;
2161}
2162
Tim Peters602f7402002-04-27 18:03:26 +00002163/* Allocation strategy: if the string is short, convert into a stack buffer
2164 and allocate exactly as much space needed at the end. Else allocate the
2165 maximum possible needed (4 result bytes per Unicode character), and return
2166 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002167*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002168PyObject *
2169PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002170 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00002171 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002172{
Tim Peters602f7402002-04-27 18:03:26 +00002173#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002174
Guido van Rossum98297ee2007-11-06 21:34:58 +00002175 Py_ssize_t i; /* index into s of next input byte */
2176 PyObject *result; /* result string object */
2177 char *p; /* next free byte in output buffer */
2178 Py_ssize_t nallocated; /* number of result bytes allocated */
2179 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002180 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002181
Tim Peters602f7402002-04-27 18:03:26 +00002182 assert(s != NULL);
2183 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002184
Tim Peters602f7402002-04-27 18:03:26 +00002185 if (size <= MAX_SHORT_UNICHARS) {
2186 /* Write into the stack buffer; nallocated can't overflow.
2187 * At the end, we'll allocate exactly as much heap space as it
2188 * turns out we need.
2189 */
2190 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002191 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002192 p = stackbuf;
2193 }
2194 else {
2195 /* Overallocate on the heap, and give the excess back at the end. */
2196 nallocated = size * 4;
2197 if (nallocated / 4 != size) /* overflow! */
2198 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002199 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002200 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002201 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002202 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002203 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002204
Tim Peters602f7402002-04-27 18:03:26 +00002205 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002206 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002207
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002208 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002209 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002210 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002211
Guido van Rossumd57fd912000-03-10 22:53:23 +00002212 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002213 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002214 *p++ = (char)(0xc0 | (ch >> 6));
2215 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002216 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002217 else {
Tim Peters602f7402002-04-27 18:03:26 +00002218 /* Encode UCS2 Unicode ordinals */
2219 if (ch < 0x10000) {
2220 /* Special case: check for high surrogate */
2221 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2222 Py_UCS4 ch2 = s[i];
2223 /* Check for low surrogate and combine the two to
2224 form a UCS4 value */
2225 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002226 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002227 i++;
2228 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002229 }
Tim Peters602f7402002-04-27 18:03:26 +00002230 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002231 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002232 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002233 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2234 *p++ = (char)(0x80 | (ch & 0x3f));
2235 continue;
2236 }
2237encodeUCS4:
2238 /* Encode UCS4 Unicode ordinals */
2239 *p++ = (char)(0xf0 | (ch >> 18));
2240 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2241 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2242 *p++ = (char)(0x80 | (ch & 0x3f));
2243 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002244 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002245
Guido van Rossum98297ee2007-11-06 21:34:58 +00002246 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002247 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002248 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002249 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002250 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002251 }
2252 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002253 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00002254 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002255 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002256 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002257 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002258 return result;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002259
Tim Peters602f7402002-04-27 18:03:26 +00002260#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002261}
2262
Guido van Rossumd57fd912000-03-10 22:53:23 +00002263PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2264{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002265 if (!PyUnicode_Check(unicode)) {
2266 PyErr_BadArgument();
2267 return NULL;
2268 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002269 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2270 PyUnicode_GET_SIZE(unicode),
2271 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002272}
2273
Walter Dörwald41980ca2007-08-16 21:55:45 +00002274/* --- UTF-32 Codec ------------------------------------------------------- */
2275
2276PyObject *
2277PyUnicode_DecodeUTF32(const char *s,
2278 Py_ssize_t size,
2279 const char *errors,
2280 int *byteorder)
2281{
2282 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2283}
2284
2285PyObject *
2286PyUnicode_DecodeUTF32Stateful(const char *s,
2287 Py_ssize_t size,
2288 const char *errors,
2289 int *byteorder,
2290 Py_ssize_t *consumed)
2291{
2292 const char *starts = s;
2293 Py_ssize_t startinpos;
2294 Py_ssize_t endinpos;
2295 Py_ssize_t outpos;
2296 PyUnicodeObject *unicode;
2297 Py_UNICODE *p;
2298#ifndef Py_UNICODE_WIDE
2299 int i, pairs;
2300#else
2301 const int pairs = 0;
2302#endif
2303 const unsigned char *q, *e;
2304 int bo = 0; /* assume native ordering by default */
2305 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002306 /* Offsets from q for retrieving bytes in the right order. */
2307#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2308 int iorder[] = {0, 1, 2, 3};
2309#else
2310 int iorder[] = {3, 2, 1, 0};
2311#endif
2312 PyObject *errorHandler = NULL;
2313 PyObject *exc = NULL;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002314 /* On narrow builds we split characters outside the BMP into two
2315 codepoints => count how much extra space we need. */
2316#ifndef Py_UNICODE_WIDE
2317 for (i = pairs = 0; i < size/4; i++)
2318 if (((Py_UCS4 *)s)[i] >= 0x10000)
2319 pairs++;
2320#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00002321
2322 /* This might be one to much, because of a BOM */
2323 unicode = _PyUnicode_New((size+3)/4+pairs);
2324 if (!unicode)
2325 return NULL;
2326 if (size == 0)
2327 return (PyObject *)unicode;
2328
2329 /* Unpack UTF-32 encoded data */
2330 p = unicode->str;
2331 q = (unsigned char *)s;
2332 e = q + size;
2333
2334 if (byteorder)
2335 bo = *byteorder;
2336
2337 /* Check for BOM marks (U+FEFF) in the input and adjust current
2338 byte order setting accordingly. In native mode, the leading BOM
2339 mark is skipped, in all other modes, it is copied to the output
2340 stream as-is (giving a ZWNBSP character). */
2341 if (bo == 0) {
2342 if (size >= 4) {
2343 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2344 (q[iorder[1]] << 8) | q[iorder[0]];
2345#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2346 if (bom == 0x0000FEFF) {
2347 q += 4;
2348 bo = -1;
2349 }
2350 else if (bom == 0xFFFE0000) {
2351 q += 4;
2352 bo = 1;
2353 }
2354#else
2355 if (bom == 0x0000FEFF) {
2356 q += 4;
2357 bo = 1;
2358 }
2359 else if (bom == 0xFFFE0000) {
2360 q += 4;
2361 bo = -1;
2362 }
2363#endif
2364 }
2365 }
2366
2367 if (bo == -1) {
2368 /* force LE */
2369 iorder[0] = 0;
2370 iorder[1] = 1;
2371 iorder[2] = 2;
2372 iorder[3] = 3;
2373 }
2374 else if (bo == 1) {
2375 /* force BE */
2376 iorder[0] = 3;
2377 iorder[1] = 2;
2378 iorder[2] = 1;
2379 iorder[3] = 0;
2380 }
2381
2382 while (q < e) {
2383 Py_UCS4 ch;
2384 /* remaining bytes at the end? (size should be divisible by 4) */
2385 if (e-q<4) {
2386 if (consumed)
2387 break;
2388 errmsg = "truncated data";
2389 startinpos = ((const char *)q)-starts;
2390 endinpos = ((const char *)e)-starts;
2391 goto utf32Error;
2392 /* The remaining input chars are ignored if the callback
2393 chooses to skip the input */
2394 }
2395 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2396 (q[iorder[1]] << 8) | q[iorder[0]];
2397
2398 if (ch >= 0x110000)
2399 {
2400 errmsg = "codepoint not in range(0x110000)";
2401 startinpos = ((const char *)q)-starts;
2402 endinpos = startinpos+4;
2403 goto utf32Error;
2404 }
2405#ifndef Py_UNICODE_WIDE
2406 if (ch >= 0x10000)
2407 {
2408 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2409 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2410 }
2411 else
2412#endif
2413 *p++ = ch;
2414 q += 4;
2415 continue;
2416 utf32Error:
2417 outpos = p-PyUnicode_AS_UNICODE(unicode);
2418 if (unicode_decode_call_errorhandler(
2419 errors, &errorHandler,
2420 "utf32", errmsg,
2421 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2422 (PyObject **)&unicode, &outpos, &p))
2423 goto onError;
2424 }
2425
2426 if (byteorder)
2427 *byteorder = bo;
2428
2429 if (consumed)
2430 *consumed = (const char *)q-starts;
2431
2432 /* Adjust length */
2433 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2434 goto onError;
2435
2436 Py_XDECREF(errorHandler);
2437 Py_XDECREF(exc);
2438 return (PyObject *)unicode;
2439
2440onError:
2441 Py_DECREF(unicode);
2442 Py_XDECREF(errorHandler);
2443 Py_XDECREF(exc);
2444 return NULL;
2445}
2446
2447PyObject *
2448PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2449 Py_ssize_t size,
2450 const char *errors,
2451 int byteorder)
2452{
Guido van Rossum98297ee2007-11-06 21:34:58 +00002453 PyObject *v, *result;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002454 unsigned char *p;
2455#ifndef Py_UNICODE_WIDE
2456 int i, pairs;
2457#else
2458 const int pairs = 0;
2459#endif
2460 /* Offsets from p for storing byte pairs in the right order. */
2461#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2462 int iorder[] = {0, 1, 2, 3};
2463#else
2464 int iorder[] = {3, 2, 1, 0};
2465#endif
2466
2467#define STORECHAR(CH) \
2468 do { \
2469 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2470 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2471 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2472 p[iorder[0]] = (CH) & 0xff; \
2473 p += 4; \
2474 } while(0)
2475
2476 /* In narrow builds we can output surrogate pairs as one codepoint,
2477 so we need less space. */
2478#ifndef Py_UNICODE_WIDE
2479 for (i = pairs = 0; i < size-1; i++)
2480 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2481 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2482 pairs++;
2483#endif
Christian Heimes9c4756e2008-05-26 13:22:05 +00002484 v = PyByteArray_FromStringAndSize(NULL,
Walter Dörwald41980ca2007-08-16 21:55:45 +00002485 4 * (size - pairs + (byteorder == 0)));
2486 if (v == NULL)
2487 return NULL;
2488
Christian Heimes9c4756e2008-05-26 13:22:05 +00002489 p = (unsigned char *)PyByteArray_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002490 if (byteorder == 0)
2491 STORECHAR(0xFEFF);
2492 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002493 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002494
2495 if (byteorder == -1) {
2496 /* force LE */
2497 iorder[0] = 0;
2498 iorder[1] = 1;
2499 iorder[2] = 2;
2500 iorder[3] = 3;
2501 }
2502 else if (byteorder == 1) {
2503 /* force BE */
2504 iorder[0] = 3;
2505 iorder[1] = 2;
2506 iorder[2] = 1;
2507 iorder[3] = 0;
2508 }
2509
2510 while (size-- > 0) {
2511 Py_UCS4 ch = *s++;
2512#ifndef Py_UNICODE_WIDE
2513 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2514 Py_UCS4 ch2 = *s;
2515 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2516 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2517 s++;
2518 size--;
2519 }
2520 }
2521#endif
2522 STORECHAR(ch);
2523 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002524
2525 done:
Christian Heimes72b710a2008-05-26 13:28:38 +00002526 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
Guido van Rossum98297ee2007-11-06 21:34:58 +00002527 Py_DECREF(v);
2528 return result;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002529#undef STORECHAR
2530}
2531
2532PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2533{
2534 if (!PyUnicode_Check(unicode)) {
2535 PyErr_BadArgument();
2536 return NULL;
2537 }
2538 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2539 PyUnicode_GET_SIZE(unicode),
2540 NULL,
2541 0);
2542}
2543
Guido van Rossumd57fd912000-03-10 22:53:23 +00002544/* --- UTF-16 Codec ------------------------------------------------------- */
2545
Tim Peters772747b2001-08-09 22:21:55 +00002546PyObject *
2547PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002548 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002549 const char *errors,
2550 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002551{
Walter Dörwald69652032004-09-07 20:24:22 +00002552 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2553}
2554
2555PyObject *
2556PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002557 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002558 const char *errors,
2559 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002560 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002561{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002562 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002563 Py_ssize_t startinpos;
2564 Py_ssize_t endinpos;
2565 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002566 PyUnicodeObject *unicode;
2567 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002568 const unsigned char *q, *e;
2569 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002570 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002571 /* Offsets from q for retrieving byte pairs in the right order. */
2572#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2573 int ihi = 1, ilo = 0;
2574#else
2575 int ihi = 0, ilo = 1;
2576#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002577 PyObject *errorHandler = NULL;
2578 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002579
2580 /* Note: size will always be longer than the resulting Unicode
2581 character count */
2582 unicode = _PyUnicode_New(size);
2583 if (!unicode)
2584 return NULL;
2585 if (size == 0)
2586 return (PyObject *)unicode;
2587
2588 /* Unpack UTF-16 encoded data */
2589 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002590 q = (unsigned char *)s;
2591 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002592
2593 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002594 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002595
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002596 /* Check for BOM marks (U+FEFF) in the input and adjust current
2597 byte order setting accordingly. In native mode, the leading BOM
2598 mark is skipped, in all other modes, it is copied to the output
2599 stream as-is (giving a ZWNBSP character). */
2600 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002601 if (size >= 2) {
2602 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002603#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002604 if (bom == 0xFEFF) {
2605 q += 2;
2606 bo = -1;
2607 }
2608 else if (bom == 0xFFFE) {
2609 q += 2;
2610 bo = 1;
2611 }
Tim Petersced69f82003-09-16 20:30:58 +00002612#else
Walter Dörwald69652032004-09-07 20:24:22 +00002613 if (bom == 0xFEFF) {
2614 q += 2;
2615 bo = 1;
2616 }
2617 else if (bom == 0xFFFE) {
2618 q += 2;
2619 bo = -1;
2620 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002621#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002622 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002623 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002624
Tim Peters772747b2001-08-09 22:21:55 +00002625 if (bo == -1) {
2626 /* force LE */
2627 ihi = 1;
2628 ilo = 0;
2629 }
2630 else if (bo == 1) {
2631 /* force BE */
2632 ihi = 0;
2633 ilo = 1;
2634 }
2635
2636 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002637 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002638 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002639 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002640 if (consumed)
2641 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002642 errmsg = "truncated data";
2643 startinpos = ((const char *)q)-starts;
2644 endinpos = ((const char *)e)-starts;
2645 goto utf16Error;
2646 /* The remaining input chars are ignored if the callback
2647 chooses to skip the input */
2648 }
2649 ch = (q[ihi] << 8) | q[ilo];
2650
Tim Peters772747b2001-08-09 22:21:55 +00002651 q += 2;
2652
Guido van Rossumd57fd912000-03-10 22:53:23 +00002653 if (ch < 0xD800 || ch > 0xDFFF) {
2654 *p++ = ch;
2655 continue;
2656 }
2657
2658 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002659 if (q >= e) {
2660 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002661 startinpos = (((const char *)q)-2)-starts;
2662 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002663 goto utf16Error;
2664 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002665 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002666 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2667 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002668 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002669#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002670 *p++ = ch;
2671 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002672#else
2673 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002674#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002675 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002676 }
2677 else {
2678 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002679 startinpos = (((const char *)q)-4)-starts;
2680 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002681 goto utf16Error;
2682 }
2683
Guido van Rossumd57fd912000-03-10 22:53:23 +00002684 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002685 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002686 startinpos = (((const char *)q)-2)-starts;
2687 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002688 /* Fall through to report the error */
2689
2690 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002691 outpos = p-PyUnicode_AS_UNICODE(unicode);
2692 if (unicode_decode_call_errorhandler(
2693 errors, &errorHandler,
2694 "utf16", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002695 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002696 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002697 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002698 }
2699
2700 if (byteorder)
2701 *byteorder = bo;
2702
Walter Dörwald69652032004-09-07 20:24:22 +00002703 if (consumed)
2704 *consumed = (const char *)q-starts;
2705
Guido van Rossumd57fd912000-03-10 22:53:23 +00002706 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002707 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002708 goto onError;
2709
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002710 Py_XDECREF(errorHandler);
2711 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002712 return (PyObject *)unicode;
2713
2714onError:
2715 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002716 Py_XDECREF(errorHandler);
2717 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002718 return NULL;
2719}
2720
Tim Peters772747b2001-08-09 22:21:55 +00002721PyObject *
2722PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002723 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002724 const char *errors,
2725 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002726{
Guido van Rossum98297ee2007-11-06 21:34:58 +00002727 PyObject *v, *result;
Tim Peters772747b2001-08-09 22:21:55 +00002728 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002729#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00002730 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002731#else
2732 const int pairs = 0;
2733#endif
Tim Peters772747b2001-08-09 22:21:55 +00002734 /* Offsets from p for storing byte pairs in the right order. */
2735#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2736 int ihi = 1, ilo = 0;
2737#else
2738 int ihi = 0, ilo = 1;
2739#endif
2740
2741#define STORECHAR(CH) \
2742 do { \
2743 p[ihi] = ((CH) >> 8) & 0xff; \
2744 p[ilo] = (CH) & 0xff; \
2745 p += 2; \
2746 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002747
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002748#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002749 for (i = pairs = 0; i < size; i++)
2750 if (s[i] >= 0x10000)
2751 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002752#endif
Christian Heimes9c4756e2008-05-26 13:22:05 +00002753 v = PyByteArray_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002754 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002755 if (v == NULL)
2756 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002757
Christian Heimes9c4756e2008-05-26 13:22:05 +00002758 p = (unsigned char *)PyByteArray_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002759 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002760 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002761 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002762 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00002763
2764 if (byteorder == -1) {
2765 /* force LE */
2766 ihi = 1;
2767 ilo = 0;
2768 }
2769 else if (byteorder == 1) {
2770 /* force BE */
2771 ihi = 0;
2772 ilo = 1;
2773 }
2774
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002775 while (size-- > 0) {
2776 Py_UNICODE ch = *s++;
2777 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002778#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002779 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002780 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2781 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002782 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002783#endif
Tim Peters772747b2001-08-09 22:21:55 +00002784 STORECHAR(ch);
2785 if (ch2)
2786 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002787 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002788
2789 done:
Christian Heimes72b710a2008-05-26 13:28:38 +00002790 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
Guido van Rossum98297ee2007-11-06 21:34:58 +00002791 Py_DECREF(v);
2792 return result;
Tim Peters772747b2001-08-09 22:21:55 +00002793#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002794}
2795
2796PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2797{
2798 if (!PyUnicode_Check(unicode)) {
2799 PyErr_BadArgument();
2800 return NULL;
2801 }
2802 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2803 PyUnicode_GET_SIZE(unicode),
2804 NULL,
2805 0);
2806}
2807
2808/* --- Unicode Escape Codec ----------------------------------------------- */
2809
Fredrik Lundh06d12682001-01-24 07:59:11 +00002810static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002811
Guido van Rossumd57fd912000-03-10 22:53:23 +00002812PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002813 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002814 const char *errors)
2815{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002816 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002817 Py_ssize_t startinpos;
2818 Py_ssize_t endinpos;
2819 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002820 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002821 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002822 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002823 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002824 char* message;
2825 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002826 PyObject *errorHandler = NULL;
2827 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002828
Guido van Rossumd57fd912000-03-10 22:53:23 +00002829 /* Escaped strings will always be longer than the resulting
2830 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002831 length after conversion to the true value.
2832 (but if the error callback returns a long replacement string
2833 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002834 v = _PyUnicode_New(size);
2835 if (v == NULL)
2836 goto onError;
2837 if (size == 0)
2838 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002839
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002840 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002841 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002842
Guido van Rossumd57fd912000-03-10 22:53:23 +00002843 while (s < end) {
2844 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002845 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002846 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002847
2848 /* Non-escape characters are interpreted as Unicode ordinals */
2849 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002850 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002851 continue;
2852 }
2853
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002854 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002855 /* \ - Escapes */
2856 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002857 c = *s++;
2858 if (s > end)
2859 c = '\0'; /* Invalid after \ */
2860 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002861
2862 /* \x escapes */
2863 case '\n': break;
2864 case '\\': *p++ = '\\'; break;
2865 case '\'': *p++ = '\''; break;
2866 case '\"': *p++ = '\"'; break;
2867 case 'b': *p++ = '\b'; break;
2868 case 'f': *p++ = '\014'; break; /* FF */
2869 case 't': *p++ = '\t'; break;
2870 case 'n': *p++ = '\n'; break;
2871 case 'r': *p++ = '\r'; break;
2872 case 'v': *p++ = '\013'; break; /* VT */
2873 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2874
2875 /* \OOO (octal) escapes */
2876 case '0': case '1': case '2': case '3':
2877 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002878 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002879 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002880 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002881 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002882 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002883 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002884 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002885 break;
2886
Fredrik Lundhccc74732001-02-18 22:13:49 +00002887 /* hex escapes */
2888 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002889 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002890 digits = 2;
2891 message = "truncated \\xXX escape";
2892 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002893
Fredrik Lundhccc74732001-02-18 22:13:49 +00002894 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002895 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002896 digits = 4;
2897 message = "truncated \\uXXXX escape";
2898 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002899
Fredrik Lundhccc74732001-02-18 22:13:49 +00002900 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002901 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002902 digits = 8;
2903 message = "truncated \\UXXXXXXXX escape";
2904 hexescape:
2905 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002906 outpos = p-PyUnicode_AS_UNICODE(v);
2907 if (s+digits>end) {
2908 endinpos = size;
2909 if (unicode_decode_call_errorhandler(
2910 errors, &errorHandler,
2911 "unicodeescape", "end of string in escape sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002912 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002913 (PyObject **)&v, &outpos, &p))
2914 goto onError;
2915 goto nextByte;
2916 }
2917 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002918 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00002919 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002920 endinpos = (s+i+1)-starts;
2921 if (unicode_decode_call_errorhandler(
2922 errors, &errorHandler,
2923 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002924 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002925 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002926 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002927 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002928 }
2929 chr = (chr<<4) & ~0xF;
2930 if (c >= '0' && c <= '9')
2931 chr += c - '0';
2932 else if (c >= 'a' && c <= 'f')
2933 chr += 10 + c - 'a';
2934 else
2935 chr += 10 + c - 'A';
2936 }
2937 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002938 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002939 /* _decoding_error will have already written into the
2940 target buffer. */
2941 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002942 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002943 /* when we get here, chr is a 32-bit unicode character */
2944 if (chr <= 0xffff)
2945 /* UCS-2 character */
2946 *p++ = (Py_UNICODE) chr;
2947 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002948 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002949 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002950#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002951 *p++ = chr;
2952#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002953 chr -= 0x10000L;
2954 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002955 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002956#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002957 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002958 endinpos = s-starts;
2959 outpos = p-PyUnicode_AS_UNICODE(v);
2960 if (unicode_decode_call_errorhandler(
2961 errors, &errorHandler,
2962 "unicodeescape", "illegal Unicode character",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002963 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002964 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002965 goto onError;
2966 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002967 break;
2968
2969 /* \N{name} */
2970 case 'N':
2971 message = "malformed \\N character escape";
2972 if (ucnhash_CAPI == NULL) {
2973 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002974 PyObject *m, *api;
Christian Heimes072c0f12008-01-03 23:01:04 +00002975 m = PyImport_ImportModuleNoBlock("unicodedata");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002976 if (m == NULL)
2977 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002978 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002979 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002980 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002981 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002982 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002983 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002984 if (ucnhash_CAPI == NULL)
2985 goto ucnhashError;
2986 }
2987 if (*s == '{') {
2988 const char *start = s+1;
2989 /* look for the closing brace */
2990 while (*s != '}' && s < end)
2991 s++;
2992 if (s > start && s < end && *s == '}') {
2993 /* found a name. look it up in the unicode database */
2994 message = "unknown Unicode character name";
2995 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002996 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002997 goto store;
2998 }
2999 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003000 endinpos = s-starts;
3001 outpos = p-PyUnicode_AS_UNICODE(v);
3002 if (unicode_decode_call_errorhandler(
3003 errors, &errorHandler,
3004 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003005 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003006 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003007 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003008 break;
3009
3010 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003011 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003012 message = "\\ at end of string";
3013 s--;
3014 endinpos = s-starts;
3015 outpos = p-PyUnicode_AS_UNICODE(v);
3016 if (unicode_decode_call_errorhandler(
3017 errors, &errorHandler,
3018 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003019 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003020 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003021 goto onError;
3022 }
3023 else {
3024 *p++ = '\\';
3025 *p++ = (unsigned char)s[-1];
3026 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003027 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003028 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003029 nextByte:
3030 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003031 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003032 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003033 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003034 Py_XDECREF(errorHandler);
3035 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003036 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003037
Fredrik Lundhccc74732001-02-18 22:13:49 +00003038ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003039 PyErr_SetString(
3040 PyExc_UnicodeError,
3041 "\\N escapes not supported (can't load unicodedata module)"
3042 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003043 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003044 Py_XDECREF(errorHandler);
3045 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003046 return NULL;
3047
Fredrik Lundhccc74732001-02-18 22:13:49 +00003048onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003049 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003050 Py_XDECREF(errorHandler);
3051 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003052 return NULL;
3053}
3054
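/* Return a Unicode-Escape string version of the Unicode object.

   NOTE(review): the sentence below refers to a `quotes` flag, but none of
   the encoder functions that follow take such a parameter; it looks stale
   -- confirm before relying on it.

   If quotes is true, the string is enclosed in u"" or u'' quotes as
   appropriate.

*/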
3061
Thomas Wouters477c8d52006-05-27 19:21:47 +00003062Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
3063 Py_ssize_t size,
3064 Py_UNICODE ch)
3065{
3066 /* like wcschr, but doesn't stop at NULL characters */
3067
3068 while (size-- > 0) {
3069 if (*s == ch)
3070 return s;
3071 s++;
3072 }
3073
3074 return NULL;
3075}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003076
Walter Dörwald79e913e2007-05-12 11:08:06 +00003077static const char *hexdigits = "0123456789abcdef";
3078
3079PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
3080 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003081{
Guido van Rossum98297ee2007-11-06 21:34:58 +00003082 PyObject *repr, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003083 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003084
Thomas Wouters89f507f2006-12-13 04:49:30 +00003085 /* XXX(nnorwitz): rather than over-allocating, it would be
3086 better to choose a different scheme. Perhaps scan the
3087 first N-chars of the string and allocate based on that size.
3088 */
3089 /* Initial allocation is based on the longest-possible unichr
3090 escape.
3091
3092 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3093 unichr, so in this case it's the longest unichr escape. In
3094 narrow (UTF-16) builds this is five chars per source unichr
3095 since there are two unichrs in the surrogate pair, so in narrow
3096 (UTF-16) builds it's not the longest unichr escape.
3097
3098 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3099 so in the narrow (UTF-16) build case it's the longest unichr
3100 escape.
3101 */
3102
Christian Heimes9c4756e2008-05-26 13:22:05 +00003103 repr = PyByteArray_FromStringAndSize(NULL,
Thomas Wouters89f507f2006-12-13 04:49:30 +00003104#ifdef Py_UNICODE_WIDE
3105 + 10*size
3106#else
3107 + 6*size
3108#endif
3109 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003110 if (repr == NULL)
3111 return NULL;
3112
Christian Heimes9c4756e2008-05-26 13:22:05 +00003113 p = PyByteArray_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003114
Guido van Rossumd57fd912000-03-10 22:53:23 +00003115 while (size-- > 0) {
3116 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003117
Walter Dörwald79e913e2007-05-12 11:08:06 +00003118 /* Escape backslashes */
3119 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003120 *p++ = '\\';
3121 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003122 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003123 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003124
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003125#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003126 /* Map 21-bit characters to '\U00xxxxxx' */
3127 else if (ch >= 0x10000) {
3128 *p++ = '\\';
3129 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003130 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3131 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3132 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3133 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3134 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3135 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3136 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3137 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003138 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003139 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003140#else
3141 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003142 else if (ch >= 0xD800 && ch < 0xDC00) {
3143 Py_UNICODE ch2;
3144 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003145
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003146 ch2 = *s++;
3147 size--;
3148 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3149 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3150 *p++ = '\\';
3151 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003152 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3153 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3154 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3155 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3156 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3157 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3158 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3159 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003160 continue;
3161 }
3162 /* Fall through: isolated surrogates are copied as-is */
3163 s--;
3164 size++;
3165 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003166#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003167
Guido van Rossumd57fd912000-03-10 22:53:23 +00003168 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003169 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003170 *p++ = '\\';
3171 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003172 *p++ = hexdigits[(ch >> 12) & 0x000F];
3173 *p++ = hexdigits[(ch >> 8) & 0x000F];
3174 *p++ = hexdigits[(ch >> 4) & 0x000F];
3175 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003176 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003177
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003178 /* Map special whitespace to '\t', \n', '\r' */
3179 else if (ch == '\t') {
3180 *p++ = '\\';
3181 *p++ = 't';
3182 }
3183 else if (ch == '\n') {
3184 *p++ = '\\';
3185 *p++ = 'n';
3186 }
3187 else if (ch == '\r') {
3188 *p++ = '\\';
3189 *p++ = 'r';
3190 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003191
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003192 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003193 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003194 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003195 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003196 *p++ = hexdigits[(ch >> 4) & 0x000F];
3197 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003198 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003199
Guido van Rossumd57fd912000-03-10 22:53:23 +00003200 /* Copy everything else as-is */
3201 else
3202 *p++ = (char) ch;
3203 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003204
Christian Heimes72b710a2008-05-26 13:28:38 +00003205 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(repr),
Christian Heimes9c4756e2008-05-26 13:22:05 +00003206 p - PyByteArray_AS_STRING(repr));
Guido van Rossum98297ee2007-11-06 21:34:58 +00003207 Py_DECREF(repr);
3208 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003209}
3210
Guido van Rossumd57fd912000-03-10 22:53:23 +00003211PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3212{
Walter Dörwald79e913e2007-05-12 11:08:06 +00003213 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003214 if (!PyUnicode_Check(unicode)) {
3215 PyErr_BadArgument();
3216 return NULL;
3217 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003218 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3219 PyUnicode_GET_SIZE(unicode));
3220
3221 if (!s)
3222 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00003223 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(s),
Christian Heimes9c4756e2008-05-26 13:22:05 +00003224 PyByteArray_GET_SIZE(s));
Walter Dörwald79e913e2007-05-12 11:08:06 +00003225 Py_DECREF(s);
3226 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003227}
3228
3229/* --- Raw Unicode Escape Codec ------------------------------------------- */
3230
3231PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003232 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003233 const char *errors)
3234{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003235 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003236 Py_ssize_t startinpos;
3237 Py_ssize_t endinpos;
3238 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003239 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003240 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003241 const char *end;
3242 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003243 PyObject *errorHandler = NULL;
3244 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003245
Guido van Rossumd57fd912000-03-10 22:53:23 +00003246 /* Escaped strings will always be longer than the resulting
3247 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003248 length after conversion to the true value. (But decoding error
3249 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003250 v = _PyUnicode_New(size);
3251 if (v == NULL)
3252 goto onError;
3253 if (size == 0)
3254 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003255 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003256 end = s + size;
3257 while (s < end) {
3258 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00003259 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003260 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003261 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003262
3263 /* Non-escape characters are interpreted as Unicode ordinals */
3264 if (*s != '\\') {
3265 *p++ = (unsigned char)*s++;
3266 continue;
3267 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003268 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003269
3270 /* \u-escapes are only interpreted iff the number of leading
3271 backslashes if odd */
3272 bs = s;
3273 for (;s < end;) {
3274 if (*s != '\\')
3275 break;
3276 *p++ = (unsigned char)*s++;
3277 }
3278 if (((s - bs) & 1) == 0 ||
3279 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003280 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003281 continue;
3282 }
3283 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003284 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003285 s++;
3286
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003287 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003288 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003289 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003290 c = (unsigned char)*s;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003291 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003292 endinpos = s-starts;
3293 if (unicode_decode_call_errorhandler(
3294 errors, &errorHandler,
3295 "rawunicodeescape", "truncated \\uXXXX",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003296 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003297 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003298 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003299 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003300 }
3301 x = (x<<4) & ~0xF;
3302 if (c >= '0' && c <= '9')
3303 x += c - '0';
3304 else if (c >= 'a' && c <= 'f')
3305 x += 10 + c - 'a';
3306 else
3307 x += 10 + c - 'A';
3308 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00003309 if (x <= 0xffff)
3310 /* UCS-2 character */
3311 *p++ = (Py_UNICODE) x;
3312 else if (x <= 0x10ffff) {
3313 /* UCS-4 character. Either store directly, or as
3314 surrogate pair. */
3315#ifdef Py_UNICODE_WIDE
Christian Heimescc47b052008-03-25 14:56:36 +00003316 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003317#else
3318 x -= 0x10000L;
3319 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3320 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3321#endif
3322 } else {
3323 endinpos = s-starts;
3324 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003325 if (unicode_decode_call_errorhandler(
3326 errors, &errorHandler,
3327 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003328 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003329 (PyObject **)&v, &outpos, &p))
3330 goto onError;
3331 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003332 nextByte:
3333 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003334 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003335 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003336 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003337 Py_XDECREF(errorHandler);
3338 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003339 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003340
Guido van Rossumd57fd912000-03-10 22:53:23 +00003341 onError:
3342 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003343 Py_XDECREF(errorHandler);
3344 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003345 return NULL;
3346}
3347
3348PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003349 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003350{
Guido van Rossum98297ee2007-11-06 21:34:58 +00003351 PyObject *repr, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003352 char *p;
3353 char *q;
3354
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003355#ifdef Py_UNICODE_WIDE
Christian Heimes9c4756e2008-05-26 13:22:05 +00003356 repr = PyByteArray_FromStringAndSize(NULL, 10 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003357#else
Christian Heimes9c4756e2008-05-26 13:22:05 +00003358 repr = PyByteArray_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003359#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003360 if (repr == NULL)
3361 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003362 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003363 goto done;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003364
Christian Heimes9c4756e2008-05-26 13:22:05 +00003365 p = q = PyByteArray_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003366 while (size-- > 0) {
3367 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003368#ifdef Py_UNICODE_WIDE
3369 /* Map 32-bit characters to '\Uxxxxxxxx' */
3370 if (ch >= 0x10000) {
3371 *p++ = '\\';
3372 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003373 *p++ = hexdigits[(ch >> 28) & 0xf];
3374 *p++ = hexdigits[(ch >> 24) & 0xf];
3375 *p++ = hexdigits[(ch >> 20) & 0xf];
3376 *p++ = hexdigits[(ch >> 16) & 0xf];
3377 *p++ = hexdigits[(ch >> 12) & 0xf];
3378 *p++ = hexdigits[(ch >> 8) & 0xf];
3379 *p++ = hexdigits[(ch >> 4) & 0xf];
3380 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003381 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003382 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00003383#else
3384 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3385 if (ch >= 0xD800 && ch < 0xDC00) {
3386 Py_UNICODE ch2;
3387 Py_UCS4 ucs;
3388
3389 ch2 = *s++;
3390 size--;
3391 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3392 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3393 *p++ = '\\';
3394 *p++ = 'U';
3395 *p++ = hexdigits[(ucs >> 28) & 0xf];
3396 *p++ = hexdigits[(ucs >> 24) & 0xf];
3397 *p++ = hexdigits[(ucs >> 20) & 0xf];
3398 *p++ = hexdigits[(ucs >> 16) & 0xf];
3399 *p++ = hexdigits[(ucs >> 12) & 0xf];
3400 *p++ = hexdigits[(ucs >> 8) & 0xf];
3401 *p++ = hexdigits[(ucs >> 4) & 0xf];
3402 *p++ = hexdigits[ucs & 0xf];
3403 continue;
3404 }
3405 /* Fall through: isolated surrogates are copied as-is */
3406 s--;
3407 size++;
3408 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003409#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003410 /* Map 16-bit characters to '\uxxxx' */
3411 if (ch >= 256) {
3412 *p++ = '\\';
3413 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003414 *p++ = hexdigits[(ch >> 12) & 0xf];
3415 *p++ = hexdigits[(ch >> 8) & 0xf];
3416 *p++ = hexdigits[(ch >> 4) & 0xf];
3417 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003418 }
3419 /* Copy everything else as-is */
3420 else
3421 *p++ = (char) ch;
3422 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003423 size = p - q;
3424
3425 done:
Christian Heimes72b710a2008-05-26 13:28:38 +00003426 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(repr), size);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003427 Py_DECREF(repr);
3428 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003429}
3430
3431PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3432{
Walter Dörwald711005d2007-05-12 12:03:26 +00003433 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003434 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003435 PyErr_BadArgument();
3436 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003437 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003438 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3439 PyUnicode_GET_SIZE(unicode));
3440
3441 if (!s)
3442 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00003443 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(s),
Christian Heimes9c4756e2008-05-26 13:22:05 +00003444 PyByteArray_GET_SIZE(s));
Walter Dörwald711005d2007-05-12 12:03:26 +00003445 Py_DECREF(s);
3446 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003447}
3448
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003449/* --- Unicode Internal Codec ------------------------------------------- */
3450
3451PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003452 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003453 const char *errors)
3454{
3455 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003456 Py_ssize_t startinpos;
3457 Py_ssize_t endinpos;
3458 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003459 PyUnicodeObject *v;
3460 Py_UNICODE *p;
3461 const char *end;
3462 const char *reason;
3463 PyObject *errorHandler = NULL;
3464 PyObject *exc = NULL;
3465
Neal Norwitzd43069c2006-01-08 01:12:10 +00003466#ifdef Py_UNICODE_WIDE
3467 Py_UNICODE unimax = PyUnicode_GetMax();
3468#endif
3469
Thomas Wouters89f507f2006-12-13 04:49:30 +00003470 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003471 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3472 if (v == NULL)
3473 goto onError;
3474 if (PyUnicode_GetSize((PyObject *)v) == 0)
3475 return (PyObject *)v;
3476 p = PyUnicode_AS_UNICODE(v);
3477 end = s + size;
3478
3479 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00003480 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003481 /* We have to sanity check the raw data, otherwise doom looms for
3482 some malformed UCS-4 data. */
3483 if (
3484 #ifdef Py_UNICODE_WIDE
3485 *p > unimax || *p < 0 ||
3486 #endif
3487 end-s < Py_UNICODE_SIZE
3488 )
3489 {
3490 startinpos = s - starts;
3491 if (end-s < Py_UNICODE_SIZE) {
3492 endinpos = end-starts;
3493 reason = "truncated input";
3494 }
3495 else {
3496 endinpos = s - starts + Py_UNICODE_SIZE;
3497 reason = "illegal code point (> 0x10FFFF)";
3498 }
3499 outpos = p - PyUnicode_AS_UNICODE(v);
3500 if (unicode_decode_call_errorhandler(
3501 errors, &errorHandler,
3502 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003503 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003504 (PyObject **)&v, &outpos, &p)) {
3505 goto onError;
3506 }
3507 }
3508 else {
3509 p++;
3510 s += Py_UNICODE_SIZE;
3511 }
3512 }
3513
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003514 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003515 goto onError;
3516 Py_XDECREF(errorHandler);
3517 Py_XDECREF(exc);
3518 return (PyObject *)v;
3519
3520 onError:
3521 Py_XDECREF(v);
3522 Py_XDECREF(errorHandler);
3523 Py_XDECREF(exc);
3524 return NULL;
3525}
3526
Guido van Rossumd57fd912000-03-10 22:53:23 +00003527/* --- Latin-1 Codec ------------------------------------------------------ */
3528
3529PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003530 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003531 const char *errors)
3532{
3533 PyUnicodeObject *v;
3534 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003535
Guido van Rossumd57fd912000-03-10 22:53:23 +00003536 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003537 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003538 Py_UNICODE r = *(unsigned char*)s;
3539 return PyUnicode_FromUnicode(&r, 1);
3540 }
3541
Guido van Rossumd57fd912000-03-10 22:53:23 +00003542 v = _PyUnicode_New(size);
3543 if (v == NULL)
3544 goto onError;
3545 if (size == 0)
3546 return (PyObject *)v;
3547 p = PyUnicode_AS_UNICODE(v);
3548 while (size-- > 0)
3549 *p++ = (unsigned char)*s++;
3550 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003551
Guido van Rossumd57fd912000-03-10 22:53:23 +00003552 onError:
3553 Py_XDECREF(v);
3554 return NULL;
3555}
3556
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003557/* create or adjust a UnicodeEncodeError */
3558static void make_encode_exception(PyObject **exceptionObject,
3559 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003560 const Py_UNICODE *unicode, Py_ssize_t size,
3561 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003562 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003563{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003564 if (*exceptionObject == NULL) {
3565 *exceptionObject = PyUnicodeEncodeError_Create(
3566 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003567 }
3568 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003569 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3570 goto onError;
3571 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3572 goto onError;
3573 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3574 goto onError;
3575 return;
3576 onError:
3577 Py_DECREF(*exceptionObject);
3578 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003579 }
3580}
3581
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003582/* raises a UnicodeEncodeError */
3583static void raise_encode_exception(PyObject **exceptionObject,
3584 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003585 const Py_UNICODE *unicode, Py_ssize_t size,
3586 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003587 const char *reason)
3588{
3589 make_encode_exception(exceptionObject,
3590 encoding, unicode, size, startpos, endpos, reason);
3591 if (*exceptionObject != NULL)
3592 PyCodec_StrictErrors(*exceptionObject);
3593}
3594
3595/* error handling callback helper:
3596 build arguments, call the callback and check the arguments,
3597 put the result into newpos and return the replacement string, which
3598 has to be freed by the caller */
3599static PyObject *unicode_encode_call_errorhandler(const char *errors,
3600 PyObject **errorHandler,
3601 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003602 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3603 Py_ssize_t startpos, Py_ssize_t endpos,
3604 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003605{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003606 static char *argparse = "O!n;encoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003607
3608 PyObject *restuple;
3609 PyObject *resunicode;
3610
3611 if (*errorHandler == NULL) {
3612 *errorHandler = PyCodec_LookupError(errors);
3613 if (*errorHandler == NULL)
3614 return NULL;
3615 }
3616
3617 make_encode_exception(exceptionObject,
3618 encoding, unicode, size, startpos, endpos, reason);
3619 if (*exceptionObject == NULL)
3620 return NULL;
3621
3622 restuple = PyObject_CallFunctionObjArgs(
3623 *errorHandler, *exceptionObject, NULL);
3624 if (restuple == NULL)
3625 return NULL;
3626 if (!PyTuple_Check(restuple)) {
3627 PyErr_Format(PyExc_TypeError, &argparse[4]);
3628 Py_DECREF(restuple);
3629 return NULL;
3630 }
3631 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3632 &resunicode, newpos)) {
3633 Py_DECREF(restuple);
3634 return NULL;
3635 }
3636 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003637 *newpos = size+*newpos;
3638 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003639 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003640 Py_DECREF(restuple);
3641 return NULL;
3642 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003643 Py_INCREF(resunicode);
3644 Py_DECREF(restuple);
3645 return resunicode;
3646}
3647
3648static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003649 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003650 const char *errors,
3651 int limit)
3652{
3653 /* output object */
3654 PyObject *res;
3655 /* pointers to the beginning and end+1 of input */
3656 const Py_UNICODE *startp = p;
3657 const Py_UNICODE *endp = p + size;
3658 /* pointer to the beginning of the unencodable characters */
3659 /* const Py_UNICODE *badp = NULL; */
3660 /* pointer into the output */
3661 char *str;
3662 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003663 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003664 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3665 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003666 PyObject *errorHandler = NULL;
3667 PyObject *exc = NULL;
Guido van Rossum98297ee2007-11-06 21:34:58 +00003668 PyObject *result = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003669 /* the following variable is used for caching string comparisons
3670 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3671 int known_errorHandler = -1;
3672
3673 /* allocate enough for a simple encoding without
3674 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00003675 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00003676 return PyBytes_FromStringAndSize(NULL, 0);
Christian Heimes9c4756e2008-05-26 13:22:05 +00003677 res = PyByteArray_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003678 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003679 return NULL;
Christian Heimes9c4756e2008-05-26 13:22:05 +00003680 str = PyByteArray_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003681 ressize = size;
3682
3683 while (p<endp) {
3684 Py_UNICODE c = *p;
3685
3686 /* can we encode this? */
3687 if (c<limit) {
3688 /* no overflow check, because we know that the space is enough */
3689 *str++ = (char)c;
3690 ++p;
3691 }
3692 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003693 Py_ssize_t unicodepos = p-startp;
3694 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003695 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003696 Py_ssize_t repsize;
3697 Py_ssize_t newpos;
3698 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003699 Py_UNICODE *uni2;
3700 /* startpos for collecting unencodable chars */
3701 const Py_UNICODE *collstart = p;
3702 const Py_UNICODE *collend = p;
3703 /* find all unecodable characters */
3704 while ((collend < endp) && ((*collend)>=limit))
3705 ++collend;
3706 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3707 if (known_errorHandler==-1) {
3708 if ((errors==NULL) || (!strcmp(errors, "strict")))
3709 known_errorHandler = 1;
3710 else if (!strcmp(errors, "replace"))
3711 known_errorHandler = 2;
3712 else if (!strcmp(errors, "ignore"))
3713 known_errorHandler = 3;
3714 else if (!strcmp(errors, "xmlcharrefreplace"))
3715 known_errorHandler = 4;
3716 else
3717 known_errorHandler = 0;
3718 }
3719 switch (known_errorHandler) {
3720 case 1: /* strict */
3721 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3722 goto onError;
3723 case 2: /* replace */
3724 while (collstart++<collend)
3725 *str++ = '?'; /* fall through */
3726 case 3: /* ignore */
3727 p = collend;
3728 break;
3729 case 4: /* xmlcharrefreplace */
Christian Heimes9c4756e2008-05-26 13:22:05 +00003730 respos = str - PyByteArray_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003731 /* determine replacement size (temporarily (mis)uses p) */
3732 for (p = collstart, repsize = 0; p < collend; ++p) {
3733 if (*p<10)
3734 repsize += 2+1+1;
3735 else if (*p<100)
3736 repsize += 2+2+1;
3737 else if (*p<1000)
3738 repsize += 2+3+1;
3739 else if (*p<10000)
3740 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003741#ifndef Py_UNICODE_WIDE
3742 else
3743 repsize += 2+5+1;
3744#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003745 else if (*p<100000)
3746 repsize += 2+5+1;
3747 else if (*p<1000000)
3748 repsize += 2+6+1;
3749 else
3750 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003751#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003752 }
3753 requiredsize = respos+repsize+(endp-collend);
3754 if (requiredsize > ressize) {
3755 if (requiredsize<2*ressize)
3756 requiredsize = 2*ressize;
Christian Heimes9c4756e2008-05-26 13:22:05 +00003757 if (PyByteArray_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003758 goto onError;
Christian Heimes9c4756e2008-05-26 13:22:05 +00003759 str = PyByteArray_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003760 ressize = requiredsize;
3761 }
3762 /* generate replacement (temporarily (mis)uses p) */
3763 for (p = collstart; p < collend; ++p) {
3764 str += sprintf(str, "&#%d;", (int)*p);
3765 }
3766 p = collend;
3767 break;
3768 default:
3769 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3770 encoding, reason, startp, size, &exc,
3771 collstart-startp, collend-startp, &newpos);
3772 if (repunicode == NULL)
3773 goto onError;
3774 /* need more space? (at least enough for what we
3775 have+the replacement+the rest of the string, so
3776 we won't have to check space for encodable characters) */
Christian Heimes9c4756e2008-05-26 13:22:05 +00003777 respos = str - PyByteArray_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003778 repsize = PyUnicode_GET_SIZE(repunicode);
3779 requiredsize = respos+repsize+(endp-collend);
3780 if (requiredsize > ressize) {
3781 if (requiredsize<2*ressize)
3782 requiredsize = 2*ressize;
Christian Heimes9c4756e2008-05-26 13:22:05 +00003783 if (PyByteArray_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003784 Py_DECREF(repunicode);
3785 goto onError;
3786 }
Christian Heimes9c4756e2008-05-26 13:22:05 +00003787 str = PyByteArray_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003788 ressize = requiredsize;
3789 }
3790 /* check if there is anything unencodable in the replacement
3791 and copy it to the output */
3792 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3793 c = *uni2;
3794 if (c >= limit) {
3795 raise_encode_exception(&exc, encoding, startp, size,
3796 unicodepos, unicodepos+1, reason);
3797 Py_DECREF(repunicode);
3798 goto onError;
3799 }
3800 *str = (char)c;
3801 }
3802 p = startp + newpos;
3803 Py_DECREF(repunicode);
3804 }
3805 }
3806 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003807 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(res),
Christian Heimes9c4756e2008-05-26 13:22:05 +00003808 str - PyByteArray_AS_STRING(res));
Guido van Rossum98297ee2007-11-06 21:34:58 +00003809 onError:
3810 Py_DECREF(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003811 Py_XDECREF(errorHandler);
3812 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003813 return result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003814}
3815
Guido van Rossumd57fd912000-03-10 22:53:23 +00003816PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003817 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003818 const char *errors)
3819{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003820 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003821}
3822
3823PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3824{
3825 if (!PyUnicode_Check(unicode)) {
3826 PyErr_BadArgument();
3827 return NULL;
3828 }
3829 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3830 PyUnicode_GET_SIZE(unicode),
3831 NULL);
3832}
3833
3834/* --- 7-bit ASCII Codec -------------------------------------------------- */
3835
Guido van Rossumd57fd912000-03-10 22:53:23 +00003836PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003837 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003838 const char *errors)
3839{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003840 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003841 PyUnicodeObject *v;
3842 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003843 Py_ssize_t startinpos;
3844 Py_ssize_t endinpos;
3845 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003846 const char *e;
3847 PyObject *errorHandler = NULL;
3848 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003849
Guido van Rossumd57fd912000-03-10 22:53:23 +00003850 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003851 if (size == 1 && *(unsigned char*)s < 128) {
3852 Py_UNICODE r = *(unsigned char*)s;
3853 return PyUnicode_FromUnicode(&r, 1);
3854 }
Tim Petersced69f82003-09-16 20:30:58 +00003855
Guido van Rossumd57fd912000-03-10 22:53:23 +00003856 v = _PyUnicode_New(size);
3857 if (v == NULL)
3858 goto onError;
3859 if (size == 0)
3860 return (PyObject *)v;
3861 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003862 e = s + size;
3863 while (s < e) {
3864 register unsigned char c = (unsigned char)*s;
3865 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003866 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003867 ++s;
3868 }
3869 else {
3870 startinpos = s-starts;
3871 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003872 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003873 if (unicode_decode_call_errorhandler(
3874 errors, &errorHandler,
3875 "ascii", "ordinal not in range(128)",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003876 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003877 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003878 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003879 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003880 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00003881 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003882 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003883 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003884 Py_XDECREF(errorHandler);
3885 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003886 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003887
Guido van Rossumd57fd912000-03-10 22:53:23 +00003888 onError:
3889 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003890 Py_XDECREF(errorHandler);
3891 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003892 return NULL;
3893}
3894
Guido van Rossumd57fd912000-03-10 22:53:23 +00003895PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003896 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003897 const char *errors)
3898{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003899 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003900}
3901
3902PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3903{
3904 if (!PyUnicode_Check(unicode)) {
3905 PyErr_BadArgument();
3906 return NULL;
3907 }
3908 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3909 PyUnicode_GET_SIZE(unicode),
3910 NULL);
3911}
3912
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003913#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003914
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003915/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003916
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003917#if SIZEOF_INT < SIZEOF_SSIZE_T
3918#define NEED_RETRY
3919#endif
3920
3921/* XXX This code is limited to "true" double-byte encodings, as
3922 a) it assumes an incomplete character consists of a single byte, and
3923 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3924 encodings, see IsDBCSLeadByteEx documentation. */
3925
/* Decide whether the byte at s[offset] should be treated as a DBCS lead
   byte.

   Returns 0 when IsDBCSLeadByte() rejects the byte.  Otherwise the
   character preceding it (as located by CharPrev()) is examined and 1 is
   returned when any of the following holds:
     - CharPrev() returned the same position,
     - the preceding character does not start with a lead byte, or
     - the preceding character is two bytes wide.
   In all remaining cases the result is 0. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *target = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*target))
        return 0;

    before = CharPrev(s, target);
    if (before == target)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return (target - before == 2);
}
3936
3937/*
3938 * Decode MBCS string into unicode object. If 'final' is set, converts
3939 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3940 */
3941static int decode_mbcs(PyUnicodeObject **v,
3942 const char *s, /* MBCS string */
3943 int size, /* sizeof MBCS string */
3944 int final)
3945{
3946 Py_UNICODE *p;
3947 Py_ssize_t n = 0;
3948 int usize = 0;
3949
3950 assert(size >= 0);
3951
3952 /* Skip trailing lead-byte unless 'final' is set */
3953 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3954 --size;
3955
3956 /* First get the size of the result */
3957 if (size > 0) {
3958 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3959 if (usize == 0) {
3960 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3961 return -1;
3962 }
3963 }
3964
3965 if (*v == NULL) {
3966 /* Create unicode object */
3967 *v = _PyUnicode_New(usize);
3968 if (*v == NULL)
3969 return -1;
3970 }
3971 else {
3972 /* Extend unicode object */
3973 n = PyUnicode_GET_SIZE(*v);
3974 if (_PyUnicode_Resize(v, n + usize) < 0)
3975 return -1;
3976 }
3977
3978 /* Do the conversion */
3979 if (size > 0) {
3980 p = PyUnicode_AS_UNICODE(*v) + n;
3981 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3982 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3983 return -1;
3984 }
3985 }
3986
3987 return size;
3988}
3989
3990PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3991 Py_ssize_t size,
3992 const char *errors,
3993 Py_ssize_t *consumed)
3994{
3995 PyUnicodeObject *v = NULL;
3996 int done;
3997
3998 if (consumed)
3999 *consumed = 0;
4000
4001#ifdef NEED_RETRY
4002 retry:
4003 if (size > INT_MAX)
4004 done = decode_mbcs(&v, s, INT_MAX, 0);
4005 else
4006#endif
4007 done = decode_mbcs(&v, s, (int)size, !consumed);
4008
4009 if (done < 0) {
4010 Py_XDECREF(v);
4011 return NULL;
4012 }
4013
4014 if (consumed)
4015 *consumed += done;
4016
4017#ifdef NEED_RETRY
4018 if (size > INT_MAX) {
4019 s += done;
4020 size -= done;
4021 goto retry;
4022 }
4023#endif
4024
4025 return (PyObject *)v;
4026}
4027
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004028PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004029 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004030 const char *errors)
4031{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004032 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4033}
4034
4035/*
4036 * Convert unicode into string object (MBCS).
4037 * Returns 0 if succeed, -1 otherwise.
4038 */
4039static int encode_mbcs(PyObject **repr,
4040 const Py_UNICODE *p, /* unicode */
4041 int size) /* size of unicode */
4042{
4043 int mbcssize = 0;
4044 Py_ssize_t n = 0;
4045
4046 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004047
4048 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004049 if (size > 0) {
4050 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4051 if (mbcssize == 0) {
4052 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4053 return -1;
4054 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004055 }
4056
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004057 if (*repr == NULL) {
4058 /* Create string object */
Christian Heimes72b710a2008-05-26 13:28:38 +00004059 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004060 if (*repr == NULL)
4061 return -1;
4062 }
4063 else {
4064 /* Extend string object */
Christian Heimes72b710a2008-05-26 13:28:38 +00004065 n = PyBytes_Size(*repr);
4066 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004067 return -1;
4068 }
4069
4070 /* Do the conversion */
4071 if (size > 0) {
Christian Heimes72b710a2008-05-26 13:28:38 +00004072 char *s = PyBytes_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004073 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4074 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4075 return -1;
4076 }
4077 }
4078
4079 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004080}
4081
4082PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004083 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004084 const char *errors)
4085{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004086 PyObject *repr = NULL;
4087 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004088
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004089#ifdef NEED_RETRY
4090 retry:
4091 if (size > INT_MAX)
4092 ret = encode_mbcs(&repr, p, INT_MAX);
4093 else
4094#endif
4095 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004096
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004097 if (ret < 0) {
4098 Py_XDECREF(repr);
4099 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004100 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004101
4102#ifdef NEED_RETRY
4103 if (size > INT_MAX) {
4104 p += INT_MAX;
4105 size -= INT_MAX;
4106 goto retry;
4107 }
4108#endif
4109
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004110 return repr;
4111}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004112
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004113PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4114{
4115 if (!PyUnicode_Check(unicode)) {
4116 PyErr_BadArgument();
4117 return NULL;
4118 }
4119 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
4120 PyUnicode_GET_SIZE(unicode),
4121 NULL);
4122}
4123
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004124#undef NEED_RETRY
4125
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004126#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004127
Guido van Rossumd57fd912000-03-10 22:53:23 +00004128/* --- Character Mapping Codec -------------------------------------------- */
4129
Guido van Rossumd57fd912000-03-10 22:53:23 +00004130PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004131 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004132 PyObject *mapping,
4133 const char *errors)
4134{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004135 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004136 Py_ssize_t startinpos;
4137 Py_ssize_t endinpos;
4138 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004139 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004140 PyUnicodeObject *v;
4141 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004142 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004143 PyObject *errorHandler = NULL;
4144 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004145 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004146 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004147
Guido van Rossumd57fd912000-03-10 22:53:23 +00004148 /* Default to Latin-1 */
4149 if (mapping == NULL)
4150 return PyUnicode_DecodeLatin1(s, size, errors);
4151
4152 v = _PyUnicode_New(size);
4153 if (v == NULL)
4154 goto onError;
4155 if (size == 0)
4156 return (PyObject *)v;
4157 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004158 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004159 if (PyUnicode_CheckExact(mapping)) {
4160 mapstring = PyUnicode_AS_UNICODE(mapping);
4161 maplen = PyUnicode_GET_SIZE(mapping);
4162 while (s < e) {
4163 unsigned char ch = *s;
4164 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004165
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004166 if (ch < maplen)
4167 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004168
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004169 if (x == 0xfffe) {
4170 /* undefined mapping */
4171 outpos = p-PyUnicode_AS_UNICODE(v);
4172 startinpos = s-starts;
4173 endinpos = startinpos+1;
4174 if (unicode_decode_call_errorhandler(
4175 errors, &errorHandler,
4176 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00004177 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004178 (PyObject **)&v, &outpos, &p)) {
4179 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00004180 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004181 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00004182 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004183 *p++ = x;
4184 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004185 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004186 }
4187 else {
4188 while (s < e) {
4189 unsigned char ch = *s;
4190 PyObject *w, *x;
4191
4192 /* Get mapping (char ordinal -> integer, Unicode char or None) */
Christian Heimes217cfd12007-12-02 14:31:20 +00004193 w = PyLong_FromLong((long)ch);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004194 if (w == NULL)
4195 goto onError;
4196 x = PyObject_GetItem(mapping, w);
4197 Py_DECREF(w);
4198 if (x == NULL) {
4199 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4200 /* No mapping found means: mapping is undefined. */
4201 PyErr_Clear();
4202 x = Py_None;
4203 Py_INCREF(x);
4204 } else
4205 goto onError;
4206 }
4207
4208 /* Apply mapping */
Christian Heimes217cfd12007-12-02 14:31:20 +00004209 if (PyLong_Check(x)) {
4210 long value = PyLong_AS_LONG(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004211 if (value < 0 || value > 65535) {
4212 PyErr_SetString(PyExc_TypeError,
4213 "character mapping must be in range(65536)");
4214 Py_DECREF(x);
4215 goto onError;
4216 }
4217 *p++ = (Py_UNICODE)value;
4218 }
4219 else if (x == Py_None) {
4220 /* undefined mapping */
4221 outpos = p-PyUnicode_AS_UNICODE(v);
4222 startinpos = s-starts;
4223 endinpos = startinpos+1;
4224 if (unicode_decode_call_errorhandler(
4225 errors, &errorHandler,
4226 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00004227 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004228 (PyObject **)&v, &outpos, &p)) {
4229 Py_DECREF(x);
4230 goto onError;
4231 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00004232 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004233 continue;
4234 }
4235 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004236 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004237
4238 if (targetsize == 1)
4239 /* 1-1 mapping */
4240 *p++ = *PyUnicode_AS_UNICODE(x);
4241
4242 else if (targetsize > 1) {
4243 /* 1-n mapping */
4244 if (targetsize > extrachars) {
4245 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004246 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4247 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004248 (targetsize << 2);
4249 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00004250 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004251 if (_PyUnicode_Resize(&v,
4252 PyUnicode_GET_SIZE(v) + needed) < 0) {
4253 Py_DECREF(x);
4254 goto onError;
4255 }
4256 p = PyUnicode_AS_UNICODE(v) + oldpos;
4257 }
4258 Py_UNICODE_COPY(p,
4259 PyUnicode_AS_UNICODE(x),
4260 targetsize);
4261 p += targetsize;
4262 extrachars -= targetsize;
4263 }
4264 /* 1-0 mapping: skip the character */
4265 }
4266 else {
4267 /* wrong return value */
4268 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00004269 "character mapping must return integer, None or str");
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004270 Py_DECREF(x);
4271 goto onError;
4272 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004273 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004274 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004275 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004276 }
4277 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004278 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004279 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004280 Py_XDECREF(errorHandler);
4281 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004282 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004283
Guido van Rossumd57fd912000-03-10 22:53:23 +00004284 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004285 Py_XDECREF(errorHandler);
4286 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004287 Py_XDECREF(v);
4288 return NULL;
4289}
4290
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004291/* Charmap encoding: the lookup table */
4292
4293struct encoding_map{
4294 PyObject_HEAD
4295 unsigned char level1[32];
4296 int count2, count3;
4297 unsigned char level23[1];
4298};
4299
4300static PyObject*
4301encoding_map_size(PyObject *obj, PyObject* args)
4302{
4303 struct encoding_map *map = (struct encoding_map*)obj;
Christian Heimes217cfd12007-12-02 14:31:20 +00004304 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004305 128*map->count3);
4306}
4307
4308static PyMethodDef encoding_map_methods[] = {
4309 {"size", encoding_map_size, METH_NOARGS,
4310 PyDoc_STR("Return the size (in bytes) of this object") },
4311 { 0 }
4312};
4313
4314static void
4315encoding_map_dealloc(PyObject* o)
4316{
4317 PyObject_FREE(o);
4318}
4319
4320static PyTypeObject EncodingMapType = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004321 PyVarObject_HEAD_INIT(NULL, 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004322 "EncodingMap", /*tp_name*/
4323 sizeof(struct encoding_map), /*tp_basicsize*/
4324 0, /*tp_itemsize*/
4325 /* methods */
4326 encoding_map_dealloc, /*tp_dealloc*/
4327 0, /*tp_print*/
4328 0, /*tp_getattr*/
4329 0, /*tp_setattr*/
4330 0, /*tp_compare*/
4331 0, /*tp_repr*/
4332 0, /*tp_as_number*/
4333 0, /*tp_as_sequence*/
4334 0, /*tp_as_mapping*/
4335 0, /*tp_hash*/
4336 0, /*tp_call*/
4337 0, /*tp_str*/
4338 0, /*tp_getattro*/
4339 0, /*tp_setattro*/
4340 0, /*tp_as_buffer*/
4341 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4342 0, /*tp_doc*/
4343 0, /*tp_traverse*/
4344 0, /*tp_clear*/
4345 0, /*tp_richcompare*/
4346 0, /*tp_weaklistoffset*/
4347 0, /*tp_iter*/
4348 0, /*tp_iternext*/
4349 encoding_map_methods, /*tp_methods*/
4350 0, /*tp_members*/
4351 0, /*tp_getset*/
4352 0, /*tp_base*/
4353 0, /*tp_dict*/
4354 0, /*tp_descr_get*/
4355 0, /*tp_descr_set*/
4356 0, /*tp_dictoffset*/
4357 0, /*tp_init*/
4358 0, /*tp_alloc*/
4359 0, /*tp_new*/
4360 0, /*tp_free*/
4361 0, /*tp_is_gc*/
4362};
4363
4364PyObject*
4365PyUnicode_BuildEncodingMap(PyObject* string)
4366{
4367 Py_UNICODE *decode;
4368 PyObject *result;
4369 struct encoding_map *mresult;
4370 int i;
4371 int need_dict = 0;
4372 unsigned char level1[32];
4373 unsigned char level2[512];
4374 unsigned char *mlevel1, *mlevel2, *mlevel3;
4375 int count2 = 0, count3 = 0;
4376
4377 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4378 PyErr_BadArgument();
4379 return NULL;
4380 }
4381 decode = PyUnicode_AS_UNICODE(string);
4382 memset(level1, 0xFF, sizeof level1);
4383 memset(level2, 0xFF, sizeof level2);
4384
4385 /* If there isn't a one-to-one mapping of NULL to \0,
4386 or if there are non-BMP characters, we need to use
4387 a mapping dictionary. */
4388 if (decode[0] != 0)
4389 need_dict = 1;
4390 for (i = 1; i < 256; i++) {
4391 int l1, l2;
4392 if (decode[i] == 0
4393 #ifdef Py_UNICODE_WIDE
4394 || decode[i] > 0xFFFF
4395 #endif
4396 ) {
4397 need_dict = 1;
4398 break;
4399 }
4400 if (decode[i] == 0xFFFE)
4401 /* unmapped character */
4402 continue;
4403 l1 = decode[i] >> 11;
4404 l2 = decode[i] >> 7;
4405 if (level1[l1] == 0xFF)
4406 level1[l1] = count2++;
4407 if (level2[l2] == 0xFF)
4408 level2[l2] = count3++;
4409 }
4410
4411 if (count2 >= 0xFF || count3 >= 0xFF)
4412 need_dict = 1;
4413
4414 if (need_dict) {
4415 PyObject *result = PyDict_New();
4416 PyObject *key, *value;
4417 if (!result)
4418 return NULL;
4419 for (i = 0; i < 256; i++) {
4420 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00004421 key = PyLong_FromLong(decode[i]);
4422 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004423 if (!key || !value)
4424 goto failed1;
4425 if (PyDict_SetItem(result, key, value) == -1)
4426 goto failed1;
4427 Py_DECREF(key);
4428 Py_DECREF(value);
4429 }
4430 return result;
4431 failed1:
4432 Py_XDECREF(key);
4433 Py_XDECREF(value);
4434 Py_DECREF(result);
4435 return NULL;
4436 }
4437
4438 /* Create a three-level trie */
4439 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4440 16*count2 + 128*count3 - 1);
4441 if (!result)
4442 return PyErr_NoMemory();
4443 PyObject_Init(result, &EncodingMapType);
4444 mresult = (struct encoding_map*)result;
4445 mresult->count2 = count2;
4446 mresult->count3 = count3;
4447 mlevel1 = mresult->level1;
4448 mlevel2 = mresult->level23;
4449 mlevel3 = mresult->level23 + 16*count2;
4450 memcpy(mlevel1, level1, 32);
4451 memset(mlevel2, 0xFF, 16*count2);
4452 memset(mlevel3, 0, 128*count3);
4453 count3 = 0;
4454 for (i = 1; i < 256; i++) {
4455 int o1, o2, o3, i2, i3;
4456 if (decode[i] == 0xFFFE)
4457 /* unmapped character */
4458 continue;
4459 o1 = decode[i]>>11;
4460 o2 = (decode[i]>>7) & 0xF;
4461 i2 = 16*mlevel1[o1] + o2;
4462 if (mlevel2[i2] == 0xFF)
4463 mlevel2[i2] = count3++;
4464 o3 = decode[i] & 0x7F;
4465 i3 = 128*mlevel2[i2] + o3;
4466 mlevel3[i3] = i;
4467 }
4468 return result;
4469}
4470
4471static int
4472encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4473{
4474 struct encoding_map *map = (struct encoding_map*)mapping;
4475 int l1 = c>>11;
4476 int l2 = (c>>7) & 0xF;
4477 int l3 = c & 0x7F;
4478 int i;
4479
4480#ifdef Py_UNICODE_WIDE
4481 if (c > 0xFFFF) {
4482 return -1;
4483 }
4484#endif
4485 if (c == 0)
4486 return 0;
4487 /* level 1*/
4488 i = map->level1[l1];
4489 if (i == 0xFF) {
4490 return -1;
4491 }
4492 /* level 2*/
4493 i = map->level23[16*i+l2];
4494 if (i == 0xFF) {
4495 return -1;
4496 }
4497 /* level 3 */
4498 i = map->level23[16*map->count2 + 128*i + l3];
4499 if (i == 0) {
4500 return -1;
4501 }
4502 return i;
4503}
4504
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004505/* Lookup the character ch in the mapping. If the character
4506 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004507 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004508static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004509{
Christian Heimes217cfd12007-12-02 14:31:20 +00004510 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004511 PyObject *x;
4512
4513 if (w == NULL)
4514 return NULL;
4515 x = PyObject_GetItem(mapping, w);
4516 Py_DECREF(w);
4517 if (x == NULL) {
4518 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4519 /* No mapping found means: mapping is undefined. */
4520 PyErr_Clear();
4521 x = Py_None;
4522 Py_INCREF(x);
4523 return x;
4524 } else
4525 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004526 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004527 else if (x == Py_None)
4528 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00004529 else if (PyLong_Check(x)) {
4530 long value = PyLong_AS_LONG(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004531 if (value < 0 || value > 255) {
4532 PyErr_SetString(PyExc_TypeError,
4533 "character mapping must be in range(256)");
4534 Py_DECREF(x);
4535 return NULL;
4536 }
4537 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004538 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004539 else if (PyBytes_Check(x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004540 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004541 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004542 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00004543 PyErr_Format(PyExc_TypeError,
Christian Heimesf3863112007-11-22 07:46:41 +00004544 "character mapping must return integer, bytes or None, not %.400s",
Walter Dörwald580ceed2007-05-09 10:39:19 +00004545 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004546 Py_DECREF(x);
4547 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004548 }
4549}
4550
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004551static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00004552charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004553{
Christian Heimes72b710a2008-05-26 13:28:38 +00004554 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004555 /* exponentially overallocate to minimize reallocations */
4556 if (requiredsize < 2*outsize)
4557 requiredsize = 2*outsize;
Christian Heimes72b710a2008-05-26 13:28:38 +00004558 if (_PyBytes_Resize(outobj, requiredsize))
Walter Dörwald827b0552007-05-12 13:23:53 +00004559 return -1;
Walter Dörwald827b0552007-05-12 13:23:53 +00004560 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004561}
4562
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS = 0,    /* the character was encoded and written */
    enc_FAILED = 1,     /* the mapping has no entry for the character */
    enc_EXCEPTION = 2   /* a lookup or resize error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004566/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00004567 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004568 space is available. Return a new reference to the object that
4569 was put in the output buffer, or Py_None, if the mapping was undefined
4570 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004571 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004572static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004573charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004574 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004575{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004576 PyObject *rep;
4577 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00004578 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004579
Christian Heimes90aa7642007-12-19 02:45:37 +00004580 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004581 int res = encoding_map_lookup(c, mapping);
4582 Py_ssize_t requiredsize = *outpos+1;
4583 if (res == -1)
4584 return enc_FAILED;
Guido van Rossum98297ee2007-11-06 21:34:58 +00004585 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004586 if (charmapencode_resize(outobj, outpos, requiredsize))
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004587 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00004588 outstart = PyBytes_AS_STRING(*outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004589 outstart[(*outpos)++] = (char)res;
4590 return enc_SUCCESS;
4591 }
4592
4593 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004594 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004595 return enc_EXCEPTION;
4596 else if (rep==Py_None) {
4597 Py_DECREF(rep);
4598 return enc_FAILED;
4599 } else {
Christian Heimes217cfd12007-12-02 14:31:20 +00004600 if (PyLong_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004601 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004602 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004603 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004604 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004605 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004606 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004607 outstart = PyBytes_AS_STRING(*outobj);
Christian Heimes217cfd12007-12-02 14:31:20 +00004608 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004609 }
4610 else {
Christian Heimes72b710a2008-05-26 13:28:38 +00004611 const char *repchars = PyBytes_AS_STRING(rep);
4612 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004613 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004614 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004615 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004616 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004617 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004618 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004619 outstart = PyBytes_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004620 memcpy(outstart + *outpos, repchars, repsize);
4621 *outpos += repsize;
4622 }
4623 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004624 Py_DECREF(rep);
4625 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004626}
4627
4628/* handle an error in PyUnicode_EncodeCharmap
4629 Return 0 on success, -1 on error */
4630static
4631int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004632 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004633 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004634 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004635 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004636{
4637 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004638 Py_ssize_t repsize;
4639 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004640 Py_UNICODE *uni2;
4641 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004642 Py_ssize_t collstartpos = *inpos;
4643 Py_ssize_t collendpos = *inpos+1;
4644 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004645 char *encoding = "charmap";
4646 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004647 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004648
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004649 /* find all unencodable characters */
4650 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004651 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00004652 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004653 int res = encoding_map_lookup(p[collendpos], mapping);
4654 if (res != -1)
4655 break;
4656 ++collendpos;
4657 continue;
4658 }
4659
4660 rep = charmapencode_lookup(p[collendpos], mapping);
4661 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004662 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004663 else if (rep!=Py_None) {
4664 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004665 break;
4666 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004667 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004668 ++collendpos;
4669 }
4670 /* cache callback name lookup
4671 * (if not done yet, i.e. it's the first error) */
4672 if (*known_errorHandler==-1) {
4673 if ((errors==NULL) || (!strcmp(errors, "strict")))
4674 *known_errorHandler = 1;
4675 else if (!strcmp(errors, "replace"))
4676 *known_errorHandler = 2;
4677 else if (!strcmp(errors, "ignore"))
4678 *known_errorHandler = 3;
4679 else if (!strcmp(errors, "xmlcharrefreplace"))
4680 *known_errorHandler = 4;
4681 else
4682 *known_errorHandler = 0;
4683 }
4684 switch (*known_errorHandler) {
4685 case 1: /* strict */
4686 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4687 return -1;
4688 case 2: /* replace */
4689 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4690 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004691 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004692 return -1;
4693 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004694 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004695 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4696 return -1;
4697 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004698 }
4699 /* fall through */
4700 case 3: /* ignore */
4701 *inpos = collendpos;
4702 break;
4703 case 4: /* xmlcharrefreplace */
4704 /* generate replacement (temporarily (mis)uses p) */
4705 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4706 char buffer[2+29+1+1];
4707 char *cp;
4708 sprintf(buffer, "&#%d;", (int)p[collpos]);
4709 for (cp = buffer; *cp; ++cp) {
4710 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004711 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004712 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004713 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004714 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4715 return -1;
4716 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004717 }
4718 }
4719 *inpos = collendpos;
4720 break;
4721 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004722 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004723 encoding, reason, p, size, exceptionObject,
4724 collstartpos, collendpos, &newpos);
4725 if (repunicode == NULL)
4726 return -1;
4727 /* generate replacement */
4728 repsize = PyUnicode_GET_SIZE(repunicode);
4729 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4730 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004731 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004732 return -1;
4733 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004734 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004735 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004736 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4737 return -1;
4738 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004739 }
4740 *inpos = newpos;
4741 Py_DECREF(repunicode);
4742 }
4743 return 0;
4744}
4745
Guido van Rossumd57fd912000-03-10 22:53:23 +00004746PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004747 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004748 PyObject *mapping,
4749 const char *errors)
4750{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004751 /* output object */
4752 PyObject *res = NULL;
4753 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004754 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004755 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004756 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004757 PyObject *errorHandler = NULL;
4758 PyObject *exc = NULL;
4759 /* the following variable is used for caching string comparisons
4760 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4761 * 3=ignore, 4=xmlcharrefreplace */
4762 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004763
4764 /* Default to Latin-1 */
4765 if (mapping == NULL)
4766 return PyUnicode_EncodeLatin1(p, size, errors);
4767
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004768 /* allocate enough for a simple encoding without
4769 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00004770 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004771 if (res == NULL)
4772 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004773 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004774 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004775
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004776 while (inpos<size) {
4777 /* try to encode it */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004778 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004779 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004780 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004781 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004782 if (charmap_encoding_error(p, size, &inpos, mapping,
4783 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004784 &known_errorHandler, &errorHandler, errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004785 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004786 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004787 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004788 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004789 else
4790 /* done with this character => adjust input position */
4791 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004792 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004793
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004794 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00004795 if (respos<PyBytes_GET_SIZE(res))
4796 _PyBytes_Resize(&res, respos);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004797
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004798 Py_XDECREF(exc);
4799 Py_XDECREF(errorHandler);
4800 return res;
4801
4802 onError:
4803 Py_XDECREF(res);
4804 Py_XDECREF(exc);
4805 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004806 return NULL;
4807}
4808
4809PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4810 PyObject *mapping)
4811{
4812 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4813 PyErr_BadArgument();
4814 return NULL;
4815 }
4816 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4817 PyUnicode_GET_SIZE(unicode),
4818 mapping,
4819 NULL);
4820}
4821
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004822/* create or adjust a UnicodeTranslateError */
4823static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004824 const Py_UNICODE *unicode, Py_ssize_t size,
4825 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004826 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004827{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004828 if (*exceptionObject == NULL) {
4829 *exceptionObject = PyUnicodeTranslateError_Create(
4830 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004831 }
4832 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004833 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4834 goto onError;
4835 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4836 goto onError;
4837 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4838 goto onError;
4839 return;
4840 onError:
4841 Py_DECREF(*exceptionObject);
4842 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004843 }
4844}
4845
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004846/* raises a UnicodeTranslateError */
4847static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004848 const Py_UNICODE *unicode, Py_ssize_t size,
4849 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004850 const char *reason)
4851{
4852 make_translate_exception(exceptionObject,
4853 unicode, size, startpos, endpos, reason);
4854 if (*exceptionObject != NULL)
4855 PyCodec_StrictErrors(*exceptionObject);
4856}
4857
4858/* error handling callback helper:
4859 build arguments, call the callback and check the arguments,
4860 put the result into newpos and return the replacement string, which
4861 has to be freed by the caller */
4862static PyObject *unicode_translate_call_errorhandler(const char *errors,
4863 PyObject **errorHandler,
4864 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004865 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4866 Py_ssize_t startpos, Py_ssize_t endpos,
4867 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004868{
Benjamin Peterson142957c2008-07-04 19:55:29 +00004869 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004870
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004871 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004872 PyObject *restuple;
4873 PyObject *resunicode;
4874
4875 if (*errorHandler == NULL) {
4876 *errorHandler = PyCodec_LookupError(errors);
4877 if (*errorHandler == NULL)
4878 return NULL;
4879 }
4880
4881 make_translate_exception(exceptionObject,
4882 unicode, size, startpos, endpos, reason);
4883 if (*exceptionObject == NULL)
4884 return NULL;
4885
4886 restuple = PyObject_CallFunctionObjArgs(
4887 *errorHandler, *exceptionObject, NULL);
4888 if (restuple == NULL)
4889 return NULL;
4890 if (!PyTuple_Check(restuple)) {
4891 PyErr_Format(PyExc_TypeError, &argparse[4]);
4892 Py_DECREF(restuple);
4893 return NULL;
4894 }
4895 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004896 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004897 Py_DECREF(restuple);
4898 return NULL;
4899 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004900 if (i_newpos<0)
4901 *newpos = size+i_newpos;
4902 else
4903 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004904 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004905 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004906 Py_DECREF(restuple);
4907 return NULL;
4908 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004909 Py_INCREF(resunicode);
4910 Py_DECREF(restuple);
4911 return resunicode;
4912}
4913
4914/* Lookup the character ch in the mapping and put the result in result,
4915 which must be decrefed by the caller.
4916 Return 0 on success, -1 on error */
4917static
4918int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4919{
Christian Heimes217cfd12007-12-02 14:31:20 +00004920 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004921 PyObject *x;
4922
4923 if (w == NULL)
4924 return -1;
4925 x = PyObject_GetItem(mapping, w);
4926 Py_DECREF(w);
4927 if (x == NULL) {
4928 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4929 /* No mapping found means: use 1:1 mapping. */
4930 PyErr_Clear();
4931 *result = NULL;
4932 return 0;
4933 } else
4934 return -1;
4935 }
4936 else if (x == Py_None) {
4937 *result = x;
4938 return 0;
4939 }
Christian Heimes217cfd12007-12-02 14:31:20 +00004940 else if (PyLong_Check(x)) {
4941 long value = PyLong_AS_LONG(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004942 long max = PyUnicode_GetMax();
4943 if (value < 0 || value > max) {
4944 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00004945 "character mapping must be in range(0x%x)", max+1);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004946 Py_DECREF(x);
4947 return -1;
4948 }
4949 *result = x;
4950 return 0;
4951 }
4952 else if (PyUnicode_Check(x)) {
4953 *result = x;
4954 return 0;
4955 }
4956 else {
4957 /* wrong return value */
4958 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00004959 "character mapping must return integer, None or str");
Walter Dörwald150523e2003-08-15 16:52:19 +00004960 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004961 return -1;
4962 }
4963}
4964/* ensure that *outobj is at least requiredsize characters long,
4965if not reallocate and adjust various state variables.
4966Return 0 on success, -1 on error */
4967static
Walter Dörwald4894c302003-10-24 14:25:28 +00004968int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004969 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004970{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004971 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004972 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004973 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004974 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004975 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004976 if (requiredsize < 2 * oldsize)
4977 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004978 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004979 return -1;
4980 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004981 }
4982 return 0;
4983}
4984/* lookup the character, put the result in the output string and adjust
4985 various state variables. Return a new reference to the object that
4986 was put in the output buffer in *result, or Py_None, if the mapping was
4987 undefined (in which case no character was written).
4988 The called must decref result.
4989 Return 0 on success, -1 on error. */
4990static
Walter Dörwald4894c302003-10-24 14:25:28 +00004991int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004992 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004993 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004994{
Walter Dörwald4894c302003-10-24 14:25:28 +00004995 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004996 return -1;
4997 if (*res==NULL) {
4998 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004999 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005000 }
5001 else if (*res==Py_None)
5002 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00005003 else if (PyLong_Check(*res)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005004 /* no overflow check, because we know that the space is enough */
Christian Heimes217cfd12007-12-02 14:31:20 +00005005 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005006 }
5007 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005008 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005009 if (repsize==1) {
5010 /* no overflow check, because we know that the space is enough */
5011 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5012 }
5013 else if (repsize!=0) {
5014 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005015 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00005016 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00005017 repsize - 1;
5018 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005019 return -1;
5020 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5021 *outp += repsize;
5022 }
5023 }
5024 else
5025 return -1;
5026 return 0;
5027}
5028
5029PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005030 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005031 PyObject *mapping,
5032 const char *errors)
5033{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005034 /* output object */
5035 PyObject *res = NULL;
5036 /* pointers to the beginning and end+1 of input */
5037 const Py_UNICODE *startp = p;
5038 const Py_UNICODE *endp = p + size;
5039 /* pointer into the output */
5040 Py_UNICODE *str;
5041 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005042 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005043 char *reason = "character maps to <undefined>";
5044 PyObject *errorHandler = NULL;
5045 PyObject *exc = NULL;
5046 /* the following variable is used for caching string comparisons
5047 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5048 * 3=ignore, 4=xmlcharrefreplace */
5049 int known_errorHandler = -1;
5050
Guido van Rossumd57fd912000-03-10 22:53:23 +00005051 if (mapping == NULL) {
5052 PyErr_BadArgument();
5053 return NULL;
5054 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005055
5056 /* allocate enough for a simple 1:1 translation without
5057 replacements, if we need more, we'll resize */
5058 res = PyUnicode_FromUnicode(NULL, size);
5059 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00005060 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005061 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005062 return res;
5063 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005064
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005065 while (p<endp) {
5066 /* try to encode it */
5067 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00005068 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005069 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005070 goto onError;
5071 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005072 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005073 if (x!=Py_None) /* it worked => adjust input pointer */
5074 ++p;
5075 else { /* untranslatable character */
5076 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005077 Py_ssize_t repsize;
5078 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005079 Py_UNICODE *uni2;
5080 /* startpos for collecting untranslatable chars */
5081 const Py_UNICODE *collstart = p;
5082 const Py_UNICODE *collend = p+1;
5083 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005084
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005085 /* find all untranslatable characters */
5086 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00005087 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005088 goto onError;
5089 Py_XDECREF(x);
5090 if (x!=Py_None)
5091 break;
5092 ++collend;
5093 }
5094 /* cache callback name lookup
5095 * (if not done yet, i.e. it's the first error) */
5096 if (known_errorHandler==-1) {
5097 if ((errors==NULL) || (!strcmp(errors, "strict")))
5098 known_errorHandler = 1;
5099 else if (!strcmp(errors, "replace"))
5100 known_errorHandler = 2;
5101 else if (!strcmp(errors, "ignore"))
5102 known_errorHandler = 3;
5103 else if (!strcmp(errors, "xmlcharrefreplace"))
5104 known_errorHandler = 4;
5105 else
5106 known_errorHandler = 0;
5107 }
5108 switch (known_errorHandler) {
5109 case 1: /* strict */
5110 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
5111 goto onError;
5112 case 2: /* replace */
5113 /* No need to check for space, this is a 1:1 replacement */
5114 for (coll = collstart; coll<collend; ++coll)
5115 *str++ = '?';
5116 /* fall through */
5117 case 3: /* ignore */
5118 p = collend;
5119 break;
5120 case 4: /* xmlcharrefreplace */
5121 /* generate replacement (temporarily (mis)uses p) */
5122 for (p = collstart; p < collend; ++p) {
5123 char buffer[2+29+1+1];
5124 char *cp;
5125 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00005126 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005127 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5128 goto onError;
5129 for (cp = buffer; *cp; ++cp)
5130 *str++ = *cp;
5131 }
5132 p = collend;
5133 break;
5134 default:
5135 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5136 reason, startp, size, &exc,
5137 collstart-startp, collend-startp, &newpos);
5138 if (repunicode == NULL)
5139 goto onError;
5140 /* generate replacement */
5141 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00005142 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005143 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5144 Py_DECREF(repunicode);
5145 goto onError;
5146 }
5147 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5148 *str++ = *uni2;
5149 p = startp + newpos;
5150 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005151 }
5152 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005153 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005154 /* Resize if we allocated to much */
5155 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005156 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00005157 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005158 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005159 }
5160 Py_XDECREF(exc);
5161 Py_XDECREF(errorHandler);
5162 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005163
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005164 onError:
5165 Py_XDECREF(res);
5166 Py_XDECREF(exc);
5167 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005168 return NULL;
5169}
5170
5171PyObject *PyUnicode_Translate(PyObject *str,
5172 PyObject *mapping,
5173 const char *errors)
5174{
5175 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005176
Guido van Rossumd57fd912000-03-10 22:53:23 +00005177 str = PyUnicode_FromObject(str);
5178 if (str == NULL)
5179 goto onError;
5180 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
5181 PyUnicode_GET_SIZE(str),
5182 mapping,
5183 errors);
5184 Py_DECREF(str);
5185 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005186
Guido van Rossumd57fd912000-03-10 22:53:23 +00005187 onError:
5188 Py_XDECREF(str);
5189 return NULL;
5190}
Tim Petersced69f82003-09-16 20:30:58 +00005191
Guido van Rossum9e896b32000-04-05 20:11:21 +00005192/* --- Decimal Encoder ---------------------------------------------------- */
5193
5194int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005195 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00005196 char *output,
5197 const char *errors)
5198{
5199 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005200 PyObject *errorHandler = NULL;
5201 PyObject *exc = NULL;
5202 const char *encoding = "decimal";
5203 const char *reason = "invalid decimal Unicode string";
5204 /* the following variable is used for caching string comparisons
5205 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5206 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005207
5208 if (output == NULL) {
5209 PyErr_BadArgument();
5210 return -1;
5211 }
5212
5213 p = s;
5214 end = s + length;
5215 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005216 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005217 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005218 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005219 Py_ssize_t repsize;
5220 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005221 Py_UNICODE *uni2;
5222 Py_UNICODE *collstart;
5223 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005224
Guido van Rossum9e896b32000-04-05 20:11:21 +00005225 if (Py_UNICODE_ISSPACE(ch)) {
5226 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005227 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005228 continue;
5229 }
5230 decimal = Py_UNICODE_TODECIMAL(ch);
5231 if (decimal >= 0) {
5232 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005233 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005234 continue;
5235 }
Guido van Rossumba477042000-04-06 18:18:10 +00005236 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00005237 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005238 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005239 continue;
5240 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005241 /* All other characters are considered unencodable */
5242 collstart = p;
5243 collend = p+1;
5244 while (collend < end) {
5245 if ((0 < *collend && *collend < 256) ||
5246 !Py_UNICODE_ISSPACE(*collend) ||
5247 Py_UNICODE_TODECIMAL(*collend))
5248 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005249 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005250 /* cache callback name lookup
5251 * (if not done yet, i.e. it's the first error) */
5252 if (known_errorHandler==-1) {
5253 if ((errors==NULL) || (!strcmp(errors, "strict")))
5254 known_errorHandler = 1;
5255 else if (!strcmp(errors, "replace"))
5256 known_errorHandler = 2;
5257 else if (!strcmp(errors, "ignore"))
5258 known_errorHandler = 3;
5259 else if (!strcmp(errors, "xmlcharrefreplace"))
5260 known_errorHandler = 4;
5261 else
5262 known_errorHandler = 0;
5263 }
5264 switch (known_errorHandler) {
5265 case 1: /* strict */
5266 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5267 goto onError;
5268 case 2: /* replace */
5269 for (p = collstart; p < collend; ++p)
5270 *output++ = '?';
5271 /* fall through */
5272 case 3: /* ignore */
5273 p = collend;
5274 break;
5275 case 4: /* xmlcharrefreplace */
5276 /* generate replacement (temporarily (mis)uses p) */
5277 for (p = collstart; p < collend; ++p)
5278 output += sprintf(output, "&#%d;", (int)*p);
5279 p = collend;
5280 break;
5281 default:
5282 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5283 encoding, reason, s, length, &exc,
5284 collstart-s, collend-s, &newpos);
5285 if (repunicode == NULL)
5286 goto onError;
5287 /* generate replacement */
5288 repsize = PyUnicode_GET_SIZE(repunicode);
5289 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5290 Py_UNICODE ch = *uni2;
5291 if (Py_UNICODE_ISSPACE(ch))
5292 *output++ = ' ';
5293 else {
5294 decimal = Py_UNICODE_TODECIMAL(ch);
5295 if (decimal >= 0)
5296 *output++ = '0' + decimal;
5297 else if (0 < ch && ch < 256)
5298 *output++ = (char)ch;
5299 else {
5300 Py_DECREF(repunicode);
5301 raise_encode_exception(&exc, encoding,
5302 s, length, collstart-s, collend-s, reason);
5303 goto onError;
5304 }
5305 }
5306 }
5307 p = s + newpos;
5308 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005309 }
5310 }
5311 /* 0-terminate the output string */
5312 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005313 Py_XDECREF(exc);
5314 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005315 return 0;
5316
5317 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005318 Py_XDECREF(exc);
5319 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005320 return -1;
5321}
5322
Guido van Rossumd57fd912000-03-10 22:53:23 +00005323/* --- Helpers ------------------------------------------------------------ */
5324
Eric Smith8c663262007-08-25 02:26:07 +00005325#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005326#include "stringlib/fastsearch.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005327#include "stringlib/count.h"
Christian Heimes9cd17752007-11-18 19:35:23 +00005328/* Include _ParseTupleFinds from find.h */
5329#define FROM_UNICODE
Thomas Wouters477c8d52006-05-27 19:21:47 +00005330#include "stringlib/find.h"
5331#include "stringlib/partition.h"
5332
Eric Smith5807c412008-05-11 21:00:57 +00005333#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
5334#include "stringlib/localeutil.h"
5335
/* Normalize the slice bounds held in the caller's local variables `start`
   and `end` against (obj)->length.  The macro expands to plain statements
   that read and assign those two variables in the enclosing scope:

     - a negative start has the length added and is then floored at 0;
     - an end larger than the length is cut back to the length;
     - a negative end has the length added and is then floored at 0.

   start is never capped at the length, so start > end is possible after
   expansion.  The expansion is a bare statement sequence; do not use it as
   the unbraced body of an if/else. */
#define FIX_START_END(obj)                      \
    if (start < 0) {                            \
        start += (obj)->length;                 \
        if (start < 0)                          \
            start = 0;                          \
    }                                           \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    if (end < 0) {                              \
        end += (obj)->length;                   \
        if (end < 0)                            \
            end = 0;                            \
    }
5348
Martin v. Löwis18e16552006-02-15 17:27:45 +00005349Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005350 PyObject *substr,
5351 Py_ssize_t start,
5352 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005353{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005354 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005355 PyUnicodeObject* str_obj;
5356 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005357
Thomas Wouters477c8d52006-05-27 19:21:47 +00005358 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5359 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005360 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005361 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5362 if (!sub_obj) {
5363 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005364 return -1;
5365 }
Tim Petersced69f82003-09-16 20:30:58 +00005366
Thomas Wouters477c8d52006-05-27 19:21:47 +00005367 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005368
Thomas Wouters477c8d52006-05-27 19:21:47 +00005369 result = stringlib_count(
5370 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5371 );
5372
5373 Py_DECREF(sub_obj);
5374 Py_DECREF(str_obj);
5375
Guido van Rossumd57fd912000-03-10 22:53:23 +00005376 return result;
5377}
5378
Martin v. Löwis18e16552006-02-15 17:27:45 +00005379Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005380 PyObject *sub,
5381 Py_ssize_t start,
5382 Py_ssize_t end,
5383 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005384{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005385 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005386
Guido van Rossumd57fd912000-03-10 22:53:23 +00005387 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005388 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005389 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005390 sub = PyUnicode_FromObject(sub);
5391 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00005392 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005393 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005394 }
Tim Petersced69f82003-09-16 20:30:58 +00005395
Thomas Wouters477c8d52006-05-27 19:21:47 +00005396 if (direction > 0)
5397 result = stringlib_find_slice(
5398 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5399 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5400 start, end
5401 );
5402 else
5403 result = stringlib_rfind_slice(
5404 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5405 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5406 start, end
5407 );
5408
Guido van Rossumd57fd912000-03-10 22:53:23 +00005409 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005410 Py_DECREF(sub);
5411
Guido van Rossumd57fd912000-03-10 22:53:23 +00005412 return result;
5413}
5414
Tim Petersced69f82003-09-16 20:30:58 +00005415static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005416int tailmatch(PyUnicodeObject *self,
5417 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005418 Py_ssize_t start,
5419 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005420 int direction)
5421{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005422 if (substring->length == 0)
5423 return 1;
5424
Thomas Wouters477c8d52006-05-27 19:21:47 +00005425 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005426
5427 end -= substring->length;
5428 if (end < start)
5429 return 0;
5430
5431 if (direction > 0) {
5432 if (Py_UNICODE_MATCH(self, end, substring))
5433 return 1;
5434 } else {
5435 if (Py_UNICODE_MATCH(self, start, substring))
5436 return 1;
5437 }
5438
5439 return 0;
5440}
5441
Martin v. Löwis18e16552006-02-15 17:27:45 +00005442Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005443 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005444 Py_ssize_t start,
5445 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005446 int direction)
5447{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005448 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005449
Guido van Rossumd57fd912000-03-10 22:53:23 +00005450 str = PyUnicode_FromObject(str);
5451 if (str == NULL)
5452 return -1;
5453 substr = PyUnicode_FromObject(substr);
5454 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005455 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005456 return -1;
5457 }
Tim Petersced69f82003-09-16 20:30:58 +00005458
Guido van Rossumd57fd912000-03-10 22:53:23 +00005459 result = tailmatch((PyUnicodeObject *)str,
5460 (PyUnicodeObject *)substr,
5461 start, end, direction);
5462 Py_DECREF(str);
5463 Py_DECREF(substr);
5464 return result;
5465}
5466
Guido van Rossumd57fd912000-03-10 22:53:23 +00005467/* Apply fixfct filter to the Unicode object self and return a
5468 reference to the modified object */
5469
Tim Petersced69f82003-09-16 20:30:58 +00005470static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005471PyObject *fixup(PyUnicodeObject *self,
5472 int (*fixfct)(PyUnicodeObject *s))
5473{
5474
5475 PyUnicodeObject *u;
5476
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005477 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005478 if (u == NULL)
5479 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005480
5481 Py_UNICODE_COPY(u->str, self->str, self->length);
5482
Tim Peters7a29bd52001-09-12 03:03:31 +00005483 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005484 /* fixfct should return TRUE if it modified the buffer. If
5485 FALSE, return a reference to the original buffer instead
5486 (to save space, not time) */
5487 Py_INCREF(self);
5488 Py_DECREF(u);
5489 return (PyObject*) self;
5490 }
5491 return (PyObject*) u;
5492}
5493
Tim Petersced69f82003-09-16 20:30:58 +00005494static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005495int fixupper(PyUnicodeObject *self)
5496{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005497 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005498 Py_UNICODE *s = self->str;
5499 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005500
Guido van Rossumd57fd912000-03-10 22:53:23 +00005501 while (len-- > 0) {
5502 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005503
Guido van Rossumd57fd912000-03-10 22:53:23 +00005504 ch = Py_UNICODE_TOUPPER(*s);
5505 if (ch != *s) {
5506 status = 1;
5507 *s = ch;
5508 }
5509 s++;
5510 }
5511
5512 return status;
5513}
5514
Tim Petersced69f82003-09-16 20:30:58 +00005515static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005516int fixlower(PyUnicodeObject *self)
5517{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005518 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005519 Py_UNICODE *s = self->str;
5520 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005521
Guido van Rossumd57fd912000-03-10 22:53:23 +00005522 while (len-- > 0) {
5523 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005524
Guido van Rossumd57fd912000-03-10 22:53:23 +00005525 ch = Py_UNICODE_TOLOWER(*s);
5526 if (ch != *s) {
5527 status = 1;
5528 *s = ch;
5529 }
5530 s++;
5531 }
5532
5533 return status;
5534}
5535
Tim Petersced69f82003-09-16 20:30:58 +00005536static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005537int fixswapcase(PyUnicodeObject *self)
5538{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005539 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005540 Py_UNICODE *s = self->str;
5541 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005542
Guido van Rossumd57fd912000-03-10 22:53:23 +00005543 while (len-- > 0) {
5544 if (Py_UNICODE_ISUPPER(*s)) {
5545 *s = Py_UNICODE_TOLOWER(*s);
5546 status = 1;
5547 } else if (Py_UNICODE_ISLOWER(*s)) {
5548 *s = Py_UNICODE_TOUPPER(*s);
5549 status = 1;
5550 }
5551 s++;
5552 }
5553
5554 return status;
5555}
5556
Tim Petersced69f82003-09-16 20:30:58 +00005557static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005558int fixcapitalize(PyUnicodeObject *self)
5559{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005560 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005561 Py_UNICODE *s = self->str;
5562 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005563
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005564 if (len == 0)
5565 return 0;
5566 if (Py_UNICODE_ISLOWER(*s)) {
5567 *s = Py_UNICODE_TOUPPER(*s);
5568 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005569 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005570 s++;
5571 while (--len > 0) {
5572 if (Py_UNICODE_ISUPPER(*s)) {
5573 *s = Py_UNICODE_TOLOWER(*s);
5574 status = 1;
5575 }
5576 s++;
5577 }
5578 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005579}
5580
5581static
5582int fixtitle(PyUnicodeObject *self)
5583{
5584 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5585 register Py_UNICODE *e;
5586 int previous_is_cased;
5587
5588 /* Shortcut for single character strings */
5589 if (PyUnicode_GET_SIZE(self) == 1) {
5590 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5591 if (*p != ch) {
5592 *p = ch;
5593 return 1;
5594 }
5595 else
5596 return 0;
5597 }
Tim Petersced69f82003-09-16 20:30:58 +00005598
Guido van Rossumd57fd912000-03-10 22:53:23 +00005599 e = p + PyUnicode_GET_SIZE(self);
5600 previous_is_cased = 0;
5601 for (; p < e; p++) {
5602 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005603
Guido van Rossumd57fd912000-03-10 22:53:23 +00005604 if (previous_is_cased)
5605 *p = Py_UNICODE_TOLOWER(ch);
5606 else
5607 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005608
5609 if (Py_UNICODE_ISLOWER(ch) ||
5610 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005611 Py_UNICODE_ISTITLE(ch))
5612 previous_is_cased = 1;
5613 else
5614 previous_is_cased = 0;
5615 }
5616 return 1;
5617}
5618
Tim Peters8ce9f162004-08-27 01:49:32 +00005619PyObject *
5620PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005621{
Tim Peters8ce9f162004-08-27 01:49:32 +00005622 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005623 const Py_UNICODE blank = ' ';
5624 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005625 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005626 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005627 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5628 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005629 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5630 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005631 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005632 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005633 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005634
Tim Peters05eba1f2004-08-27 21:32:02 +00005635 fseq = PySequence_Fast(seq, "");
5636 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005637 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005638 }
5639
Tim Peters91879ab2004-08-27 22:35:44 +00005640 /* Grrrr. A codec may be invoked to convert str objects to
5641 * Unicode, and so it's possible to call back into Python code
5642 * during PyUnicode_FromObject(), and so it's possible for a sick
5643 * codec to change the size of fseq (if seq is a list). Therefore
5644 * we have to keep refetching the size -- can't assume seqlen
5645 * is invariant.
5646 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005647 seqlen = PySequence_Fast_GET_SIZE(fseq);
5648 /* If empty sequence, return u"". */
5649 if (seqlen == 0) {
5650 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5651 goto Done;
5652 }
5653 /* If singleton sequence with an exact Unicode, return that. */
5654 if (seqlen == 1) {
5655 item = PySequence_Fast_GET_ITEM(fseq, 0);
5656 if (PyUnicode_CheckExact(item)) {
5657 Py_INCREF(item);
5658 res = (PyUnicodeObject *)item;
5659 goto Done;
5660 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005661 }
5662
Tim Peters05eba1f2004-08-27 21:32:02 +00005663 /* At least two items to join, or one that isn't exact Unicode. */
5664 if (seqlen > 1) {
5665 /* Set up sep and seplen -- they're needed. */
5666 if (separator == NULL) {
5667 sep = &blank;
5668 seplen = 1;
5669 }
5670 else {
5671 internal_separator = PyUnicode_FromObject(separator);
5672 if (internal_separator == NULL)
5673 goto onError;
5674 sep = PyUnicode_AS_UNICODE(internal_separator);
5675 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00005676 /* In case PyUnicode_FromObject() mutated seq. */
5677 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005678 }
5679 }
5680
5681 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005682 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005683 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005684 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005685 res_p = PyUnicode_AS_UNICODE(res);
5686 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005687
Tim Peters05eba1f2004-08-27 21:32:02 +00005688 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005689 Py_ssize_t itemlen;
5690 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005691
5692 item = PySequence_Fast_GET_ITEM(fseq, i);
5693 /* Convert item to Unicode. */
Guido van Rossum98297ee2007-11-06 21:34:58 +00005694 if (!PyUnicode_Check(item)) {
5695 PyErr_Format(PyExc_TypeError,
5696 "sequence item %zd: expected str instance,"
5697 " %.80s found",
Christian Heimes90aa7642007-12-19 02:45:37 +00005698 i, Py_TYPE(item)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005699 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005700 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005701 item = PyUnicode_FromObject(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005702 if (item == NULL)
5703 goto onError;
5704 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005705
Tim Peters91879ab2004-08-27 22:35:44 +00005706 /* In case PyUnicode_FromObject() mutated seq. */
5707 seqlen = PySequence_Fast_GET_SIZE(fseq);
5708
Tim Peters8ce9f162004-08-27 01:49:32 +00005709 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005710 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005711 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005712 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005713 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005714 if (i < seqlen - 1) {
5715 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005716 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00005717 goto Overflow;
5718 }
5719 if (new_res_used > res_alloc) {
5720 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00005721 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00005722 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005723 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005724 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005725 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005726 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005727 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005728 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005729 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005730 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005731 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005732
5733 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005734 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005735 res_p += itemlen;
5736 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005737 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005738 res_p += seplen;
5739 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005740 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005741 res_used = new_res_used;
5742 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005743
Tim Peters05eba1f2004-08-27 21:32:02 +00005744 /* Shrink res to match the used area; this probably can't fail,
5745 * but it's cheap to check.
5746 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005747 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005748 goto onError;
5749
5750 Done:
5751 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005752 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005753 return (PyObject *)res;
5754
Tim Peters8ce9f162004-08-27 01:49:32 +00005755 Overflow:
5756 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005757 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005758 Py_DECREF(item);
5759 /* fall through */
5760
Guido van Rossumd57fd912000-03-10 22:53:23 +00005761 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005762 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005763 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005764 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005765 return NULL;
5766}
5767
Tim Petersced69f82003-09-16 20:30:58 +00005768static
5769PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005770 Py_ssize_t left,
5771 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005772 Py_UNICODE fill)
5773{
5774 PyUnicodeObject *u;
5775
5776 if (left < 0)
5777 left = 0;
5778 if (right < 0)
5779 right = 0;
5780
Tim Peters7a29bd52001-09-12 03:03:31 +00005781 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005782 Py_INCREF(self);
5783 return self;
5784 }
5785
5786 u = _PyUnicode_New(left + self->length + right);
5787 if (u) {
5788 if (left)
5789 Py_UNICODE_FILL(u->str, fill, left);
5790 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5791 if (right)
5792 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5793 }
5794
5795 return u;
5796}
5797
/* Append the slice data[left:right] to the result list as a new Unicode
   object.

   Relies on names from the enclosing function: a `PyObject *str` scratch
   variable, the `list` being built, and an `onError` label that is jumped
   to when either the object creation or the append fails.  The temporary
   reference held in `str` is released on both paths.

   Wrapped in do { } while (0) so the expansion is a single statement and
   is safe under an unbraced if/else; invoke it with a trailing
   semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
5808
5809static
5810PyObject *split_whitespace(PyUnicodeObject *self,
5811 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005812 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005813{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005814 register Py_ssize_t i;
5815 register Py_ssize_t j;
5816 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005817 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005818 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005819
5820 for (i = j = 0; i < len; ) {
5821 /* find a token */
Christian Heimes190d79e2008-01-30 11:58:22 +00005822 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005823 i++;
5824 j = i;
Christian Heimes190d79e2008-01-30 11:58:22 +00005825 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005826 i++;
5827 if (j < i) {
5828 if (maxcount-- <= 0)
5829 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005830 SPLIT_APPEND(buf, j, i);
5831 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005832 i++;
5833 j = i;
5834 }
5835 }
5836 if (j < len) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005837 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005838 }
5839 return list;
5840
5841 onError:
5842 Py_DECREF(list);
5843 return NULL;
5844}
5845
5846PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005847 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005848{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005849 register Py_ssize_t i;
5850 register Py_ssize_t j;
5851 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005852 PyObject *list;
5853 PyObject *str;
5854 Py_UNICODE *data;
5855
5856 string = PyUnicode_FromObject(string);
5857 if (string == NULL)
5858 return NULL;
5859 data = PyUnicode_AS_UNICODE(string);
5860 len = PyUnicode_GET_SIZE(string);
5861
Guido van Rossumd57fd912000-03-10 22:53:23 +00005862 list = PyList_New(0);
5863 if (!list)
5864 goto onError;
5865
5866 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005867 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005868
Guido van Rossumd57fd912000-03-10 22:53:23 +00005869 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005870 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005871 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005872
5873 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005874 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005875 if (i < len) {
5876 if (data[i] == '\r' && i + 1 < len &&
5877 data[i+1] == '\n')
5878 i += 2;
5879 else
5880 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005881 if (keepends)
5882 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005883 }
Guido van Rossum86662912000-04-11 15:38:46 +00005884 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005885 j = i;
5886 }
5887 if (j < len) {
5888 SPLIT_APPEND(data, j, len);
5889 }
5890
5891 Py_DECREF(string);
5892 return list;
5893
5894 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005895 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005896 Py_DECREF(string);
5897 return NULL;
5898}
5899
Tim Petersced69f82003-09-16 20:30:58 +00005900static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005901PyObject *split_char(PyUnicodeObject *self,
5902 PyObject *list,
5903 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005904 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005905{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005906 register Py_ssize_t i;
5907 register Py_ssize_t j;
5908 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005909 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005910 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005911
5912 for (i = j = 0; i < len; ) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005913 if (buf[i] == ch) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005914 if (maxcount-- <= 0)
5915 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005916 SPLIT_APPEND(buf, j, i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005917 i = j = i + 1;
5918 } else
5919 i++;
5920 }
5921 if (j <= len) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005922 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005923 }
5924 return list;
5925
5926 onError:
5927 Py_DECREF(list);
5928 return NULL;
5929}
5930
Tim Petersced69f82003-09-16 20:30:58 +00005931static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005932PyObject *split_substring(PyUnicodeObject *self,
5933 PyObject *list,
5934 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005935 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005936{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005937 register Py_ssize_t i;
5938 register Py_ssize_t j;
5939 Py_ssize_t len = self->length;
5940 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005941 PyObject *str;
5942
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005943 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005944 if (Py_UNICODE_MATCH(self, i, substring)) {
5945 if (maxcount-- <= 0)
5946 break;
5947 SPLIT_APPEND(self->str, j, i);
5948 i = j = i + sublen;
5949 } else
5950 i++;
5951 }
5952 if (j <= len) {
5953 SPLIT_APPEND(self->str, j, len);
5954 }
5955 return list;
5956
5957 onError:
5958 Py_DECREF(list);
5959 return NULL;
5960}
5961
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005962static
5963PyObject *rsplit_whitespace(PyUnicodeObject *self,
5964 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005965 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005966{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005967 register Py_ssize_t i;
5968 register Py_ssize_t j;
5969 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005970 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005971 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005972
5973 for (i = j = len - 1; i >= 0; ) {
5974 /* find a token */
Christian Heimes190d79e2008-01-30 11:58:22 +00005975 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005976 i--;
5977 j = i;
Christian Heimes190d79e2008-01-30 11:58:22 +00005978 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005979 i--;
5980 if (j > i) {
5981 if (maxcount-- <= 0)
5982 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005983 SPLIT_APPEND(buf, i + 1, j + 1);
5984 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005985 i--;
5986 j = i;
5987 }
5988 }
5989 if (j >= 0) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005990 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005991 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005992 if (PyList_Reverse(list) < 0)
5993 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005994 return list;
5995
5996 onError:
5997 Py_DECREF(list);
5998 return NULL;
5999}
6000
6001static
6002PyObject *rsplit_char(PyUnicodeObject *self,
6003 PyObject *list,
6004 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006005 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006006{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006007 register Py_ssize_t i;
6008 register Py_ssize_t j;
6009 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006010 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006011 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006012
6013 for (i = j = len - 1; i >= 0; ) {
Christian Heimes190d79e2008-01-30 11:58:22 +00006014 if (buf[i] == ch) {
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006015 if (maxcount-- <= 0)
6016 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00006017 SPLIT_APPEND(buf, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006018 j = i = i - 1;
6019 } else
6020 i--;
6021 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00006022 if (j >= -1) {
Christian Heimes190d79e2008-01-30 11:58:22 +00006023 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006024 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006025 if (PyList_Reverse(list) < 0)
6026 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006027 return list;
6028
6029 onError:
6030 Py_DECREF(list);
6031 return NULL;
6032}
6033
6034static
6035PyObject *rsplit_substring(PyUnicodeObject *self,
6036 PyObject *list,
6037 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006038 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006039{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006040 register Py_ssize_t i;
6041 register Py_ssize_t j;
6042 Py_ssize_t len = self->length;
6043 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006044 PyObject *str;
6045
6046 for (i = len - sublen, j = len; i >= 0; ) {
6047 if (Py_UNICODE_MATCH(self, i, substring)) {
6048 if (maxcount-- <= 0)
6049 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006050 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006051 j = i;
6052 i -= sublen;
6053 } else
6054 i--;
6055 }
6056 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006057 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006058 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006059 if (PyList_Reverse(list) < 0)
6060 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006061 return list;
6062
6063 onError:
6064 Py_DECREF(list);
6065 return NULL;
6066}
6067
Guido van Rossumd57fd912000-03-10 22:53:23 +00006068#undef SPLIT_APPEND
6069
6070static
6071PyObject *split(PyUnicodeObject *self,
6072 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006073 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006074{
6075 PyObject *list;
6076
6077 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006078 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006079
6080 list = PyList_New(0);
6081 if (!list)
6082 return NULL;
6083
6084 if (substring == NULL)
6085 return split_whitespace(self,list,maxcount);
6086
6087 else if (substring->length == 1)
6088 return split_char(self,list,substring->str[0],maxcount);
6089
6090 else if (substring->length == 0) {
6091 Py_DECREF(list);
6092 PyErr_SetString(PyExc_ValueError, "empty separator");
6093 return NULL;
6094 }
6095 else
6096 return split_substring(self,list,substring,maxcount);
6097}
6098
Tim Petersced69f82003-09-16 20:30:58 +00006099static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006100PyObject *rsplit(PyUnicodeObject *self,
6101 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006102 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006103{
6104 PyObject *list;
6105
6106 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006107 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006108
6109 list = PyList_New(0);
6110 if (!list)
6111 return NULL;
6112
6113 if (substring == NULL)
6114 return rsplit_whitespace(self,list,maxcount);
6115
6116 else if (substring->length == 1)
6117 return rsplit_char(self,list,substring->str[0],maxcount);
6118
6119 else if (substring->length == 0) {
6120 Py_DECREF(list);
6121 PyErr_SetString(PyExc_ValueError, "empty separator");
6122 return NULL;
6123 }
6124 else
6125 return rsplit_substring(self,list,substring,maxcount);
6126}
6127
6128static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006129PyObject *replace(PyUnicodeObject *self,
6130 PyUnicodeObject *str1,
6131 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006132 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006133{
6134 PyUnicodeObject *u;
6135
6136 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006137 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006138
Thomas Wouters477c8d52006-05-27 19:21:47 +00006139 if (str1->length == str2->length) {
6140 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006141 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006142 if (str1->length == 1) {
6143 /* replace characters */
6144 Py_UNICODE u1, u2;
6145 if (!findchar(self->str, self->length, str1->str[0]))
6146 goto nothing;
6147 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6148 if (!u)
6149 return NULL;
6150 Py_UNICODE_COPY(u->str, self->str, self->length);
6151 u1 = str1->str[0];
6152 u2 = str2->str[0];
6153 for (i = 0; i < u->length; i++)
6154 if (u->str[i] == u1) {
6155 if (--maxcount < 0)
6156 break;
6157 u->str[i] = u2;
6158 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006159 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006160 i = fastsearch(
6161 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00006162 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006163 if (i < 0)
6164 goto nothing;
6165 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6166 if (!u)
6167 return NULL;
6168 Py_UNICODE_COPY(u->str, self->str, self->length);
6169 while (i <= self->length - str1->length)
6170 if (Py_UNICODE_MATCH(self, i, str1)) {
6171 if (--maxcount < 0)
6172 break;
6173 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6174 i += str1->length;
6175 } else
6176 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006177 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006178 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006179
6180 Py_ssize_t n, i, j, e;
6181 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006182 Py_UNICODE *p;
6183
6184 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006185 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006186 if (n > maxcount)
6187 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006188 if (n == 0)
6189 goto nothing;
6190 /* new_size = self->length + n * (str2->length - str1->length)); */
6191 delta = (str2->length - str1->length);
6192 if (delta == 0) {
6193 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006194 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006195 product = n * (str2->length - str1->length);
6196 if ((product / (str2->length - str1->length)) != n) {
6197 PyErr_SetString(PyExc_OverflowError,
6198 "replace string is too long");
6199 return NULL;
6200 }
6201 new_size = self->length + product;
6202 if (new_size < 0) {
6203 PyErr_SetString(PyExc_OverflowError,
6204 "replace string is too long");
6205 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006206 }
6207 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006208 u = _PyUnicode_New(new_size);
6209 if (!u)
6210 return NULL;
6211 i = 0;
6212 p = u->str;
6213 e = self->length - str1->length;
6214 if (str1->length > 0) {
6215 while (n-- > 0) {
6216 /* look for next match */
6217 j = i;
6218 while (j <= e) {
6219 if (Py_UNICODE_MATCH(self, j, str1))
6220 break;
6221 j++;
6222 }
6223 if (j > i) {
6224 if (j > e)
6225 break;
6226 /* copy unchanged part [i:j] */
6227 Py_UNICODE_COPY(p, self->str+i, j-i);
6228 p += j - i;
6229 }
6230 /* copy substitution string */
6231 if (str2->length > 0) {
6232 Py_UNICODE_COPY(p, str2->str, str2->length);
6233 p += str2->length;
6234 }
6235 i = j + str1->length;
6236 }
6237 if (i < self->length)
6238 /* copy tail [i:] */
6239 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6240 } else {
6241 /* interleave */
6242 while (n > 0) {
6243 Py_UNICODE_COPY(p, str2->str, str2->length);
6244 p += str2->length;
6245 if (--n <= 0)
6246 break;
6247 *p++ = self->str[i++];
6248 }
6249 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6250 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006251 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006252 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006253
6254nothing:
6255 /* nothing to replace; return original string (when possible) */
6256 if (PyUnicode_CheckExact(self)) {
6257 Py_INCREF(self);
6258 return (PyObject *) self;
6259 }
6260 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006261}
6262
6263/* --- Unicode Object Methods --------------------------------------------- */
6264
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006265PyDoc_STRVAR(title__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006266"S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006267\n\
6268Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006269characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006270
6271static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006272unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006273{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006274 return fixup(self, fixtitle);
6275}
6276
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006277PyDoc_STRVAR(capitalize__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006278"S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006279\n\
6280Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006281have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006282
6283static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006284unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006285{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006286 return fixup(self, fixcapitalize);
6287}
6288
6289#if 0
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006290PyDoc_STRVAR(capwords__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006291"S.capwords() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006292\n\
6293Apply .capitalize() to all words in S and return the result with\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006294normalized whitespace (all whitespace strings are replaced by ' ').");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006295
6296static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006297unicode_capwords(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006298{
6299 PyObject *list;
6300 PyObject *item;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006301 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006302
Guido van Rossumd57fd912000-03-10 22:53:23 +00006303 /* Split into words */
6304 list = split(self, NULL, -1);
6305 if (!list)
6306 return NULL;
6307
6308 /* Capitalize each word */
6309 for (i = 0; i < PyList_GET_SIZE(list); i++) {
6310 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
6311 fixcapitalize);
6312 if (item == NULL)
6313 goto onError;
6314 Py_DECREF(PyList_GET_ITEM(list, i));
6315 PyList_SET_ITEM(list, i, item);
6316 }
6317
6318 /* Join the words to form a new string */
6319 item = PyUnicode_Join(NULL, list);
6320
6321onError:
6322 Py_DECREF(list);
6323 return (PyObject *)item;
6324}
6325#endif
6326
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006327/* Argument converter. Coerces to a single unicode character */
6328
6329static int
6330convert_uc(PyObject *obj, void *addr)
6331{
6332 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6333 PyObject *uniobj;
6334 Py_UNICODE *unistr;
6335
6336 uniobj = PyUnicode_FromObject(obj);
6337 if (uniobj == NULL) {
6338 PyErr_SetString(PyExc_TypeError,
6339 "The fill character cannot be converted to Unicode");
6340 return 0;
6341 }
6342 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6343 PyErr_SetString(PyExc_TypeError,
6344 "The fill character must be exactly one character long");
6345 Py_DECREF(uniobj);
6346 return 0;
6347 }
6348 unistr = PyUnicode_AS_UNICODE(uniobj);
6349 *fillcharloc = unistr[0];
6350 Py_DECREF(uniobj);
6351 return 1;
6352}
6353
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006354PyDoc_STRVAR(center__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006355"S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006356\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006357Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006358done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006359
6360static PyObject *
6361unicode_center(PyUnicodeObject *self, PyObject *args)
6362{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006363 Py_ssize_t marg, left;
6364 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006365 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006366
Thomas Woutersde017742006-02-16 19:34:37 +00006367 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006368 return NULL;
6369
Tim Peters7a29bd52001-09-12 03:03:31 +00006370 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006371 Py_INCREF(self);
6372 return (PyObject*) self;
6373 }
6374
6375 marg = width - self->length;
6376 left = marg / 2 + (marg & width & 1);
6377
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006378 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006379}
6380
Marc-André Lemburge5034372000-08-08 08:04:29 +00006381#if 0
6382
6383/* This code should go into some future Unicode collation support
6384 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006385 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006386
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006387/* speedy UTF-16 code point order comparison */
6388/* gleaned from: */
6389/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6390
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006391static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006392{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006393 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006394 0, 0, 0, 0, 0, 0, 0, 0,
6395 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006396 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006397};
6398
Guido van Rossumd57fd912000-03-10 22:53:23 +00006399static int
6400unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6401{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006402 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006403
Guido van Rossumd57fd912000-03-10 22:53:23 +00006404 Py_UNICODE *s1 = str1->str;
6405 Py_UNICODE *s2 = str2->str;
6406
6407 len1 = str1->length;
6408 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006409
Guido van Rossumd57fd912000-03-10 22:53:23 +00006410 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006411 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006412
6413 c1 = *s1++;
6414 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006415
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006416 if (c1 > (1<<11) * 26)
6417 c1 += utf16Fixup[c1>>11];
6418 if (c2 > (1<<11) * 26)
6419 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006420 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006421
6422 if (c1 != c2)
6423 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006424
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006425 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006426 }
6427
6428 return (len1 < len2) ? -1 : (len1 != len2);
6429}
6430
Marc-André Lemburge5034372000-08-08 08:04:29 +00006431#else
6432
6433static int
6434unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6435{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006436 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006437
6438 Py_UNICODE *s1 = str1->str;
6439 Py_UNICODE *s2 = str2->str;
6440
6441 len1 = str1->length;
6442 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006443
Marc-André Lemburge5034372000-08-08 08:04:29 +00006444 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006445 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006446
Fredrik Lundh45714e92001-06-26 16:39:36 +00006447 c1 = *s1++;
6448 c2 = *s2++;
6449
6450 if (c1 != c2)
6451 return (c1 < c2) ? -1 : 1;
6452
Marc-André Lemburge5034372000-08-08 08:04:29 +00006453 len1--; len2--;
6454 }
6455
6456 return (len1 < len2) ? -1 : (len1 != len2);
6457}
6458
6459#endif
6460
Guido van Rossumd57fd912000-03-10 22:53:23 +00006461int PyUnicode_Compare(PyObject *left,
6462 PyObject *right)
6463{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006464 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6465 return unicode_compare((PyUnicodeObject *)left,
6466 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006467 PyErr_Format(PyExc_TypeError,
6468 "Can't compare %.100s and %.100s",
6469 left->ob_type->tp_name,
6470 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006471 return -1;
6472}
6473
Martin v. Löwis5b222132007-06-10 09:51:05 +00006474int
6475PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6476{
6477 int i;
6478 Py_UNICODE *id;
6479 assert(PyUnicode_Check(uni));
6480 id = PyUnicode_AS_UNICODE(uni);
6481 /* Compare Unicode string and source character set string */
6482 for (i = 0; id[i] && str[i]; i++)
6483 if (id[i] != str[i])
6484 return ((int)id[i] < (int)str[i]) ? -1 : 1;
6485 if (id[i])
6486 return 1; /* uni is longer */
6487 if (str[i])
6488 return -1; /* str is longer */
6489 return 0;
6490}
6491
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006492PyObject *PyUnicode_RichCompare(PyObject *left,
6493 PyObject *right,
6494 int op)
6495{
6496 int result;
6497
6498 result = PyUnicode_Compare(left, right);
6499 if (result == -1 && PyErr_Occurred())
6500 goto onError;
6501
6502 /* Convert the return value to a Boolean */
6503 switch (op) {
6504 case Py_EQ:
6505 result = (result == 0);
6506 break;
6507 case Py_NE:
6508 result = (result != 0);
6509 break;
6510 case Py_LE:
6511 result = (result <= 0);
6512 break;
6513 case Py_GE:
6514 result = (result >= 0);
6515 break;
6516 case Py_LT:
6517 result = (result == -1);
6518 break;
6519 case Py_GT:
6520 result = (result == 1);
6521 break;
6522 }
6523 return PyBool_FromLong(result);
6524
6525 onError:
6526
6527 /* Standard case
6528
6529 Type errors mean that PyUnicode_FromObject() could not convert
6530 one of the arguments (usually the right hand side) to Unicode,
6531 ie. we can't handle the comparison request. However, it is
6532 possible that the other object knows a comparison method, which
6533 is why we return Py_NotImplemented to give the other object a
6534 chance.
6535
6536 */
6537 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6538 PyErr_Clear();
6539 Py_INCREF(Py_NotImplemented);
6540 return Py_NotImplemented;
6541 }
6542 if (op != Py_EQ && op != Py_NE)
6543 return NULL;
6544
6545 /* Equality comparison.
6546
6547 This is a special case: we silence any PyExc_UnicodeDecodeError
6548 and instead turn it into a PyErr_UnicodeWarning.
6549
6550 */
6551 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6552 return NULL;
6553 PyErr_Clear();
Skip Montanaro46fc3372007-08-12 11:44:53 +00006554 if (PyErr_WarnEx(PyExc_UnicodeWarning,
6555 (op == Py_EQ) ?
Benjamin Peterson142957c2008-07-04 19:55:29 +00006556 "equal comparison "
6557 "failed to convert both arguments to str - "
Skip Montanaro46fc3372007-08-12 11:44:53 +00006558 "interpreting them as being unequal"
6559 :
6560 "Unicode unequal comparison "
Benjamin Peterson142957c2008-07-04 19:55:29 +00006561 "failed to convert both arguments to str - "
Skip Montanaro46fc3372007-08-12 11:44:53 +00006562 "interpreting them as being unequal",
6563 1) < 0)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006564 return NULL;
6565 result = (op == Py_NE);
6566 return PyBool_FromLong(result);
6567}
6568
Guido van Rossum403d68b2000-03-13 15:55:09 +00006569int PyUnicode_Contains(PyObject *container,
6570 PyObject *element)
6571{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006572 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006573 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006574
6575 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006576 sub = PyUnicode_FromObject(element);
6577 if (!sub) {
Walter Dörwald26e0f512007-06-12 16:51:31 +00006578 PyErr_Format(PyExc_TypeError,
6579 "'in <string>' requires string as left operand, not %s",
6580 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006581 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006582 }
6583
Thomas Wouters477c8d52006-05-27 19:21:47 +00006584 str = PyUnicode_FromObject(container);
6585 if (!str) {
6586 Py_DECREF(sub);
6587 return -1;
6588 }
6589
6590 result = stringlib_contains_obj(str, sub);
6591
6592 Py_DECREF(str);
6593 Py_DECREF(sub);
6594
Guido van Rossum403d68b2000-03-13 15:55:09 +00006595 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006596}
6597
Guido van Rossumd57fd912000-03-10 22:53:23 +00006598/* Concat to string or Unicode object giving a new Unicode object. */
6599
6600PyObject *PyUnicode_Concat(PyObject *left,
6601 PyObject *right)
6602{
6603 PyUnicodeObject *u = NULL, *v = NULL, *w;
6604
6605 /* Coerce the two arguments */
6606 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6607 if (u == NULL)
6608 goto onError;
6609 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6610 if (v == NULL)
6611 goto onError;
6612
6613 /* Shortcuts */
6614 if (v == unicode_empty) {
6615 Py_DECREF(v);
6616 return (PyObject *)u;
6617 }
6618 if (u == unicode_empty) {
6619 Py_DECREF(u);
6620 return (PyObject *)v;
6621 }
6622
6623 /* Concat the two Unicode strings */
6624 w = _PyUnicode_New(u->length + v->length);
6625 if (w == NULL)
6626 goto onError;
6627 Py_UNICODE_COPY(w->str, u->str, u->length);
6628 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6629
6630 Py_DECREF(u);
6631 Py_DECREF(v);
6632 return (PyObject *)w;
6633
6634onError:
6635 Py_XDECREF(u);
6636 Py_XDECREF(v);
6637 return NULL;
6638}
6639
Walter Dörwald1ab83302007-05-18 17:15:44 +00006640void
6641PyUnicode_Append(PyObject **pleft, PyObject *right)
6642{
6643 PyObject *new;
6644 if (*pleft == NULL)
6645 return;
6646 if (right == NULL || !PyUnicode_Check(*pleft)) {
6647 Py_DECREF(*pleft);
6648 *pleft = NULL;
6649 return;
6650 }
6651 new = PyUnicode_Concat(*pleft, right);
6652 Py_DECREF(*pleft);
6653 *pleft = new;
6654}
6655
6656void
6657PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6658{
6659 PyUnicode_Append(pleft, right);
6660 Py_XDECREF(right);
6661}
6662
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006663PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006664"S.count(sub[, start[, end]]) -> int\n\
6665\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006666Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006667string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006668interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006669
6670static PyObject *
6671unicode_count(PyUnicodeObject *self, PyObject *args)
6672{
6673 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006674 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006675 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006676 PyObject *result;
6677
Guido van Rossumb8872e62000-05-09 14:14:27 +00006678 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6679 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006680 return NULL;
6681
6682 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006683 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006684 if (substring == NULL)
6685 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006686
Thomas Wouters477c8d52006-05-27 19:21:47 +00006687 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006688
Christian Heimes217cfd12007-12-02 14:31:20 +00006689 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006690 stringlib_count(self->str + start, end - start,
6691 substring->str, substring->length)
6692 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006693
6694 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006695
Guido van Rossumd57fd912000-03-10 22:53:23 +00006696 return result;
6697}
6698
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006699PyDoc_STRVAR(encode__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006700"S.encode([encoding[, errors]]) -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006701\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00006702Encode S using the codec registered for encoding. encoding defaults\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006703to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006704handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006705a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6706'xmlcharrefreplace' as well as any other name registered with\n\
6707codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006708
6709static PyObject *
6710unicode_encode(PyUnicodeObject *self, PyObject *args)
6711{
6712 char *encoding = NULL;
6713 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006714 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00006715
Guido van Rossumd57fd912000-03-10 22:53:23 +00006716 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6717 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00006718 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006719 if (v == NULL)
6720 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00006721 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006722 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006723 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006724 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00006725 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006726 Py_DECREF(v);
6727 return NULL;
6728 }
6729 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006730
6731 onError:
6732 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006733}
6734
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006735PyDoc_STRVAR(expandtabs__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006736"S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006737\n\
6738Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006739If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006740
6741static PyObject*
6742unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6743{
6744 Py_UNICODE *e;
6745 Py_UNICODE *p;
6746 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006747 Py_UNICODE *qe;
6748 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006749 PyUnicodeObject *u;
6750 int tabsize = 8;
6751
6752 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6753 return NULL;
6754
Thomas Wouters7e474022000-07-16 12:04:32 +00006755 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006756 i = 0; /* chars up to and including most recent \n or \r */
6757 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6758 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006759 for (p = self->str; p < e; p++)
6760 if (*p == '\t') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006761 if (tabsize > 0) {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006762 incr = tabsize - (j % tabsize); /* cannot overflow */
6763 if (j > PY_SSIZE_T_MAX - incr)
6764 goto overflow1;
6765 j += incr;
6766 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006767 }
6768 else {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006769 if (j > PY_SSIZE_T_MAX - 1)
6770 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006771 j++;
6772 if (*p == '\n' || *p == '\r') {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006773 if (i > PY_SSIZE_T_MAX - j)
6774 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006775 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006776 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006777 }
6778 }
6779
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006780 if (i > PY_SSIZE_T_MAX - j)
6781 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006782
Guido van Rossumd57fd912000-03-10 22:53:23 +00006783 /* Second pass: create output string and fill it */
6784 u = _PyUnicode_New(i + j);
6785 if (!u)
6786 return NULL;
6787
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006788 j = 0; /* same as in first pass */
6789 q = u->str; /* next output char */
6790 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006791
6792 for (p = self->str; p < e; p++)
6793 if (*p == '\t') {
6794 if (tabsize > 0) {
6795 i = tabsize - (j % tabsize);
6796 j += i;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006797 while (i--) {
6798 if (q >= qe)
6799 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006800 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006801 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006802 }
6803 }
6804 else {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006805 if (q >= qe)
6806 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006807 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006808 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006809 if (*p == '\n' || *p == '\r')
6810 j = 0;
6811 }
6812
6813 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006814
6815 overflow2:
6816 Py_DECREF(u);
6817 overflow1:
6818 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6819 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006820}
6821
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006822PyDoc_STRVAR(find__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006823"S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006824\n\
6825Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00006826such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006827arguments start and end are interpreted as in slice notation.\n\
6828\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006829Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006830
6831static PyObject *
6832unicode_find(PyUnicodeObject *self, PyObject *args)
6833{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006834 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00006835 Py_ssize_t start;
6836 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006837 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006838
Christian Heimes9cd17752007-11-18 19:35:23 +00006839 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006840 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006841
Thomas Wouters477c8d52006-05-27 19:21:47 +00006842 result = stringlib_find_slice(
6843 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6844 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6845 start, end
6846 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006847
6848 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006849
Christian Heimes217cfd12007-12-02 14:31:20 +00006850 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006851}
6852
6853static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006854unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006855{
6856 if (index < 0 || index >= self->length) {
6857 PyErr_SetString(PyExc_IndexError, "string index out of range");
6858 return NULL;
6859 }
6860
6861 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6862}
6863
Guido van Rossumc2504932007-09-18 19:42:40 +00006864/* Believe it or not, this produces the same value for ASCII strings
6865 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006866static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00006867unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006868{
Guido van Rossumc2504932007-09-18 19:42:40 +00006869 Py_ssize_t len;
6870 Py_UNICODE *p;
6871 long x;
6872
6873 if (self->hash != -1)
6874 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00006875 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00006876 p = self->str;
6877 x = *p << 7;
6878 while (--len >= 0)
6879 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00006880 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00006881 if (x == -1)
6882 x = -2;
6883 self->hash = x;
6884 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006885}
6886
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006887PyDoc_STRVAR(index__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006888"S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006889\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006890Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006891
6892static PyObject *
6893unicode_index(PyUnicodeObject *self, PyObject *args)
6894{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006895 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006896 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00006897 Py_ssize_t start;
6898 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006899
Christian Heimes9cd17752007-11-18 19:35:23 +00006900 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006901 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006902
Thomas Wouters477c8d52006-05-27 19:21:47 +00006903 result = stringlib_find_slice(
6904 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6905 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6906 start, end
6907 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006908
6909 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006910
Guido van Rossumd57fd912000-03-10 22:53:23 +00006911 if (result < 0) {
6912 PyErr_SetString(PyExc_ValueError, "substring not found");
6913 return NULL;
6914 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006915
Christian Heimes217cfd12007-12-02 14:31:20 +00006916 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006917}
6918
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006919PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006920"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006921\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006922Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006923at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006924
6925static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006926unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006927{
6928 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6929 register const Py_UNICODE *e;
6930 int cased;
6931
Guido van Rossumd57fd912000-03-10 22:53:23 +00006932 /* Shortcut for single character strings */
6933 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006934 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006935
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006936 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006937 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006938 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006939
Guido van Rossumd57fd912000-03-10 22:53:23 +00006940 e = p + PyUnicode_GET_SIZE(self);
6941 cased = 0;
6942 for (; p < e; p++) {
6943 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006944
Guido van Rossumd57fd912000-03-10 22:53:23 +00006945 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006946 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006947 else if (!cased && Py_UNICODE_ISLOWER(ch))
6948 cased = 1;
6949 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006950 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006951}
6952
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006953PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006954"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006955\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006956Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006957at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006958
6959static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006960unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006961{
6962 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6963 register const Py_UNICODE *e;
6964 int cased;
6965
Guido van Rossumd57fd912000-03-10 22:53:23 +00006966 /* Shortcut for single character strings */
6967 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006968 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006969
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006970 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006971 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006972 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006973
Guido van Rossumd57fd912000-03-10 22:53:23 +00006974 e = p + PyUnicode_GET_SIZE(self);
6975 cased = 0;
6976 for (; p < e; p++) {
6977 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006978
Guido van Rossumd57fd912000-03-10 22:53:23 +00006979 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006980 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006981 else if (!cased && Py_UNICODE_ISUPPER(ch))
6982 cased = 1;
6983 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006984 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006985}
6986
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006987PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006988"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006989\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006990Return True if S is a titlecased string and there is at least one\n\
6991character in S, i.e. upper- and titlecase characters may only\n\
6992follow uncased characters and lowercase characters only cased ones.\n\
6993Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006994
6995static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006996unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006997{
6998 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6999 register const Py_UNICODE *e;
7000 int cased, previous_is_cased;
7001
Guido van Rossumd57fd912000-03-10 22:53:23 +00007002 /* Shortcut for single character strings */
7003 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007004 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7005 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007006
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007007 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007008 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007009 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007010
Guido van Rossumd57fd912000-03-10 22:53:23 +00007011 e = p + PyUnicode_GET_SIZE(self);
7012 cased = 0;
7013 previous_is_cased = 0;
7014 for (; p < e; p++) {
7015 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007016
Guido van Rossumd57fd912000-03-10 22:53:23 +00007017 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7018 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007019 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007020 previous_is_cased = 1;
7021 cased = 1;
7022 }
7023 else if (Py_UNICODE_ISLOWER(ch)) {
7024 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007025 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007026 previous_is_cased = 1;
7027 cased = 1;
7028 }
7029 else
7030 previous_is_cased = 0;
7031 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007032 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007033}
7034
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007035PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007036"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007037\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007038Return True if all characters in S are whitespace\n\
7039and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007040
7041static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007042unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007043{
7044 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7045 register const Py_UNICODE *e;
7046
Guido van Rossumd57fd912000-03-10 22:53:23 +00007047 /* Shortcut for single character strings */
7048 if (PyUnicode_GET_SIZE(self) == 1 &&
7049 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007050 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007051
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007052 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007053 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007054 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007055
Guido van Rossumd57fd912000-03-10 22:53:23 +00007056 e = p + PyUnicode_GET_SIZE(self);
7057 for (; p < e; p++) {
7058 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007059 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007060 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007061 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007062}
7063
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007064PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007065"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007066\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007067Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007068and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007069
7070static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007071unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007072{
7073 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7074 register const Py_UNICODE *e;
7075
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007076 /* Shortcut for single character strings */
7077 if (PyUnicode_GET_SIZE(self) == 1 &&
7078 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007079 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007080
7081 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007082 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007083 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007084
7085 e = p + PyUnicode_GET_SIZE(self);
7086 for (; p < e; p++) {
7087 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007088 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007089 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007090 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007091}
7092
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007093PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007094"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007095\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007096Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007097and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007098
7099static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007100unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007101{
7102 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7103 register const Py_UNICODE *e;
7104
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007105 /* Shortcut for single character strings */
7106 if (PyUnicode_GET_SIZE(self) == 1 &&
7107 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007108 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007109
7110 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007111 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007112 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007113
7114 e = p + PyUnicode_GET_SIZE(self);
7115 for (; p < e; p++) {
7116 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007117 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007118 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007119 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007120}
7121
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007122PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007123"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007124\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007125Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007126False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007127
7128static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007129unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007130{
7131 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7132 register const Py_UNICODE *e;
7133
Guido van Rossumd57fd912000-03-10 22:53:23 +00007134 /* Shortcut for single character strings */
7135 if (PyUnicode_GET_SIZE(self) == 1 &&
7136 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007137 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007138
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007139 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007140 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007141 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007142
Guido van Rossumd57fd912000-03-10 22:53:23 +00007143 e = p + PyUnicode_GET_SIZE(self);
7144 for (; p < e; p++) {
7145 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007146 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007147 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007148 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007149}
7150
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007151PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007152"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007153\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007154Return True if all characters in S are digits\n\
7155and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007156
7157static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007158unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007159{
7160 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7161 register const Py_UNICODE *e;
7162
Guido van Rossumd57fd912000-03-10 22:53:23 +00007163 /* Shortcut for single character strings */
7164 if (PyUnicode_GET_SIZE(self) == 1 &&
7165 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007166 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007167
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007168 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007169 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007170 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007171
Guido van Rossumd57fd912000-03-10 22:53:23 +00007172 e = p + PyUnicode_GET_SIZE(self);
7173 for (; p < e; p++) {
7174 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007175 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007176 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007177 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007178}
7179
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007180PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007181"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007182\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007183Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007184False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007185
7186static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007187unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007188{
7189 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7190 register const Py_UNICODE *e;
7191
Guido van Rossumd57fd912000-03-10 22:53:23 +00007192 /* Shortcut for single character strings */
7193 if (PyUnicode_GET_SIZE(self) == 1 &&
7194 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007195 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007196
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007197 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007198 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007199 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007200
Guido van Rossumd57fd912000-03-10 22:53:23 +00007201 e = p + PyUnicode_GET_SIZE(self);
7202 for (; p < e; p++) {
7203 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007204 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007205 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007206 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007207}
7208
Martin v. Löwis47383402007-08-15 07:32:56 +00007209int
7210PyUnicode_IsIdentifier(PyObject *self)
7211{
7212 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7213 register const Py_UNICODE *e;
7214
7215 /* Special case for empty strings */
7216 if (PyUnicode_GET_SIZE(self) == 0)
7217 return 0;
7218
7219 /* PEP 3131 says that the first character must be in
7220 XID_Start and subsequent characters in XID_Continue,
7221 and for the ASCII range, the 2.x rules apply (i.e
7222 start with letters and underscore, continue with
7223 letters, digits, underscore). However, given the current
7224 definition of XID_Start and XID_Continue, it is sufficient
7225 to check just for these, except that _ must be allowed
7226 as starting an identifier. */
7227 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7228 return 0;
7229
7230 e = p + PyUnicode_GET_SIZE(self);
7231 for (p++; p < e; p++) {
7232 if (!_PyUnicode_IsXidContinue(*p))
7233 return 0;
7234 }
7235 return 1;
7236}
7237
7238PyDoc_STRVAR(isidentifier__doc__,
7239"S.isidentifier() -> bool\n\
7240\n\
7241Return True if S is a valid identifier according\n\
7242to the language definition.");
7243
7244static PyObject*
7245unicode_isidentifier(PyObject *self)
7246{
7247 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7248}
7249
Georg Brandl559e5d72008-06-11 18:37:52 +00007250PyDoc_STRVAR(isprintable__doc__,
7251"S.isprintable() -> bool\n\
7252\n\
7253Return True if all characters in S are considered\n\
7254printable in repr() or S is empty, False otherwise.");
7255
7256static PyObject*
7257unicode_isprintable(PyObject *self)
7258{
7259 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7260 register const Py_UNICODE *e;
7261
7262 /* Shortcut for single character strings */
7263 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7264 Py_RETURN_TRUE;
7265 }
7266
7267 e = p + PyUnicode_GET_SIZE(self);
7268 for (; p < e; p++) {
7269 if (!Py_UNICODE_ISPRINTABLE(*p)) {
7270 Py_RETURN_FALSE;
7271 }
7272 }
7273 Py_RETURN_TRUE;
7274}
7275
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007276PyDoc_STRVAR(join__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007277"S.join(sequence) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007278\n\
7279Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007280sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007281
7282static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007283unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007284{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007285 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007286}
7287
Martin v. Löwis18e16552006-02-15 17:27:45 +00007288static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007289unicode_length(PyUnicodeObject *self)
7290{
7291 return self->length;
7292}
7293
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007294PyDoc_STRVAR(ljust__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007295"S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007296\n\
7297Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007298done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007299
7300static PyObject *
7301unicode_ljust(PyUnicodeObject *self, PyObject *args)
7302{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007303 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007304 Py_UNICODE fillchar = ' ';
7305
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007306 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007307 return NULL;
7308
Tim Peters7a29bd52001-09-12 03:03:31 +00007309 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007310 Py_INCREF(self);
7311 return (PyObject*) self;
7312 }
7313
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007314 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007315}
7316
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007317PyDoc_STRVAR(lower__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007318"S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007319\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007320Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007321
7322static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007323unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007324{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007325 return fixup(self, fixlower);
7326}
7327
/* Selector values for the strip helpers: which end(s) to trim. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by the selector values above. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for a selector: the format string minus its 3-char "|O:"
   prefix. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
7336
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007337/* externally visible for str.strip(unicode) */
7338PyObject *
7339_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7340{
7341 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007342 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007343 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007344 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7345 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007346
Thomas Wouters477c8d52006-05-27 19:21:47 +00007347 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7348
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007349 i = 0;
7350 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007351 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7352 i++;
7353 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007354 }
7355
7356 j = len;
7357 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007358 do {
7359 j--;
7360 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7361 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007362 }
7363
7364 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007365 Py_INCREF(self);
7366 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007367 }
7368 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00007369 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007370}
7371
Guido van Rossumd57fd912000-03-10 22:53:23 +00007372
7373static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007374do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007375{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007376 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007377 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007378
7379 i = 0;
7380 if (striptype != RIGHTSTRIP) {
7381 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7382 i++;
7383 }
7384 }
7385
7386 j = len;
7387 if (striptype != LEFTSTRIP) {
7388 do {
7389 j--;
7390 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7391 j++;
7392 }
7393
7394 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7395 Py_INCREF(self);
7396 return (PyObject*)self;
7397 }
7398 else
7399 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007400}
7401
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007402
7403static PyObject *
7404do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7405{
7406 PyObject *sep = NULL;
7407
7408 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7409 return NULL;
7410
7411 if (sep != NULL && sep != Py_None) {
7412 if (PyUnicode_Check(sep))
7413 return _PyUnicode_XStrip(self, striptype, sep);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007414 else {
7415 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00007416 "%s arg must be None or str",
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007417 STRIPNAME(striptype));
7418 return NULL;
7419 }
7420 }
7421
7422 return do_strip(self, striptype);
7423}
7424
7425
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007426PyDoc_STRVAR(strip__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007427"S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007428\n\
7429Return a copy of the string S with leading and trailing\n\
7430whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007431If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007432
7433static PyObject *
7434unicode_strip(PyUnicodeObject *self, PyObject *args)
7435{
7436 if (PyTuple_GET_SIZE(args) == 0)
7437 return do_strip(self, BOTHSTRIP); /* Common case */
7438 else
7439 return do_argstrip(self, BOTHSTRIP, args);
7440}
7441
7442
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007443PyDoc_STRVAR(lstrip__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007444"S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007445\n\
7446Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007447If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007448
7449static PyObject *
7450unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7451{
7452 if (PyTuple_GET_SIZE(args) == 0)
7453 return do_strip(self, LEFTSTRIP); /* Common case */
7454 else
7455 return do_argstrip(self, LEFTSTRIP, args);
7456}
7457
7458
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007459PyDoc_STRVAR(rstrip__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007460"S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007461\n\
7462Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007463If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007464
7465static PyObject *
7466unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7467{
7468 if (PyTuple_GET_SIZE(args) == 0)
7469 return do_strip(self, RIGHTSTRIP); /* Common case */
7470 else
7471 return do_argstrip(self, RIGHTSTRIP, args);
7472}
7473
7474
Guido van Rossumd57fd912000-03-10 22:53:23 +00007475static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007476unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007477{
7478 PyUnicodeObject *u;
7479 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007480 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007481 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007482
7483 if (len < 0)
7484 len = 0;
7485
Tim Peters7a29bd52001-09-12 03:03:31 +00007486 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007487 /* no repeat, return original string */
7488 Py_INCREF(str);
7489 return (PyObject*) str;
7490 }
Tim Peters8f422462000-09-09 06:13:41 +00007491
7492 /* ensure # of chars needed doesn't overflow int and # of bytes
7493 * needed doesn't overflow size_t
7494 */
7495 nchars = len * str->length;
7496 if (len && nchars / len != str->length) {
7497 PyErr_SetString(PyExc_OverflowError,
7498 "repeated string is too long");
7499 return NULL;
7500 }
7501 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7502 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7503 PyErr_SetString(PyExc_OverflowError,
7504 "repeated string is too long");
7505 return NULL;
7506 }
7507 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007508 if (!u)
7509 return NULL;
7510
7511 p = u->str;
7512
Thomas Wouters477c8d52006-05-27 19:21:47 +00007513 if (str->length == 1 && len > 0) {
7514 Py_UNICODE_FILL(p, str->str[0], len);
7515 } else {
7516 Py_ssize_t done = 0; /* number of characters copied this far */
7517 if (done < nchars) {
7518 Py_UNICODE_COPY(p, str->str, str->length);
7519 done = str->length;
7520 }
7521 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00007522 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007523 Py_UNICODE_COPY(p+done, p, n);
7524 done += n;
7525 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007526 }
7527
7528 return (PyObject*) u;
7529}
7530
7531PyObject *PyUnicode_Replace(PyObject *obj,
7532 PyObject *subobj,
7533 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007534 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007535{
7536 PyObject *self;
7537 PyObject *str1;
7538 PyObject *str2;
7539 PyObject *result;
7540
7541 self = PyUnicode_FromObject(obj);
7542 if (self == NULL)
7543 return NULL;
7544 str1 = PyUnicode_FromObject(subobj);
7545 if (str1 == NULL) {
7546 Py_DECREF(self);
7547 return NULL;
7548 }
7549 str2 = PyUnicode_FromObject(replobj);
7550 if (str2 == NULL) {
7551 Py_DECREF(self);
7552 Py_DECREF(str1);
7553 return NULL;
7554 }
Tim Petersced69f82003-09-16 20:30:58 +00007555 result = replace((PyUnicodeObject *)self,
7556 (PyUnicodeObject *)str1,
7557 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007558 maxcount);
7559 Py_DECREF(self);
7560 Py_DECREF(str1);
7561 Py_DECREF(str2);
7562 return result;
7563}
7564
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007565PyDoc_STRVAR(replace__doc__,
Georg Brandlf08a9dd2008-06-10 16:57:31 +00007566"S.replace (old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007567\n\
7568Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00007569old replaced by new. If the optional argument count is\n\
7570given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007571
7572static PyObject*
7573unicode_replace(PyUnicodeObject *self, PyObject *args)
7574{
7575 PyUnicodeObject *str1;
7576 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007577 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007578 PyObject *result;
7579
Martin v. Löwis18e16552006-02-15 17:27:45 +00007580 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007581 return NULL;
7582 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7583 if (str1 == NULL)
7584 return NULL;
7585 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007586 if (str2 == NULL) {
7587 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007588 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007589 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007590
7591 result = replace(self, str1, str2, maxcount);
7592
7593 Py_DECREF(str1);
7594 Py_DECREF(str2);
7595 return result;
7596}
7597
7598static
7599PyObject *unicode_repr(PyObject *unicode)
7600{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007601 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007602 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007603 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7604 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7605
7606 /* XXX(nnorwitz): rather than over-allocating, it would be
7607 better to choose a different scheme. Perhaps scan the
7608 first N-chars of the string and allocate based on that size.
7609 */
7610 /* Initial allocation is based on the longest-possible unichr
7611 escape.
7612
7613 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7614 unichr, so in this case it's the longest unichr escape. In
7615 narrow (UTF-16) builds this is five chars per source unichr
7616 since there are two unichrs in the surrogate pair, so in narrow
7617 (UTF-16) builds it's not the longest unichr escape.
7618
7619 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7620 so in the narrow (UTF-16) build case it's the longest unichr
7621 escape.
7622 */
7623
Walter Dörwald1ab83302007-05-18 17:15:44 +00007624 repr = PyUnicode_FromUnicode(NULL,
Walter Dörwald79e913e2007-05-12 11:08:06 +00007625 2 /* quotes */
7626#ifdef Py_UNICODE_WIDE
7627 + 10*size
7628#else
7629 + 6*size
7630#endif
7631 + 1);
7632 if (repr == NULL)
7633 return NULL;
7634
Walter Dörwald1ab83302007-05-18 17:15:44 +00007635 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007636
7637 /* Add quote */
7638 *p++ = (findchar(s, size, '\'') &&
7639 !findchar(s, size, '"')) ? '"' : '\'';
7640 while (size-- > 0) {
7641 Py_UNICODE ch = *s++;
7642
7643 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007644 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007645 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007646 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007647 continue;
7648 }
7649
Georg Brandl559e5d72008-06-11 18:37:52 +00007650 /* Map special whitespace to '\t', \n', '\r' */
7651 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007652 *p++ = '\\';
7653 *p++ = 't';
7654 }
7655 else if (ch == '\n') {
7656 *p++ = '\\';
7657 *p++ = 'n';
7658 }
7659 else if (ch == '\r') {
7660 *p++ = '\\';
7661 *p++ = 'r';
7662 }
7663
7664 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00007665 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007666 *p++ = '\\';
7667 *p++ = 'x';
7668 *p++ = hexdigits[(ch >> 4) & 0x000F];
7669 *p++ = hexdigits[ch & 0x000F];
7670 }
7671
Georg Brandl559e5d72008-06-11 18:37:52 +00007672 /* Copy ASCII characters as-is */
7673 else if (ch < 0x7F) {
7674 *p++ = ch;
7675 }
7676
7677 /* Non-ASCII characters */
7678 else {
7679 Py_UCS4 ucs = ch;
7680
7681#ifndef Py_UNICODE_WIDE
7682 Py_UNICODE ch2 = 0;
7683 /* Get code point from surrogate pair */
7684 if (size > 0) {
7685 ch2 = *s;
7686 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
7687 && ch2 <= 0xDFFF) {
7688 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
7689 + 0x00010000;
7690 s++;
7691 size--;
7692 }
7693 }
7694#endif
7695 /* Map Unicode whitespace and control characters
7696 (categories Z* and C* except ASCII space)
7697 */
7698 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
7699 /* Map 8-bit characters to '\xhh' */
7700 if (ucs <= 0xff) {
7701 *p++ = '\\';
7702 *p++ = 'x';
7703 *p++ = hexdigits[(ch >> 4) & 0x000F];
7704 *p++ = hexdigits[ch & 0x000F];
7705 }
7706 /* Map 21-bit characters to '\U00xxxxxx' */
7707 else if (ucs >= 0x10000) {
7708 *p++ = '\\';
7709 *p++ = 'U';
7710 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7711 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7712 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7713 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7714 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7715 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7716 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7717 *p++ = hexdigits[ucs & 0x0000000F];
7718 }
7719 /* Map 16-bit characters to '\uxxxx' */
7720 else {
7721 *p++ = '\\';
7722 *p++ = 'u';
7723 *p++ = hexdigits[(ucs >> 12) & 0x000F];
7724 *p++ = hexdigits[(ucs >> 8) & 0x000F];
7725 *p++ = hexdigits[(ucs >> 4) & 0x000F];
7726 *p++ = hexdigits[ucs & 0x000F];
7727 }
7728 }
7729 /* Copy characters as-is */
7730 else {
7731 *p++ = ch;
7732#ifndef Py_UNICODE_WIDE
7733 if (ucs >= 0x10000)
7734 *p++ = ch2;
7735#endif
7736 }
7737 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00007738 }
7739 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007740 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00007741
7742 *p = '\0';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007743 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00007744 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007745}
7746
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007747PyDoc_STRVAR(rfind__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007748"S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007749\n\
7750Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007751such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007752arguments start and end are interpreted as in slice notation.\n\
7753\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007754Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007755
7756static PyObject *
7757unicode_rfind(PyUnicodeObject *self, PyObject *args)
7758{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007759 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007760 Py_ssize_t start;
7761 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007762 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007763
Christian Heimes9cd17752007-11-18 19:35:23 +00007764 if (!_ParseTupleFinds(args, &substring, &start, &end))
7765 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007766
Thomas Wouters477c8d52006-05-27 19:21:47 +00007767 result = stringlib_rfind_slice(
7768 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7769 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7770 start, end
7771 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007772
7773 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007774
Christian Heimes217cfd12007-12-02 14:31:20 +00007775 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007776}
7777
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007778PyDoc_STRVAR(rindex__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007779"S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007780\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007781Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007782
7783static PyObject *
7784unicode_rindex(PyUnicodeObject *self, PyObject *args)
7785{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007786 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007787 Py_ssize_t start;
7788 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007789 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007790
Christian Heimes9cd17752007-11-18 19:35:23 +00007791 if (!_ParseTupleFinds(args, &substring, &start, &end))
7792 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007793
Thomas Wouters477c8d52006-05-27 19:21:47 +00007794 result = stringlib_rfind_slice(
7795 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7796 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7797 start, end
7798 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007799
7800 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007801
Guido van Rossumd57fd912000-03-10 22:53:23 +00007802 if (result < 0) {
7803 PyErr_SetString(PyExc_ValueError, "substring not found");
7804 return NULL;
7805 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007806 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007807}
7808
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007809PyDoc_STRVAR(rjust__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007810"S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007811\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007812Return S right justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007813done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007814
7815static PyObject *
7816unicode_rjust(PyUnicodeObject *self, PyObject *args)
7817{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007818 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007819 Py_UNICODE fillchar = ' ';
7820
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007821 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007822 return NULL;
7823
Tim Peters7a29bd52001-09-12 03:03:31 +00007824 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007825 Py_INCREF(self);
7826 return (PyObject*) self;
7827 }
7828
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007829 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007830}
7831
Guido van Rossumd57fd912000-03-10 22:53:23 +00007832PyObject *PyUnicode_Split(PyObject *s,
7833 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007834 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007835{
7836 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007837
Guido van Rossumd57fd912000-03-10 22:53:23 +00007838 s = PyUnicode_FromObject(s);
7839 if (s == NULL)
7840 return NULL;
7841 if (sep != NULL) {
7842 sep = PyUnicode_FromObject(sep);
7843 if (sep == NULL) {
7844 Py_DECREF(s);
7845 return NULL;
7846 }
7847 }
7848
7849 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7850
7851 Py_DECREF(s);
7852 Py_XDECREF(sep);
7853 return result;
7854}
7855
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007856PyDoc_STRVAR(split__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007857"S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007858\n\
7859Return a list of the words in S, using sep as the\n\
7860delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00007861splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00007862whitespace string is a separator and empty strings are\n\
7863removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007864
7865static PyObject*
7866unicode_split(PyUnicodeObject *self, PyObject *args)
7867{
7868 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007869 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007870
Martin v. Löwis18e16552006-02-15 17:27:45 +00007871 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007872 return NULL;
7873
7874 if (substring == Py_None)
7875 return split(self, NULL, maxcount);
7876 else if (PyUnicode_Check(substring))
7877 return split(self, (PyUnicodeObject *)substring, maxcount);
7878 else
7879 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7880}
7881
Thomas Wouters477c8d52006-05-27 19:21:47 +00007882PyObject *
7883PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7884{
7885 PyObject* str_obj;
7886 PyObject* sep_obj;
7887 PyObject* out;
7888
7889 str_obj = PyUnicode_FromObject(str_in);
7890 if (!str_obj)
7891 return NULL;
7892 sep_obj = PyUnicode_FromObject(sep_in);
7893 if (!sep_obj) {
7894 Py_DECREF(str_obj);
7895 return NULL;
7896 }
7897
7898 out = stringlib_partition(
7899 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7900 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7901 );
7902
7903 Py_DECREF(sep_obj);
7904 Py_DECREF(str_obj);
7905
7906 return out;
7907}
7908
7909
7910PyObject *
7911PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7912{
7913 PyObject* str_obj;
7914 PyObject* sep_obj;
7915 PyObject* out;
7916
7917 str_obj = PyUnicode_FromObject(str_in);
7918 if (!str_obj)
7919 return NULL;
7920 sep_obj = PyUnicode_FromObject(sep_in);
7921 if (!sep_obj) {
7922 Py_DECREF(str_obj);
7923 return NULL;
7924 }
7925
7926 out = stringlib_rpartition(
7927 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7928 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7929 );
7930
7931 Py_DECREF(sep_obj);
7932 Py_DECREF(str_obj);
7933
7934 return out;
7935}
7936
7937PyDoc_STRVAR(partition__doc__,
7938"S.partition(sep) -> (head, sep, tail)\n\
7939\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00007940Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007941the separator itself, and the part after it. If the separator is not\n\
7942found, returns S and two empty strings.");
7943
7944static PyObject*
7945unicode_partition(PyUnicodeObject *self, PyObject *separator)
7946{
7947 return PyUnicode_Partition((PyObject *)self, separator);
7948}
7949
7950PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007951"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007952\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00007953Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007954the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00007955separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007956
7957static PyObject*
7958unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7959{
7960 return PyUnicode_RPartition((PyObject *)self, separator);
7961}
7962
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007963PyObject *PyUnicode_RSplit(PyObject *s,
7964 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007965 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007966{
7967 PyObject *result;
7968
7969 s = PyUnicode_FromObject(s);
7970 if (s == NULL)
7971 return NULL;
7972 if (sep != NULL) {
7973 sep = PyUnicode_FromObject(sep);
7974 if (sep == NULL) {
7975 Py_DECREF(s);
7976 return NULL;
7977 }
7978 }
7979
7980 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7981
7982 Py_DECREF(s);
7983 Py_XDECREF(sep);
7984 return result;
7985}
7986
7987PyDoc_STRVAR(rsplit__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007988"S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007989\n\
7990Return a list of the words in S, using sep as the\n\
7991delimiter string, starting at the end of the string and\n\
7992working to the front. If maxsplit is given, at most maxsplit\n\
7993splits are done. If sep is not specified, any whitespace string\n\
7994is a separator.");
7995
7996static PyObject*
7997unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7998{
7999 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008000 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008001
Martin v. Löwis18e16552006-02-15 17:27:45 +00008002 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008003 return NULL;
8004
8005 if (substring == Py_None)
8006 return rsplit(self, NULL, maxcount);
8007 else if (PyUnicode_Check(substring))
8008 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
8009 else
8010 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
8011}
8012
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008013PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00008014"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008015\n\
8016Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008017Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008018is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008019
8020static PyObject*
8021unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8022{
Guido van Rossum86662912000-04-11 15:38:46 +00008023 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008024
Guido van Rossum86662912000-04-11 15:38:46 +00008025 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008026 return NULL;
8027
Guido van Rossum86662912000-04-11 15:38:46 +00008028 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008029}
8030
8031static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008032PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008033{
Walter Dörwald346737f2007-05-31 10:44:43 +00008034 if (PyUnicode_CheckExact(self)) {
8035 Py_INCREF(self);
8036 return self;
8037 } else
8038 /* Subtype -- return genuine unicode string with the same value. */
8039 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8040 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008041}
8042
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008043PyDoc_STRVAR(swapcase__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008044"S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008045\n\
8046Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008047and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008048
8049static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008050unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008051{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008052 return fixup(self, fixswapcase);
8053}
8054
Georg Brandlceee0772007-11-27 23:48:05 +00008055PyDoc_STRVAR(maketrans__doc__,
8056"str.maketrans(x[, y[, z]]) -> dict (static method)\n\
8057\n\
8058Return a translation table usable for str.translate().\n\
8059If there is only one argument, it must be a dictionary mapping Unicode\n\
8060ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008061Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008062If there are two arguments, they must be strings of equal length, and\n\
8063in the resulting dictionary, each character in x will be mapped to the\n\
8064character at the same position in y. If there is a third argument, it\n\
8065must be a string, whose characters will be mapped to None in the result.");
8066
8067static PyObject*
8068unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8069{
8070 PyObject *x, *y = NULL, *z = NULL;
8071 PyObject *new = NULL, *key, *value;
8072 Py_ssize_t i = 0;
8073 int res;
8074
8075 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8076 return NULL;
8077 new = PyDict_New();
8078 if (!new)
8079 return NULL;
8080 if (y != NULL) {
8081 /* x must be a string too, of equal length */
8082 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8083 if (!PyUnicode_Check(x)) {
8084 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8085 "be a string if there is a second argument");
8086 goto err;
8087 }
8088 if (PyUnicode_GET_SIZE(x) != ylen) {
8089 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8090 "arguments must have equal length");
8091 goto err;
8092 }
8093 /* create entries for translating chars in x to those in y */
8094 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008095 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8096 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008097 if (!key || !value)
8098 goto err;
8099 res = PyDict_SetItem(new, key, value);
8100 Py_DECREF(key);
8101 Py_DECREF(value);
8102 if (res < 0)
8103 goto err;
8104 }
8105 /* create entries for deleting chars in z */
8106 if (z != NULL) {
8107 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008108 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008109 if (!key)
8110 goto err;
8111 res = PyDict_SetItem(new, key, Py_None);
8112 Py_DECREF(key);
8113 if (res < 0)
8114 goto err;
8115 }
8116 }
8117 } else {
8118 /* x must be a dict */
8119 if (!PyDict_Check(x)) {
8120 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8121 "to maketrans it must be a dict");
8122 goto err;
8123 }
8124 /* copy entries into the new dict, converting string keys to int keys */
8125 while (PyDict_Next(x, &i, &key, &value)) {
8126 if (PyUnicode_Check(key)) {
8127 /* convert string keys to integer keys */
8128 PyObject *newkey;
8129 if (PyUnicode_GET_SIZE(key) != 1) {
8130 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8131 "table must be of length 1");
8132 goto err;
8133 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008134 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008135 if (!newkey)
8136 goto err;
8137 res = PyDict_SetItem(new, newkey, value);
8138 Py_DECREF(newkey);
8139 if (res < 0)
8140 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008141 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008142 /* just keep integer keys */
8143 if (PyDict_SetItem(new, key, value) < 0)
8144 goto err;
8145 } else {
8146 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8147 "be strings or integers");
8148 goto err;
8149 }
8150 }
8151 }
8152 return new;
8153 err:
8154 Py_DECREF(new);
8155 return NULL;
8156}
8157
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008158PyDoc_STRVAR(translate__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008159"S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008160\n\
8161Return a copy of the string S, where all characters have been mapped\n\
8162through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008163Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008164Unmapped characters are left untouched. Characters mapped to None\n\
8165are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008166
8167static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008168unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008169{
Georg Brandlceee0772007-11-27 23:48:05 +00008170 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008171}
8172
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008173PyDoc_STRVAR(upper__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008174"S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008175\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008176Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008177
8178static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008179unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008180{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008181 return fixup(self, fixupper);
8182}
8183
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008184PyDoc_STRVAR(zfill__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008185"S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008186\n\
8187Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008188of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008189
8190static PyObject *
8191unicode_zfill(PyUnicodeObject *self, PyObject *args)
8192{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008193 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008194 PyUnicodeObject *u;
8195
Martin v. Löwis18e16552006-02-15 17:27:45 +00008196 Py_ssize_t width;
8197 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008198 return NULL;
8199
8200 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008201 if (PyUnicode_CheckExact(self)) {
8202 Py_INCREF(self);
8203 return (PyObject*) self;
8204 }
8205 else
8206 return PyUnicode_FromUnicode(
8207 PyUnicode_AS_UNICODE(self),
8208 PyUnicode_GET_SIZE(self)
8209 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008210 }
8211
8212 fill = width - self->length;
8213
8214 u = pad(self, fill, 0, '0');
8215
Walter Dörwald068325e2002-04-15 13:36:47 +00008216 if (u == NULL)
8217 return NULL;
8218
Guido van Rossumd57fd912000-03-10 22:53:23 +00008219 if (u->str[fill] == '+' || u->str[fill] == '-') {
8220 /* move sign to beginning of string */
8221 u->str[0] = u->str[fill];
8222 u->str[fill] = '0';
8223 }
8224
8225 return (PyObject*) u;
8226}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008227
#if 0
/* Compiled out: returns the file-level numfree counter as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyLong_FromLong(count);
}
#endif
8235
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008236PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00008237"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008238\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008239Return True if S starts with the specified prefix, False otherwise.\n\
8240With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008241With optional end, stop comparing S at that position.\n\
8242prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008243
8244static PyObject *
8245unicode_startswith(PyUnicodeObject *self,
8246 PyObject *args)
8247{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008248 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008249 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008250 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008251 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008252 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008253
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008254 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00008255 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008256 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008257 if (PyTuple_Check(subobj)) {
8258 Py_ssize_t i;
8259 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8260 substring = (PyUnicodeObject *)PyUnicode_FromObject(
8261 PyTuple_GET_ITEM(subobj, i));
8262 if (substring == NULL)
8263 return NULL;
8264 result = tailmatch(self, substring, start, end, -1);
8265 Py_DECREF(substring);
8266 if (result) {
8267 Py_RETURN_TRUE;
8268 }
8269 }
8270 /* nothing matched */
8271 Py_RETURN_FALSE;
8272 }
8273 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008274 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008275 return NULL;
8276 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008277 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008278 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008279}
8280
8281
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008282PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00008283"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008284\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008285Return True if S ends with the specified suffix, False otherwise.\n\
8286With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008287With optional end, stop comparing S at that position.\n\
8288suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008289
8290static PyObject *
8291unicode_endswith(PyUnicodeObject *self,
8292 PyObject *args)
8293{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008294 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008295 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008296 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008297 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008298 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008299
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008300 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
8301 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008302 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008303 if (PyTuple_Check(subobj)) {
8304 Py_ssize_t i;
8305 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8306 substring = (PyUnicodeObject *)PyUnicode_FromObject(
8307 PyTuple_GET_ITEM(subobj, i));
8308 if (substring == NULL)
8309 return NULL;
8310 result = tailmatch(self, substring, start, end, +1);
8311 Py_DECREF(substring);
8312 if (result) {
8313 Py_RETURN_TRUE;
8314 }
8315 }
8316 Py_RETURN_FALSE;
8317 }
8318 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008319 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008320 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008321
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008322 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008323 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008324 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008325}
8326
Eric Smith8c663262007-08-25 02:26:07 +00008327#include "stringlib/string_format.h"
8328
8329PyDoc_STRVAR(format__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008330"S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008331\n\
8332");
8333
Eric Smith4a7d76d2008-05-30 18:10:19 +00008334static PyObject *
8335unicode__format__(PyObject* self, PyObject* args)
8336{
8337 PyObject *format_spec;
8338
8339 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8340 return NULL;
8341
8342 return _PyUnicode_FormatAdvanced(self,
8343 PyUnicode_AS_UNICODE(format_spec),
8344 PyUnicode_GET_SIZE(format_spec));
8345}
8346
Eric Smith8c663262007-08-25 02:26:07 +00008347PyDoc_STRVAR(p_format__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008348"S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008349\n\
8350");
8351
8352static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008353unicode__sizeof__(PyUnicodeObject *v)
8354{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00008355 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8356 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008357}
8358
8359PyDoc_STRVAR(sizeof__doc__,
8360"S.__sizeof__() -> size of S in memory, in bytes");
8361
8362static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008363unicode_getnewargs(PyUnicodeObject *v)
8364{
8365 return Py_BuildValue("(u#)", v->str, v->length);
8366}
8367
8368
Guido van Rossumd57fd912000-03-10 22:53:23 +00008369static PyMethodDef unicode_methods[] = {
8370
8371 /* Order is according to common usage: often used methods should
8372 appear first, since lookup is done sequentially. */
8373
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008374 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
8375 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8376 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008377 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008378 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8379 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8380 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8381 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8382 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8383 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8384 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008385 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008386 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8387 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8388 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008389 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008390 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8391 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8392 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008393 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008394 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008395 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008396 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008397 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8398 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8399 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8400 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8401 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8402 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8403 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8404 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8405 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8406 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8407 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8408 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8409 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8410 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008411 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00008412 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008413 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008414 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00008415 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008416 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8417 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Georg Brandlceee0772007-11-27 23:48:05 +00008418 {"maketrans", (PyCFunction) unicode_maketrans,
8419 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008420 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008421#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008422 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008423#endif
8424
8425#if 0
8426 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008427 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008428#endif
8429
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008430 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008431 {NULL, NULL}
8432};
8433
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008434static PyObject *
8435unicode_mod(PyObject *v, PyObject *w)
8436{
8437 if (!PyUnicode_Check(v)) {
8438 Py_INCREF(Py_NotImplemented);
8439 return Py_NotImplemented;
8440 }
8441 return PyUnicode_Format(v, w);
8442}
8443
8444static PyNumberMethods unicode_as_number = {
8445 0, /*nb_add*/
8446 0, /*nb_subtract*/
8447 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008448 unicode_mod, /*nb_remainder*/
8449};
8450
Guido van Rossumd57fd912000-03-10 22:53:23 +00008451static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008452 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008453 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008454 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8455 (ssizeargfunc) unicode_getitem, /* sq_item */
Thomas Woutersd2cf20e2007-08-30 22:57:53 +00008456 0, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008457 0, /* sq_ass_item */
8458 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008459 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008460};
8461
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008462static PyObject*
8463unicode_subscript(PyUnicodeObject* self, PyObject* item)
8464{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008465 if (PyIndex_Check(item)) {
8466 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008467 if (i == -1 && PyErr_Occurred())
8468 return NULL;
8469 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008470 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008471 return unicode_getitem(self, i);
8472 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008473 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008474 Py_UNICODE* source_buf;
8475 Py_UNICODE* result_buf;
8476 PyObject* result;
8477
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008478 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008479 &start, &stop, &step, &slicelength) < 0) {
8480 return NULL;
8481 }
8482
8483 if (slicelength <= 0) {
8484 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008485 } else if (start == 0 && step == 1 && slicelength == self->length &&
8486 PyUnicode_CheckExact(self)) {
8487 Py_INCREF(self);
8488 return (PyObject *)self;
8489 } else if (step == 1) {
8490 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008491 } else {
8492 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00008493 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8494 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008495
8496 if (result_buf == NULL)
8497 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008498
8499 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8500 result_buf[i] = source_buf[cur];
8501 }
Tim Petersced69f82003-09-16 20:30:58 +00008502
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008503 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00008504 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008505 return result;
8506 }
8507 } else {
8508 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8509 return NULL;
8510 }
8511}
8512
8513static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008514 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008515 (binaryfunc)unicode_subscript, /* mp_subscript */
8516 (objobjargproc)0, /* mp_ass_subscript */
8517};
8518
Guido van Rossumd57fd912000-03-10 22:53:23 +00008519
Guido van Rossumd57fd912000-03-10 22:53:23 +00008520/* Helpers for PyUnicode_Format() */
8521
8522static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008523getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008524{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008525 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008526 if (argidx < arglen) {
8527 (*p_argidx)++;
8528 if (arglen < 0)
8529 return args;
8530 else
8531 return PyTuple_GetItem(args, argidx);
8532 }
8533 PyErr_SetString(PyExc_TypeError,
8534 "not enough arguments for format string");
8535 return NULL;
8536}
8537
Martin v. Löwis18e16552006-02-15 17:27:45 +00008538static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008539strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008540{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008541 register Py_ssize_t i;
8542 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008543 for (i = len - 1; i >= 0; i--)
8544 buffer[i] = (Py_UNICODE) charbuffer[i];
8545
Guido van Rossumd57fd912000-03-10 22:53:23 +00008546 return len;
8547}
8548
Neal Norwitzfc76d632006-01-10 06:03:13 +00008549static int
8550doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8551{
Tim Peters15231542006-02-16 01:08:01 +00008552 Py_ssize_t result;
8553
Neal Norwitzfc76d632006-01-10 06:03:13 +00008554 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008555 result = strtounicode(buffer, (char *)buffer);
8556 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008557}
8558
#if 0
/* Disabled.  Format the long x according to format into buffer: the
   Py_UNICODE buffer is first filled as char storage by PyOS_snprintf()
   and then widened in place by strtounicode().  Returns the number of
   characters produced. */
static int
longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
{
    char *scratch = (char *)buffer;
    Py_ssize_t nchars;

    PyOS_snprintf(scratch, len, format, x);
    nchars = strtounicode(buffer, scratch);
    return Py_SAFE_DOWNCAST(nchars, Py_ssize_t, int);
}
#endif
Neal Norwitzfc76d632006-01-10 06:03:13 +00008570
Guido van Rossum078151d2002-08-11 04:24:12 +00008571/* XXX To save some code duplication, formatfloat/long/int could have been
8572 shared with stringobject.c, converting from 8-bit to Unicode after the
8573 formatting is done. */
8574
Guido van Rossumd57fd912000-03-10 22:53:23 +00008575static int
8576formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008577 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008578 int flags,
8579 int prec,
8580 int type,
8581 PyObject *v)
8582{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008583 /* fmt = '%#.' + `prec` + `type`
8584 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00008585 char fmt[20];
8586 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008587
Guido van Rossumd57fd912000-03-10 22:53:23 +00008588 x = PyFloat_AsDouble(v);
8589 if (x == -1.0 && PyErr_Occurred())
8590 return -1;
8591 if (prec < 0)
8592 prec = 6;
Eric Smith22b85b32008-07-17 19:18:29 +00008593 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8594 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008595 /* Worst case length calc to ensure no buffer overrun:
8596
8597 'g' formats:
8598 fmt = %#.<prec>g
8599 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8600 for any double rep.)
8601 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8602
8603 'f' formats:
8604 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8605 len = 1 + 50 + 1 + prec = 52 + prec
8606
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008607 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008608 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008609
8610 */
Guido van Rossumb5a755e2007-07-18 18:15:48 +00008611 if (((type == 'g' || type == 'G') &&
8612 buflen <= (size_t)10 + (size_t)prec) ||
Eric Smith22b85b32008-07-17 19:18:29 +00008613 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008614 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008615 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008616 return -1;
8617 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008618 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8619 (flags&F_ALT) ? "#" : "",
8620 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008621 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008622}
8623
Tim Peters38fd5b62000-09-21 05:43:11 +00008624static PyObject*
8625formatlong(PyObject *val, int flags, int prec, int type)
8626{
8627 char *buf;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008628 int len;
Tim Peters38fd5b62000-09-21 05:43:11 +00008629 PyObject *str; /* temporary string object. */
Walter Dörwald63a28be2007-06-20 15:11:12 +00008630 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008631
Christian Heimes72b710a2008-05-26 13:28:38 +00008632 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
Tim Peters38fd5b62000-09-21 05:43:11 +00008633 if (!str)
8634 return NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008635 result = PyUnicode_FromStringAndSize(buf, len);
Tim Peters38fd5b62000-09-21 05:43:11 +00008636 Py_DECREF(str);
Walter Dörwald63a28be2007-06-20 15:11:12 +00008637 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008638}
8639
#if 0
/* Disabled.  Format the integer object v (converted with PyLong_AsLong())
   for the conversion character type into buf, which holds buflen
   Py_UNICODE slots.  Returns the number of characters written, or -1 with
   an exception set. */
static int
formatint(Py_UNICODE *buf,
          size_t buflen,
          int flags,
          int prec,
          int type,
          PyObject *v)
{
    /* fmt = '%#.' + `prec` + 'l' + `type`
     * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
     *                       + 1 + 1
     *                   = 24
     */
    char fmt[64];       /* plenty big enough! */
    char *sign = "";
    int based;          /* non-zero for the 'x', 'X' and 'o' conversions */
    long x;

    x = PyLong_AsLong(v);
    if (x == -1 && PyErr_Occurred())
        return -1;

    /* A negative value under %u is formatted as %d. */
    if (type == 'u' && x < 0)
        type = 'd';

    /* For the based conversions a negative value gets an explicit '-'
       in the format and its negation is what is passed to the
       formatter. */
    based = (type == 'x' || type == 'X' || type == 'o');
    if (based && x < 0)
        sign = "-";

    if (prec < 0)
        prec = 1;

    /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
     * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
     */
    if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
        PyErr_SetString(PyExc_OverflowError,
                        "formatted integer is too long (precision too large?)");
        return -1;
    }

    if (based && (flags & F_ALT)) {
        /* Under %#o, %#x and %#X the C library cannot be relied upon:
         *  - for %#o a different base marker than C's is wanted;
         *  - the C standard drops the '0x'/'0X' when converting 0,
         *    which is inconsistent with the other %#x/%#X results and
         *    with Python's hex();
         *  - some platforms violate the standard and do emit the prefix
         *    for 0 (Metrowerks, Compaq Tru64);
         *  - some platforms emit '0x' under %#X yet treat 0 per the
         *    standard (OS/2 EMX).
         *
         * Consistency is obtained by writing our own prefix and using
         * the plain conversion in place of the '#' form.
         *
         * The same approach is used by formatint() in stringobject.c.
         */
        PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
                      sign, type, prec, type);
    }
    else {
        PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
                      sign, (flags&F_ALT) ? "#" : "",
                      prec, type);
    }

    return longtounicode(buf, buflen, fmt, sign[0] ? -x : x);
}
#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008717
8718static int
8719formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008720 size_t buflen,
8721 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008722{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008723 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008724 if (PyUnicode_Check(v)) {
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008725 if (PyUnicode_GET_SIZE(v) == 1) {
8726 buf[0] = PyUnicode_AS_UNICODE(v)[0];
8727 buf[1] = '\0';
8728 return 1;
8729 }
8730#ifndef Py_UNICODE_WIDE
8731 if (PyUnicode_GET_SIZE(v) == 2) {
8732 /* Decode a valid surrogate pair */
8733 int c0 = PyUnicode_AS_UNICODE(v)[0];
8734 int c1 = PyUnicode_AS_UNICODE(v)[1];
8735 if (0xD800 <= c0 && c0 <= 0xDBFF &&
8736 0xDC00 <= c1 && c1 <= 0xDFFF) {
8737 buf[0] = c0;
8738 buf[1] = c1;
8739 buf[2] = '\0';
8740 return 2;
8741 }
8742 }
8743#endif
8744 goto onError;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008745 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008746 else {
8747 /* Integer input truncated to a character */
8748 long x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008749 x = PyLong_AsLong(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008750 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008751 goto onError;
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008752
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008753 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008754 PyErr_SetString(PyExc_OverflowError,
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008755 "%c arg not in range(0x110000)");
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008756 return -1;
8757 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008758
8759#ifndef Py_UNICODE_WIDE
8760 if (x > 0xffff) {
8761 x -= 0x10000;
8762 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
8763 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
8764 return 2;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008765 }
8766#endif
8767 buf[0] = (Py_UNICODE) x;
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008768 buf[1] = '\0';
8769 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008770 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008771
8772 onError:
8773 PyErr_SetString(PyExc_TypeError,
8774 "%c requires int or char");
8775 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008776}
8777
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008778/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8779
8780 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
8781 chars are formatted. XXX This is a magic number. Each formatting
8782 routine does bounds checking to ensure no overflow, but a better
8783 solution may be to malloc a buffer of appropriate size for each
8784 format. For now, the current solution is sufficient.
8785*/
8786#define FORMATBUFLEN (size_t)120
8787
Guido van Rossumd57fd912000-03-10 22:53:23 +00008788PyObject *PyUnicode_Format(PyObject *format,
8789 PyObject *args)
8790{
8791 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008792 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008793 int args_owned = 0;
8794 PyUnicodeObject *result = NULL;
8795 PyObject *dict = NULL;
8796 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008797
Guido van Rossumd57fd912000-03-10 22:53:23 +00008798 if (format == NULL || args == NULL) {
8799 PyErr_BadInternalCall();
8800 return NULL;
8801 }
8802 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008803 if (uformat == NULL)
8804 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008805 fmt = PyUnicode_AS_UNICODE(uformat);
8806 fmtcnt = PyUnicode_GET_SIZE(uformat);
8807
8808 reslen = rescnt = fmtcnt + 100;
8809 result = _PyUnicode_New(reslen);
8810 if (result == NULL)
8811 goto onError;
8812 res = PyUnicode_AS_UNICODE(result);
8813
8814 if (PyTuple_Check(args)) {
8815 arglen = PyTuple_Size(args);
8816 argidx = 0;
8817 }
8818 else {
8819 arglen = -1;
8820 argidx = -2;
8821 }
Christian Heimes90aa7642007-12-19 02:45:37 +00008822 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00008823 !PyUnicode_Check(args))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008824 dict = args;
8825
8826 while (--fmtcnt >= 0) {
8827 if (*fmt != '%') {
8828 if (--rescnt < 0) {
8829 rescnt = fmtcnt + 100;
8830 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008831 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008832 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008833 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8834 --rescnt;
8835 }
8836 *res++ = *fmt++;
8837 }
8838 else {
8839 /* Got a format specifier */
8840 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008841 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008842 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008843 Py_UNICODE c = '\0';
8844 Py_UNICODE fill;
Christian Heimesa612dc02008-02-24 13:08:18 +00008845 int isnumok;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008846 PyObject *v = NULL;
8847 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008848 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008849 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008850 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008851 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008852
8853 fmt++;
8854 if (*fmt == '(') {
8855 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008856 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008857 PyObject *key;
8858 int pcount = 1;
8859
8860 if (dict == NULL) {
8861 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008862 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008863 goto onError;
8864 }
8865 ++fmt;
8866 --fmtcnt;
8867 keystart = fmt;
8868 /* Skip over balanced parentheses */
8869 while (pcount > 0 && --fmtcnt >= 0) {
8870 if (*fmt == ')')
8871 --pcount;
8872 else if (*fmt == '(')
8873 ++pcount;
8874 fmt++;
8875 }
8876 keylen = fmt - keystart - 1;
8877 if (fmtcnt < 0 || pcount > 0) {
8878 PyErr_SetString(PyExc_ValueError,
8879 "incomplete format key");
8880 goto onError;
8881 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008882#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008883 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008884 then looked up since Python uses strings to hold
8885 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008886 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008887 key = PyUnicode_EncodeUTF8(keystart,
8888 keylen,
8889 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008890#else
8891 key = PyUnicode_FromUnicode(keystart, keylen);
8892#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008893 if (key == NULL)
8894 goto onError;
8895 if (args_owned) {
8896 Py_DECREF(args);
8897 args_owned = 0;
8898 }
8899 args = PyObject_GetItem(dict, key);
8900 Py_DECREF(key);
8901 if (args == NULL) {
8902 goto onError;
8903 }
8904 args_owned = 1;
8905 arglen = -1;
8906 argidx = -2;
8907 }
8908 while (--fmtcnt >= 0) {
8909 switch (c = *fmt++) {
8910 case '-': flags |= F_LJUST; continue;
8911 case '+': flags |= F_SIGN; continue;
8912 case ' ': flags |= F_BLANK; continue;
8913 case '#': flags |= F_ALT; continue;
8914 case '0': flags |= F_ZERO; continue;
8915 }
8916 break;
8917 }
8918 if (c == '*') {
8919 v = getnextarg(args, arglen, &argidx);
8920 if (v == NULL)
8921 goto onError;
Christian Heimes217cfd12007-12-02 14:31:20 +00008922 if (!PyLong_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008923 PyErr_SetString(PyExc_TypeError,
8924 "* wants int");
8925 goto onError;
8926 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008927 width = PyLong_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008928 if (width == -1 && PyErr_Occurred())
8929 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008930 if (width < 0) {
8931 flags |= F_LJUST;
8932 width = -width;
8933 }
8934 if (--fmtcnt >= 0)
8935 c = *fmt++;
8936 }
8937 else if (c >= '0' && c <= '9') {
8938 width = c - '0';
8939 while (--fmtcnt >= 0) {
8940 c = *fmt++;
8941 if (c < '0' || c > '9')
8942 break;
8943 if ((width*10) / 10 != width) {
8944 PyErr_SetString(PyExc_ValueError,
8945 "width too big");
8946 goto onError;
8947 }
8948 width = width*10 + (c - '0');
8949 }
8950 }
8951 if (c == '.') {
8952 prec = 0;
8953 if (--fmtcnt >= 0)
8954 c = *fmt++;
8955 if (c == '*') {
8956 v = getnextarg(args, arglen, &argidx);
8957 if (v == NULL)
8958 goto onError;
Christian Heimes217cfd12007-12-02 14:31:20 +00008959 if (!PyLong_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008960 PyErr_SetString(PyExc_TypeError,
8961 "* wants int");
8962 goto onError;
8963 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008964 prec = PyLong_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008965 if (prec == -1 && PyErr_Occurred())
8966 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008967 if (prec < 0)
8968 prec = 0;
8969 if (--fmtcnt >= 0)
8970 c = *fmt++;
8971 }
8972 else if (c >= '0' && c <= '9') {
8973 prec = c - '0';
8974 while (--fmtcnt >= 0) {
8975 c = Py_CHARMASK(*fmt++);
8976 if (c < '0' || c > '9')
8977 break;
8978 if ((prec*10) / 10 != prec) {
8979 PyErr_SetString(PyExc_ValueError,
8980 "prec too big");
8981 goto onError;
8982 }
8983 prec = prec*10 + (c - '0');
8984 }
8985 }
8986 } /* prec */
8987 if (fmtcnt >= 0) {
8988 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008989 if (--fmtcnt >= 0)
8990 c = *fmt++;
8991 }
8992 }
8993 if (fmtcnt < 0) {
8994 PyErr_SetString(PyExc_ValueError,
8995 "incomplete format");
8996 goto onError;
8997 }
8998 if (c != '%') {
8999 v = getnextarg(args, arglen, &argidx);
9000 if (v == NULL)
9001 goto onError;
9002 }
9003 sign = 0;
9004 fill = ' ';
9005 switch (c) {
9006
9007 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009008 pbuf = formatbuf;
9009 /* presume that buffer length is at least 1 */
9010 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00009011 len = 1;
9012 break;
9013
9014 case 's':
9015 case 'r':
Georg Brandl559e5d72008-06-11 18:37:52 +00009016 case 'a':
Guido van Rossumd57fd912000-03-10 22:53:23 +00009017 if (PyUnicode_Check(v) && c == 's') {
9018 temp = v;
9019 Py_INCREF(temp);
9020 }
9021 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009022 if (c == 's')
Thomas Heller519a0422007-11-15 20:48:54 +00009023 temp = PyObject_Str(v);
Georg Brandl559e5d72008-06-11 18:37:52 +00009024 else if (c == 'r')
Guido van Rossumd57fd912000-03-10 22:53:23 +00009025 temp = PyObject_Repr(v);
Georg Brandl559e5d72008-06-11 18:37:52 +00009026 else
9027 temp = PyObject_ASCII(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009028 if (temp == NULL)
9029 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00009030 if (PyUnicode_Check(temp))
9031 /* nothing to do */;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00009032 else {
9033 Py_DECREF(temp);
9034 PyErr_SetString(PyExc_TypeError,
9035 "%s argument has non-string str()");
9036 goto onError;
9037 }
9038 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009039 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009040 len = PyUnicode_GET_SIZE(temp);
9041 if (prec >= 0 && len > prec)
9042 len = prec;
9043 break;
9044
9045 case 'i':
9046 case 'd':
9047 case 'u':
9048 case 'o':
9049 case 'x':
9050 case 'X':
9051 if (c == 'i')
9052 c = 'd';
Christian Heimesa612dc02008-02-24 13:08:18 +00009053 isnumok = 0;
9054 if (PyNumber_Check(v)) {
9055 PyObject *iobj=NULL;
9056
9057 if (PyLong_Check(v)) {
9058 iobj = v;
9059 Py_INCREF(iobj);
9060 }
9061 else {
9062 iobj = PyNumber_Long(v);
9063 }
9064 if (iobj!=NULL) {
9065 if (PyLong_Check(iobj)) {
9066 isnumok = 1;
9067 temp = formatlong(iobj, flags, prec, c);
9068 Py_DECREF(iobj);
9069 if (!temp)
9070 goto onError;
9071 pbuf = PyUnicode_AS_UNICODE(temp);
9072 len = PyUnicode_GET_SIZE(temp);
9073 sign = 1;
9074 }
9075 else {
9076 Py_DECREF(iobj);
9077 }
9078 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009079 }
Christian Heimesa612dc02008-02-24 13:08:18 +00009080 if (!isnumok) {
9081 PyErr_Format(PyExc_TypeError,
9082 "%%%c format: a number is required, "
Martin v. Löwis5a6f4582008-04-07 03:22:07 +00009083 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
Tim Peters38fd5b62000-09-21 05:43:11 +00009084 goto onError;
Tim Peters38fd5b62000-09-21 05:43:11 +00009085 }
9086 if (flags & F_ZERO)
9087 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00009088 break;
9089
9090 case 'e':
9091 case 'E':
9092 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00009093 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00009094 case 'g':
9095 case 'G':
Eric Smith22b85b32008-07-17 19:18:29 +00009096 if (c == 'F')
9097 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009098 pbuf = formatbuf;
9099 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
9100 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009101 if (len < 0)
9102 goto onError;
9103 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00009104 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009105 fill = '0';
9106 break;
9107
9108 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009109 pbuf = formatbuf;
9110 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009111 if (len < 0)
9112 goto onError;
9113 break;
9114
9115 default:
9116 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00009117 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00009118 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00009119 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00009120 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00009121 (Py_ssize_t)(fmt - 1 -
9122 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00009123 goto onError;
9124 }
9125 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009126 if (*pbuf == '-' || *pbuf == '+') {
9127 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009128 len--;
9129 }
9130 else if (flags & F_SIGN)
9131 sign = '+';
9132 else if (flags & F_BLANK)
9133 sign = ' ';
9134 else
9135 sign = 0;
9136 }
9137 if (width < len)
9138 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00009139 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009140 reslen -= rescnt;
9141 rescnt = width + fmtcnt + 100;
9142 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00009143 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00009144 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00009145 PyErr_NoMemory();
9146 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00009147 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00009148 if (_PyUnicode_Resize(&result, reslen) < 0) {
9149 Py_XDECREF(temp);
9150 goto onError;
9151 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009152 res = PyUnicode_AS_UNICODE(result)
9153 + reslen - rescnt;
9154 }
9155 if (sign) {
9156 if (fill != ' ')
9157 *res++ = sign;
9158 rescnt--;
9159 if (width > len)
9160 width--;
9161 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00009162 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00009163 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00009164 assert(pbuf[1] == c);
9165 if (fill != ' ') {
9166 *res++ = *pbuf++;
9167 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00009168 }
Tim Petersfff53252001-04-12 18:38:48 +00009169 rescnt -= 2;
9170 width -= 2;
9171 if (width < 0)
9172 width = 0;
9173 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00009174 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009175 if (width > len && !(flags & F_LJUST)) {
9176 do {
9177 --rescnt;
9178 *res++ = fill;
9179 } while (--width > len);
9180 }
Tim Peters38fd5b62000-09-21 05:43:11 +00009181 if (fill == ' ') {
9182 if (sign)
9183 *res++ = sign;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00009184 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00009185 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00009186 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00009187 *res++ = *pbuf++;
9188 *res++ = *pbuf++;
9189 }
9190 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009191 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009192 res += len;
9193 rescnt -= len;
9194 while (--width >= len) {
9195 --rescnt;
9196 *res++ = ' ';
9197 }
9198 if (dict && (argidx < arglen) && c != '%') {
9199 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00009200 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009201 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009202 goto onError;
9203 }
9204 Py_XDECREF(temp);
9205 } /* '%' */
9206 } /* until end */
9207 if (argidx < arglen && !dict) {
9208 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00009209 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009210 goto onError;
9211 }
9212
Thomas Woutersa96affe2006-03-12 00:29:36 +00009213 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
9214 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009215 if (args_owned) {
9216 Py_DECREF(args);
9217 }
9218 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009219 return (PyObject *)result;
9220
9221 onError:
9222 Py_XDECREF(result);
9223 Py_DECREF(uformat);
9224 if (args_owned) {
9225 Py_DECREF(args);
9226 }
9227 return NULL;
9228}
9229
Jeremy Hylton938ace62002-07-17 16:30:39 +00009230static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009231unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9232
Tim Peters6d6c1a32001-08-02 04:15:00 +00009233static PyObject *
9234unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9235{
9236 PyObject *x = NULL;
Guido van Rossum55b4a7b2007-07-11 09:28:11 +00009237 static char *kwlist[] = {"object", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00009238 char *encoding = NULL;
9239 char *errors = NULL;
9240
Guido van Rossume023fe02001-08-30 03:12:59 +00009241 if (type != &PyUnicode_Type)
9242 return unicode_subtype_new(type, args, kwds);
Alexandre Vassalotti999679a2008-05-03 04:42:16 +00009243 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Tim Peters6d6c1a32001-08-02 04:15:00 +00009244 kwlist, &x, &encoding, &errors))
9245 return NULL;
9246 if (x == NULL)
9247 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00009248 if (encoding == NULL && errors == NULL)
Thomas Heller519a0422007-11-15 20:48:54 +00009249 return PyObject_Str(x);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00009250 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00009251 return PyUnicode_FromEncodedObject(x, encoding, errors);
9252}
9253
Guido van Rossume023fe02001-08-30 03:12:59 +00009254static PyObject *
9255unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9256{
Tim Petersaf90b3e2001-09-12 05:18:58 +00009257 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009258 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009259
9260 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9261 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9262 if (tmp == NULL)
9263 return NULL;
9264 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00009265 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00009266 if (pnew == NULL) {
9267 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00009268 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00009269 }
Christian Heimesb186d002008-03-18 15:15:01 +00009270 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
Tim Petersaf90b3e2001-09-12 05:18:58 +00009271 if (pnew->str == NULL) {
9272 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00009273 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00009274 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00009275 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00009276 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00009277 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9278 pnew->length = n;
9279 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00009280 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00009281 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009282}
9283
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009284PyDoc_STRVAR(unicode_doc,
Georg Brandl17cb8a82008-05-30 08:20:09 +00009285"str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009286\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009287Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009288encoding defaults to the current default string encoding.\n\
9289errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009290
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009291static PyObject *unicode_iter(PyObject *seq);
9292
Guido van Rossumd57fd912000-03-10 22:53:23 +00009293PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009294 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossum84fc66d2007-05-03 17:18:26 +00009295 "str", /* tp_name */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009296 sizeof(PyUnicodeObject), /* tp_size */
9297 0, /* tp_itemsize */
9298 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00009299 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009300 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009301 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009302 0, /* tp_setattr */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009303 0, /* tp_compare */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009304 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009305 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009306 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009307 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009308 (hashfunc) unicode_hash, /* tp_hash*/
9309 0, /* tp_call*/
9310 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009311 PyObject_GenericGetAttr, /* tp_getattro */
9312 0, /* tp_setattro */
Alexandre Vassalotti70a23712007-10-14 02:05:51 +00009313 0, /* tp_as_buffer */
Thomas Wouters27d517b2007-02-25 20:39:11 +00009314 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
9315 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009316 unicode_doc, /* tp_doc */
9317 0, /* tp_traverse */
9318 0, /* tp_clear */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009319 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009320 0, /* tp_weaklistoffset */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009321 unicode_iter, /* tp_iter */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009322 0, /* tp_iternext */
9323 unicode_methods, /* tp_methods */
9324 0, /* tp_members */
9325 0, /* tp_getset */
Guido van Rossum3172c5d2007-10-16 18:12:55 +00009326 &PyBaseObject_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009327 0, /* tp_dict */
9328 0, /* tp_descr_get */
9329 0, /* tp_descr_set */
9330 0, /* tp_dictoffset */
9331 0, /* tp_init */
9332 0, /* tp_alloc */
9333 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00009334 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009335};
9336
9337/* Initialize the Unicode implementation */
9338
Thomas Wouters78890102000-07-22 19:25:51 +00009339void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009340{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009341 int i;
9342
Thomas Wouters477c8d52006-05-27 19:21:47 +00009343 /* XXX - move this array to unicodectype.c ? */
9344 Py_UNICODE linebreak[] = {
9345 0x000A, /* LINE FEED */
9346 0x000D, /* CARRIAGE RETURN */
9347 0x001C, /* FILE SEPARATOR */
9348 0x001D, /* GROUP SEPARATOR */
9349 0x001E, /* RECORD SEPARATOR */
9350 0x0085, /* NEXT LINE */
9351 0x2028, /* LINE SEPARATOR */
9352 0x2029, /* PARAGRAPH SEPARATOR */
9353 };
9354
Fred Drakee4315f52000-05-09 19:53:39 +00009355 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009356 free_list = NULL;
9357 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009358 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009359 if (!unicode_empty)
9360 return;
9361
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009362 for (i = 0; i < 256; i++)
9363 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009364 if (PyType_Ready(&PyUnicode_Type) < 0)
9365 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009366
9367 /* initialize the linebreak bloom filter */
9368 bloom_linebreak = make_bloom_mask(
9369 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9370 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009371
9372 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009373}
9374
9375/* Finalize the Unicode implementation */
9376
Christian Heimesa156e092008-02-16 07:38:31 +00009377int
9378PyUnicode_ClearFreeList(void)
9379{
9380 int freelist_size = numfree;
9381 PyUnicodeObject *u;
9382
9383 for (u = free_list; u != NULL;) {
9384 PyUnicodeObject *v = u;
9385 u = *(PyUnicodeObject **)u;
9386 if (v->str)
Christian Heimesb186d002008-03-18 15:15:01 +00009387 PyObject_DEL(v->str);
Christian Heimesa156e092008-02-16 07:38:31 +00009388 Py_XDECREF(v->defenc);
9389 PyObject_Del(v);
9390 numfree--;
9391 }
9392 free_list = NULL;
9393 assert(numfree == 0);
9394 return freelist_size;
9395}
9396
Guido van Rossumd57fd912000-03-10 22:53:23 +00009397void
Thomas Wouters78890102000-07-22 19:25:51 +00009398_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009399{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009400 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009401
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009402 Py_XDECREF(unicode_empty);
9403 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009404
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009405 for (i = 0; i < 256; i++) {
9406 if (unicode_latin1[i]) {
9407 Py_DECREF(unicode_latin1[i]);
9408 unicode_latin1[i] = NULL;
9409 }
9410 }
Christian Heimesa156e092008-02-16 07:38:31 +00009411 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009412}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009413
Walter Dörwald16807132007-05-25 13:52:07 +00009414void
9415PyUnicode_InternInPlace(PyObject **p)
9416{
9417 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9418 PyObject *t;
9419 if (s == NULL || !PyUnicode_Check(s))
9420 Py_FatalError(
9421 "PyUnicode_InternInPlace: unicode strings only please!");
9422 /* If it's a subclass, we don't really know what putting
9423 it in the interned dict might do. */
9424 if (!PyUnicode_CheckExact(s))
9425 return;
9426 if (PyUnicode_CHECK_INTERNED(s))
9427 return;
9428 if (interned == NULL) {
9429 interned = PyDict_New();
9430 if (interned == NULL) {
9431 PyErr_Clear(); /* Don't leave an exception */
9432 return;
9433 }
9434 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009435 /* It might be that the GetItem call fails even
9436 though the key is present in the dictionary,
9437 namely when this happens during a stack overflow. */
9438 Py_ALLOW_RECURSION
Walter Dörwald16807132007-05-25 13:52:07 +00009439 t = PyDict_GetItem(interned, (PyObject *)s);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009440 Py_END_ALLOW_RECURSION
9441
Walter Dörwald16807132007-05-25 13:52:07 +00009442 if (t) {
9443 Py_INCREF(t);
9444 Py_DECREF(*p);
9445 *p = t;
9446 return;
9447 }
9448
Martin v. Löwis5b222132007-06-10 09:51:05 +00009449 PyThreadState_GET()->recursion_critical = 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009450 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9451 PyErr_Clear();
Martin v. Löwis5b222132007-06-10 09:51:05 +00009452 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009453 return;
9454 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009455 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009456 /* The two references in interned are not counted by refcnt.
9457 The deallocator will take care of this */
Christian Heimes90aa7642007-12-19 02:45:37 +00009458 Py_REFCNT(s) -= 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009459 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
9460}
9461
9462void
9463PyUnicode_InternImmortal(PyObject **p)
9464{
9465 PyUnicode_InternInPlace(p);
9466 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9467 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9468 Py_INCREF(*p);
9469 }
9470}
9471
9472PyObject *
9473PyUnicode_InternFromString(const char *cp)
9474{
9475 PyObject *s = PyUnicode_FromString(cp);
9476 if (s == NULL)
9477 return NULL;
9478 PyUnicode_InternInPlace(&s);
9479 return s;
9480}
9481
9482void _Py_ReleaseInternedUnicodeStrings(void)
9483{
9484 PyObject *keys;
9485 PyUnicodeObject *s;
9486 Py_ssize_t i, n;
9487 Py_ssize_t immortal_size = 0, mortal_size = 0;
9488
9489 if (interned == NULL || !PyDict_Check(interned))
9490 return;
9491 keys = PyDict_Keys(interned);
9492 if (keys == NULL || !PyList_Check(keys)) {
9493 PyErr_Clear();
9494 return;
9495 }
9496
9497 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9498 detector, interned unicode strings are not forcibly deallocated;
9499 rather, we give them their stolen references back, and then clear
9500 and DECREF the interned dict. */
9501
9502 n = PyList_GET_SIZE(keys);
9503 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
9504 n);
9505 for (i = 0; i < n; i++) {
9506 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9507 switch (s->state) {
9508 case SSTATE_NOT_INTERNED:
9509 /* XXX Shouldn't happen */
9510 break;
9511 case SSTATE_INTERNED_IMMORTAL:
Christian Heimes90aa7642007-12-19 02:45:37 +00009512 Py_REFCNT(s) += 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009513 immortal_size += s->length;
9514 break;
9515 case SSTATE_INTERNED_MORTAL:
Christian Heimes90aa7642007-12-19 02:45:37 +00009516 Py_REFCNT(s) += 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009517 mortal_size += s->length;
9518 break;
9519 default:
9520 Py_FatalError("Inconsistent interned string state.");
9521 }
9522 s->state = SSTATE_NOT_INTERNED;
9523 }
9524 fprintf(stderr, "total size of all interned strings: "
9525 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9526 "mortal/immortal\n", mortal_size, immortal_size);
9527 Py_DECREF(keys);
9528 PyDict_Clear(interned);
9529 Py_DECREF(interned);
9530 interned = NULL;
9531}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009532
9533
9534/********************* Unicode Iterator **************************/
9535
9536typedef struct {
9537 PyObject_HEAD
Guido van Rossum49d6b072006-08-17 21:11:47 +00009538 Py_ssize_t it_index;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009539 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
9540} unicodeiterobject;
9541
9542static void
9543unicodeiter_dealloc(unicodeiterobject *it)
9544{
9545 _PyObject_GC_UNTRACK(it);
9546 Py_XDECREF(it->it_seq);
9547 PyObject_GC_Del(it);
9548}
9549
9550static int
9551unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9552{
9553 Py_VISIT(it->it_seq);
9554 return 0;
9555}
9556
9557static PyObject *
9558unicodeiter_next(unicodeiterobject *it)
9559{
9560 PyUnicodeObject *seq;
9561 PyObject *item;
9562
9563 assert(it != NULL);
9564 seq = it->it_seq;
9565 if (seq == NULL)
9566 return NULL;
9567 assert(PyUnicode_Check(seq));
9568
9569 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009570 item = PyUnicode_FromUnicode(
9571 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009572 if (item != NULL)
9573 ++it->it_index;
9574 return item;
9575 }
9576
9577 Py_DECREF(seq);
9578 it->it_seq = NULL;
9579 return NULL;
9580}
9581
9582static PyObject *
9583unicodeiter_len(unicodeiterobject *it)
9584{
9585 Py_ssize_t len = 0;
9586 if (it->it_seq)
9587 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
Christian Heimes217cfd12007-12-02 14:31:20 +00009588 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009589}
9590
9591PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9592
9593static PyMethodDef unicodeiter_methods[] = {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009594 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
9595 length_hint_doc},
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009596 {NULL, NULL} /* sentinel */
9597};
9598
9599PyTypeObject PyUnicodeIter_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009600 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Christian Heimesf83be4e2007-11-28 09:44:38 +00009601 "str_iterator", /* tp_name */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009602 sizeof(unicodeiterobject), /* tp_basicsize */
9603 0, /* tp_itemsize */
9604 /* methods */
Guido van Rossum49d6b072006-08-17 21:11:47 +00009605 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009606 0, /* tp_print */
9607 0, /* tp_getattr */
9608 0, /* tp_setattr */
9609 0, /* tp_compare */
9610 0, /* tp_repr */
9611 0, /* tp_as_number */
9612 0, /* tp_as_sequence */
9613 0, /* tp_as_mapping */
9614 0, /* tp_hash */
9615 0, /* tp_call */
9616 0, /* tp_str */
9617 PyObject_GenericGetAttr, /* tp_getattro */
9618 0, /* tp_setattro */
9619 0, /* tp_as_buffer */
9620 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9621 0, /* tp_doc */
9622 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9623 0, /* tp_clear */
9624 0, /* tp_richcompare */
9625 0, /* tp_weaklistoffset */
9626 PyObject_SelfIter, /* tp_iter */
9627 (iternextfunc)unicodeiter_next, /* tp_iternext */
9628 unicodeiter_methods, /* tp_methods */
9629 0,
9630};
9631
9632static PyObject *
9633unicode_iter(PyObject *seq)
9634{
9635 unicodeiterobject *it;
9636
9637 if (!PyUnicode_Check(seq)) {
9638 PyErr_BadInternalCall();
9639 return NULL;
9640 }
9641 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9642 if (it == NULL)
9643 return NULL;
9644 it->it_index = 0;
9645 Py_INCREF(seq);
9646 it->it_seq = (PyUnicodeObject *)seq;
9647 _PyObject_GC_TRACK(it);
9648 return (PyObject *)it;
9649}
9650
Martin v. Löwis5b222132007-06-10 09:51:05 +00009651size_t
9652Py_UNICODE_strlen(const Py_UNICODE *u)
9653{
9654 int res = 0;
9655 while(*u++)
9656 res++;
9657 return res;
9658}
9659
9660Py_UNICODE*
9661Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9662{
9663 Py_UNICODE *u = s1;
9664 while ((*u++ = *s2++));
9665 return s1;
9666}
9667
9668Py_UNICODE*
9669Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9670{
9671 Py_UNICODE *u = s1;
9672 while ((*u++ = *s2++))
9673 if (n-- == 0)
9674 break;
9675 return s1;
9676}
9677
9678int
9679Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9680{
9681 while (*s1 && *s2 && *s1 == *s2)
9682 s1++, s2++;
9683 if (*s1 && *s2)
9684 return (*s1 < *s2) ? -1 : +1;
9685 if (*s1)
9686 return 1;
9687 if (*s2)
9688 return -1;
9689 return 0;
9690}
9691
9692Py_UNICODE*
9693Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9694{
9695 const Py_UNICODE *p;
9696 for (p = s; *p; p++)
9697 if (*p == c)
9698 return (Py_UNICODE*)p;
9699 return NULL;
9700}
9701
9702
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009703#ifdef __cplusplus
9704}
9705#endif
9706
9707
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009708/*
9709Local variables:
9710c-basic-offset: 4
9711indent-tabs-mode: nil
9712End:
9713*/