blob: 897e390a3ad120e870fa180de15cf26980998afa [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000049#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000050#include <windows.h>
51#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000052
/* Maximum number of deallocated Unicode objects parked on the free list */

#define PyUnicode_MAXFREELIST 1024

/* Keep-alive threshold for the Unicode object free list.

   An object that goes onto the free list keeps its character buffer
   allocated as long as its length is below this limit.  This saves
   malloc() overhead for small Unicode objects.

   In the worst case this leaves PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   A limit of 0 effectively switches the feature off.

   Note: this is an experimental feature!  If Unicode objects cause
   core dumps, turn it off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Byte order selection; little endian unless WORDS_BIGENDIAN is defined */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Walter Dörwald16807132007-05-25 13:52:07 +000096/* This dictionary holds all interned unicode strings. Note that references
97 to strings in this dictionary are *not* counted in the string's ob_refcnt.
98 When the interned string reaches a refcnt of 0 the string deallocation
99 function will delete the reference from this dictionary.
100
101 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000102 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000103*/
104static PyObject *interned;
105
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000107static PyUnicodeObject *free_list;
108static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000109
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000110/* The empty Unicode object is shared to improve performance. */
111static PyUnicodeObject *unicode_empty;
112
113/* Single character Unicode strings in the Latin-1 range are being
114 shared as well. */
115static PyUnicodeObject *unicode_latin1[256];
116
Fred Drakee4315f52000-05-09 19:53:39 +0000117/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000118 parameter; it is fixed to "utf-8". Always use the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000119 PyUnicode_GetDefaultEncoding() API to access this global.
120
Alexandre Vassalotti3d2fd7f2007-10-16 00:26:33 +0000121 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000122 hard coded default!
123*/
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000124static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000125
/* Lookup table for fast detection of the most frequent whitespace
   characters: a non-zero entry at index c marks ASCII character c as
   whitespace. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
     *   0x09 HORIZONTAL TABULATION
     *   0x0A LINE FEED
     *   0x0B VERTICAL TABULATION
     *   0x0C FORM FEED
     *   0x0D CARRIAGE RETURN
     */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
     *   0x1C FILE SEPARATOR
     *   0x1D GROUP SEPARATOR
     *   0x1E RECORD SEPARATOR
     *   0x1F UNIT SEPARATOR
     */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27
     *   0x20 SPACE
     */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
156
/* Lookup table for fast detection of ASCII line break characters: a
   non-zero entry at index c marks ASCII character c as a line break.
   The table is only ever read, so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
     *   0x0A LINE FEED
     *   0x0D CARRIAGE RETURN
     */
    0, 0, 1, 0, 0, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
     *   0x1C FILE SEPARATOR
     *   0x1D GROUP SEPARATOR
     *   0x1E RECORD SEPARATOR
     */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
182
183
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000184Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000185PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000186{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000187#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000188 return 0x10FFFF;
189#else
190 /* This is actually an illegal character, so it should
191 not be passed to unichr. */
192 return 0xFFFF;
193#endif
194}
195
/* --- Bloom Filters ----------------------------------------------------- */

/* Simple "bloom filters" for Unicode characters.  To keep things
   simple, a single bitmask is used, with the least significant 5 bits
   of each unicode character selecting the bit index. */

/* the linebreak mask is set up by Unicode_Init below */

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero when the bit selected by ch is set in mask.

   The shifted constant is an unsigned long: bit index 31 is reachable,
   and "1 << 31" on a plain int shifts into the sign bit, which is
   undefined behaviour.  mask is parenthesized so that an expression
   argument (e.g. "a | b") is tested as a whole. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Non-zero when ch is a line break.  Characters below 128 are looked up
   in ascii_linebreak; for all others the bloom mask is consulted first
   and Py_UNICODE_ISLINEBREAK() only runs when the mask matches. */
#define BLOOM_LINEBREAK(ch) \
    ((ch) < 128U ? ascii_linebreak[(ch)] : \
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000213
214Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
215{
216 /* calculate simple bloom-style bitmask for a given unicode string */
217
218 long mask;
219 Py_ssize_t i;
220
221 mask = 0;
222 for (i = 0; i < len; i++)
223 mask |= (1 << (ptr[i] & 0x1F));
224
225 return mask;
226}
227
228Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
229{
230 Py_ssize_t i;
231
232 for (i = 0; i < setlen; i++)
233 if (set[i] == chr)
234 return 1;
235
236 return 0;
237}
238
/* True when chr is a member of set (setlen characters).  The bloom mask
   is tested first, so unicode_member() only runs when the mask matches.
   The whole expansion is parenthesized so that the macro can be negated
   or combined with other operators without precedence surprises. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
241
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242/* --- Unicode Object ----------------------------------------------------- */
243
244static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000245int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000246 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247{
248 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000249
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000250 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000251 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000252 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000254 /* Resizing shared object (unicode_empty or single character
255 objects) in-place is not allowed. Use PyUnicode_Resize()
256 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000257
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000258 if (unicode == unicode_empty ||
259 (unicode->length == 1 &&
260 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000261 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000263 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264 return -1;
265 }
266
Thomas Wouters477c8d52006-05-27 19:21:47 +0000267 /* We allocate one more byte to make sure the string is Ux0000 terminated.
268 The overallocation is also used by fastsearch, which assumes that it's
269 safe to look at str[length] (without making any assumptions about what
270 it contains). */
271
Guido van Rossumd57fd912000-03-10 22:53:23 +0000272 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000273 unicode->str = PyObject_REALLOC(unicode->str,
274 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000275 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000276 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 PyErr_NoMemory();
278 return -1;
279 }
280 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000281 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000282
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000283 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000285 if (unicode->defenc) {
286 Py_DECREF(unicode->defenc);
287 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 }
289 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000290
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291 return 0;
292}
293
294/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000295 Ux0000 terminated; some code (e.g. new_identifier)
296 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297
298 XXX This allocator could further be enhanced by assuring that the
299 free list never reduces its size below 1.
300
301*/
302
303static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000304PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305{
306 register PyUnicodeObject *unicode;
307
Thomas Wouters477c8d52006-05-27 19:21:47 +0000308 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000309 if (length == 0 && unicode_empty != NULL) {
310 Py_INCREF(unicode_empty);
311 return unicode_empty;
312 }
313
314 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000315 if (free_list) {
316 unicode = free_list;
317 free_list = *(PyUnicodeObject **)unicode;
318 numfree--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000319 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000320 /* Keep-Alive optimization: we only upsize the buffer,
321 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000322 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000323 unicode_resize(unicode, length) < 0) {
Christian Heimesb186d002008-03-18 15:15:01 +0000324 PyObject_DEL(unicode->str);
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000325 unicode->str = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000326 }
327 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000328 else {
Christian Heimesb186d002008-03-18 15:15:01 +0000329 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
330 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000331 }
332 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000333 }
334 else {
Christian Heimesb186d002008-03-18 15:15:01 +0000335 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000336 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000337 if (unicode == NULL)
338 return NULL;
Christian Heimesb186d002008-03-18 15:15:01 +0000339 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
340 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000341 }
342
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000343 if (!unicode->str) {
344 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000345 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000346 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000347 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000348 * the caller fails before initializing str -- unicode_resize()
349 * reads str[0], and the Keep-Alive optimization can keep memory
350 * allocated for str alive across a call to unicode_dealloc(unicode).
351 * We don't want unicode_resize to read uninitialized memory in
352 * that case.
353 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000354 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000355 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000356 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000357 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000358 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000359 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000361
362 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000363 /* XXX UNREF/NEWREF interface should be more symmetrical */
364 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000365 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000366 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000367 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000368}
369
370static
Guido van Rossum9475a232001-10-05 20:51:39 +0000371void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000372{
Walter Dörwald16807132007-05-25 13:52:07 +0000373 switch (PyUnicode_CHECK_INTERNED(unicode)) {
374 case SSTATE_NOT_INTERNED:
375 break;
376
377 case SSTATE_INTERNED_MORTAL:
378 /* revive dead object temporarily for DelItem */
Christian Heimes90aa7642007-12-19 02:45:37 +0000379 Py_REFCNT(unicode) = 3;
Walter Dörwald16807132007-05-25 13:52:07 +0000380 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
381 Py_FatalError(
Benjamin Peterson142957c2008-07-04 19:55:29 +0000382 "deletion of interned string failed");
Walter Dörwald16807132007-05-25 13:52:07 +0000383 break;
384
385 case SSTATE_INTERNED_IMMORTAL:
Benjamin Peterson142957c2008-07-04 19:55:29 +0000386 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000387
388 default:
Benjamin Peterson142957c2008-07-04 19:55:29 +0000389 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000390 }
391
Guido van Rossum604ddf82001-12-06 20:03:56 +0000392 if (PyUnicode_CheckExact(unicode) &&
Christian Heimes2202f872008-02-06 14:31:34 +0000393 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000394 /* Keep-Alive optimization */
395 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Christian Heimesb186d002008-03-18 15:15:01 +0000396 PyObject_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000397 unicode->str = NULL;
398 unicode->length = 0;
399 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000400 if (unicode->defenc) {
401 Py_DECREF(unicode->defenc);
402 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000403 }
404 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000405 *(PyUnicodeObject **)unicode = free_list;
406 free_list = unicode;
407 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000408 }
409 else {
Christian Heimesb186d002008-03-18 15:15:01 +0000410 PyObject_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000411 Py_XDECREF(unicode->defenc);
Christian Heimes90aa7642007-12-19 02:45:37 +0000412 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000413 }
414}
415
Martin v. Löwis18e16552006-02-15 17:27:45 +0000416int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000417{
418 register PyUnicodeObject *v;
419
420 /* Argument checks */
421 if (unicode == NULL) {
422 PyErr_BadInternalCall();
423 return -1;
424 }
425 v = (PyUnicodeObject *)*unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000426 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000427 PyErr_BadInternalCall();
428 return -1;
429 }
430
431 /* Resizing unicode_empty and single character objects is not
432 possible since these are being shared. We simply return a fresh
433 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000434 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000435 (v == unicode_empty || v->length == 1)) {
436 PyUnicodeObject *w = _PyUnicode_New(length);
437 if (w == NULL)
438 return -1;
439 Py_UNICODE_COPY(w->str, v->str,
440 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000441 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000442 *unicode = (PyObject *)w;
443 return 0;
444 }
445
446 /* Note that we don't have to modify *unicode for unshared Unicode
447 objects, since we can modify them in-place. */
448 return unicode_resize(v, length);
449}
450
/* Internal API for use in unicodeobject.c only !
   Calls PyUnicode_Resize() with unicodevar cast to PyObject **. */
#define _PyUnicode_Resize(unicodevar, length) \
    PyUnicode_Resize((PyObject **)(unicodevar), length)
454
Guido van Rossumd57fd912000-03-10 22:53:23 +0000455PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000456 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000457{
458 PyUnicodeObject *unicode;
459
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000460 /* If the Unicode data is known at construction time, we can apply
461 some optimizations which share commonly used objects. */
462 if (u != NULL) {
463
464 /* Optimization for empty strings */
465 if (size == 0 && unicode_empty != NULL) {
466 Py_INCREF(unicode_empty);
467 return (PyObject *)unicode_empty;
468 }
469
470 /* Single character Unicode objects in the Latin-1 range are
471 shared when using this constructor */
472 if (size == 1 && *u < 256) {
473 unicode = unicode_latin1[*u];
474 if (!unicode) {
475 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000476 if (!unicode)
477 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000478 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000479 unicode_latin1[*u] = unicode;
480 }
481 Py_INCREF(unicode);
482 return (PyObject *)unicode;
483 }
484 }
Tim Petersced69f82003-09-16 20:30:58 +0000485
Guido van Rossumd57fd912000-03-10 22:53:23 +0000486 unicode = _PyUnicode_New(size);
487 if (!unicode)
488 return NULL;
489
490 /* Copy the Unicode data into the new object */
491 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000492 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000493
494 return (PyObject *)unicode;
495}
496
Walter Dörwaldd2034312007-05-18 16:29:38 +0000497PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000498{
499 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000500
501 if (size < 0) {
502 PyErr_SetString(PyExc_SystemError,
503 "Negative size passed to PyUnicode_FromStringAndSize");
504 return NULL;
505 }
506
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000507 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000508 some optimizations which share commonly used objects.
509 Also, this means the input must be UTF-8, so fall back to the
510 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000511 if (u != NULL) {
512
513 /* Optimization for empty strings */
514 if (size == 0 && unicode_empty != NULL) {
515 Py_INCREF(unicode_empty);
516 return (PyObject *)unicode_empty;
517 }
518
Martin v. Löwis9c121062007-08-05 20:26:11 +0000519 /* Single characters are shared when using this constructor.
520 Restrict to ASCII, since the input must be UTF-8. */
521 if (size == 1 && Py_CHARMASK(*u) < 128) {
Christian Heimesbbe741d2008-03-28 10:53:29 +0000522 unicode = unicode_latin1[Py_CHARMASK(*u)];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000523 if (!unicode) {
524 unicode = _PyUnicode_New(1);
525 if (!unicode)
526 return NULL;
Guido van Rossum00058aa2007-07-19 18:21:28 +0000527 unicode->str[0] = Py_CHARMASK(*u);
Christian Heimesbbe741d2008-03-28 10:53:29 +0000528 unicode_latin1[Py_CHARMASK(*u)] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000529 }
530 Py_INCREF(unicode);
531 return (PyObject *)unicode;
532 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000533
534 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000535 }
536
Walter Dörwald55507312007-05-18 13:12:10 +0000537 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000538 if (!unicode)
539 return NULL;
540
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000541 return (PyObject *)unicode;
542}
543
Walter Dörwaldd2034312007-05-18 16:29:38 +0000544PyObject *PyUnicode_FromString(const char *u)
545{
546 size_t size = strlen(u);
547 if (size > PY_SSIZE_T_MAX) {
548 PyErr_SetString(PyExc_OverflowError, "input too long");
549 return NULL;
550 }
551
552 return PyUnicode_FromStringAndSize(u, size);
553}
554
Guido van Rossumd57fd912000-03-10 22:53:23 +0000555#ifdef HAVE_WCHAR_H
556
557PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000558 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000559{
560 PyUnicodeObject *unicode;
561
562 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000563 if (size == 0)
564 return PyUnicode_FromStringAndSize(NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000565 PyErr_BadInternalCall();
566 return NULL;
567 }
568
Martin v. Löwis790465f2008-04-05 20:41:37 +0000569 if (size == -1) {
570 size = wcslen(w);
571 }
572
Guido van Rossumd57fd912000-03-10 22:53:23 +0000573 unicode = _PyUnicode_New(size);
574 if (!unicode)
575 return NULL;
576
577 /* Copy the wchar_t data into the new object */
578#ifdef HAVE_USABLE_WCHAR_T
579 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000580#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000581 {
582 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000583 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000584 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000585 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000586 *u++ = *w++;
587 }
588#endif
589
590 return (PyObject *)unicode;
591}
592
Walter Dörwald346737f2007-05-31 10:44:43 +0000593static void
594makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
595{
596 *fmt++ = '%';
597 if (width) {
598 if (zeropad)
599 *fmt++ = '0';
600 fmt += sprintf(fmt, "%d", width);
601 }
602 if (precision)
603 fmt += sprintf(fmt, ".%d", precision);
604 if (longflag)
605 *fmt++ = 'l';
606 else if (size_tflag) {
607 char *f = PY_FORMAT_SIZE_T;
608 while (*f)
609 *fmt++ = *f++;
610 }
611 *fmt++ = c;
612 *fmt = '\0';
613}
614
/* Append the NUL-terminated string to the output at s, advancing s past
   the copied characters.  Relies on the variables copy and s of the
   scope in which the macro is used. */
#define appendstring(string) {copy = string; while (*copy) *s++ = *copy++;}
616
617PyObject *
618PyUnicode_FromFormatV(const char *format, va_list vargs)
619{
620 va_list count;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000621 Py_ssize_t callcount = 0;
622 PyObject **callresults = NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +0000623 PyObject **callresult = NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000624 Py_ssize_t n = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000625 int width = 0;
626 int precision = 0;
627 int zeropad;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000628 const char* f;
629 Py_UNICODE *s;
630 PyObject *string;
631 /* used by sprintf */
632 char buffer[21];
Walter Dörwald346737f2007-05-31 10:44:43 +0000633 /* use abuffer instead of buffer, if we need more space
634 * (which can happen if there's a format specifier with width). */
635 char *abuffer = NULL;
636 char *realbuffer;
637 Py_ssize_t abuffersize = 0;
638 char fmt[60]; /* should be enough for %0width.precisionld */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000639 const char *copy;
640
641#ifdef VA_LIST_IS_ARRAY
642 Py_MEMCPY(count, vargs, sizeof(va_list));
643#else
644#ifdef __va_copy
645 __va_copy(count, vargs);
646#else
647 count = vargs;
648#endif
649#endif
Georg Brandl559e5d72008-06-11 18:37:52 +0000650 /* step 1: count the number of %S/%R/%A format specifications
651 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII() for
652 * these objects once during step 3 and put the result in
653 an array) */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000654 for (f = format; *f; f++) {
Georg Brandl559e5d72008-06-11 18:37:52 +0000655 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A'))
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000656 ++callcount;
657 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000658 /* step 2: allocate memory for the results of
Thomas Heller519a0422007-11-15 20:48:54 +0000659 * PyObject_Str()/PyObject_Repr() calls */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000660 if (callcount) {
Christian Heimesb186d002008-03-18 15:15:01 +0000661 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000662 if (!callresults) {
663 PyErr_NoMemory();
664 return NULL;
665 }
666 callresult = callresults;
667 }
668 /* step 3: figure out how large a buffer we need */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000669 for (f = format; *f; f++) {
670 if (*f == '%') {
671 const char* p = f;
Walter Dörwald346737f2007-05-31 10:44:43 +0000672 width = 0;
Christian Heimesfe337bf2008-03-23 21:54:12 +0000673 while (ISDIGIT((unsigned)*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000674 width = (width*10) + *f++ - '0';
Christian Heimesfe337bf2008-03-23 21:54:12 +0000675 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
Walter Dörwaldd2034312007-05-18 16:29:38 +0000676 ;
677
678 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
679 * they don't affect the amount of space we reserve.
680 */
681 if ((*f == 'l' || *f == 'z') &&
682 (f[1] == 'd' || f[1] == 'u'))
Eric Smithddd25822007-08-27 11:33:42 +0000683 ++f;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000684
685 switch (*f) {
686 case 'c':
687 (void)va_arg(count, int);
688 /* fall through... */
689 case '%':
690 n++;
691 break;
692 case 'd': case 'u': case 'i': case 'x':
693 (void) va_arg(count, int);
694 /* 20 bytes is enough to hold a 64-bit
695 integer. Decimal takes the most space.
Walter Dörwald346737f2007-05-31 10:44:43 +0000696 This isn't enough for octal.
697 If a width is specified we need more
698 (which we allocate later). */
699 if (width < 20)
700 width = 20;
701 n += width;
702 if (abuffersize < width)
703 abuffersize = width;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000704 break;
705 case 's':
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000706 {
707 /* UTF-8 */
708 unsigned char*s;
709 s = va_arg(count, unsigned char*);
710 while (*s) {
711 if (*s < 128) {
712 n++; s++;
713 } else if (*s < 0xc0) {
714 /* invalid UTF-8 */
715 n++; s++;
716 } else if (*s < 0xc0) {
717 n++;
718 s++; if(!*s)break;
719 s++;
720 } else if (*s < 0xe0) {
721 n++;
722 s++; if(!*s)break;
723 s++; if(!*s)break;
724 s++;
725 } else {
726 #ifdef Py_UNICODE_WIDE
727 n++;
728 #else
729 n+=2;
730 #endif
731 s++; if(!*s)break;
732 s++; if(!*s)break;
733 s++; if(!*s)break;
734 s++;
735 }
736 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000737 break;
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000738 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000739 case 'U':
740 {
741 PyObject *obj = va_arg(count, PyObject *);
742 assert(obj && PyUnicode_Check(obj));
743 n += PyUnicode_GET_SIZE(obj);
744 break;
745 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000746 case 'V':
747 {
748 PyObject *obj = va_arg(count, PyObject *);
749 const char *str = va_arg(count, const char *);
750 assert(obj || str);
751 assert(!obj || PyUnicode_Check(obj));
752 if (obj)
753 n += PyUnicode_GET_SIZE(obj);
754 else
755 n += strlen(str);
756 break;
757 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000758 case 'S':
759 {
760 PyObject *obj = va_arg(count, PyObject *);
761 PyObject *str;
762 assert(obj);
Thomas Heller519a0422007-11-15 20:48:54 +0000763 str = PyObject_Str(obj);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000764 if (!str)
765 goto fail;
766 n += PyUnicode_GET_SIZE(str);
767 /* Remember the str and switch to the next slot */
768 *callresult++ = str;
769 break;
770 }
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000771 case 'R':
772 {
773 PyObject *obj = va_arg(count, PyObject *);
774 PyObject *repr;
775 assert(obj);
776 repr = PyObject_Repr(obj);
777 if (!repr)
778 goto fail;
779 n += PyUnicode_GET_SIZE(repr);
780 /* Remember the repr and switch to the next slot */
781 *callresult++ = repr;
782 break;
783 }
Georg Brandl559e5d72008-06-11 18:37:52 +0000784 case 'A':
785 {
786 PyObject *obj = va_arg(count, PyObject *);
787 PyObject *ascii;
788 assert(obj);
789 ascii = PyObject_ASCII(obj);
790 if (!ascii)
791 goto fail;
792 n += PyUnicode_GET_SIZE(ascii);
793 /* Remember the repr and switch to the next slot */
794 *callresult++ = ascii;
795 break;
796 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000797 case 'p':
798 (void) va_arg(count, int);
799 /* maximum 64-bit pointer representation:
800 * 0xffffffffffffffff
801 * so 19 characters is enough.
802 * XXX I count 18 -- what's the extra for?
803 */
804 n += 19;
805 break;
806 default:
807 /* if we stumble upon an unknown
808 formatting code, copy the rest of
809 the format string to the output
810 string. (we cannot just skip the
811 code, since there's no way to know
812 what's in the argument list) */
813 n += strlen(p);
814 goto expand;
815 }
816 } else
817 n++;
818 }
819 expand:
Walter Dörwald346737f2007-05-31 10:44:43 +0000820 if (abuffersize > 20) {
Christian Heimesb186d002008-03-18 15:15:01 +0000821 abuffer = PyObject_Malloc(abuffersize);
Walter Dörwald346737f2007-05-31 10:44:43 +0000822 if (!abuffer) {
823 PyErr_NoMemory();
824 goto fail;
825 }
826 realbuffer = abuffer;
827 }
828 else
829 realbuffer = buffer;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000830 /* step 4: fill the buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000831 /* Since we've analyzed how much space we need for the worst case,
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000832 we don't have to resize the string.
833 There can be no errors beyond this point. */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000834 string = PyUnicode_FromUnicode(NULL, n);
835 if (!string)
Walter Dörwald346737f2007-05-31 10:44:43 +0000836 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000837
838 s = PyUnicode_AS_UNICODE(string);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000839 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000840
841 for (f = format; *f; f++) {
842 if (*f == '%') {
843 const char* p = f++;
844 int longflag = 0;
845 int size_tflag = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000846 zeropad = (*f == '0');
847 /* parse the width.precision part */
848 width = 0;
Christian Heimesfe337bf2008-03-23 21:54:12 +0000849 while (ISDIGIT((unsigned)*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000850 width = (width*10) + *f++ - '0';
851 precision = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000852 if (*f == '.') {
853 f++;
Christian Heimesfe337bf2008-03-23 21:54:12 +0000854 while (ISDIGIT((unsigned)*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000855 precision = (precision*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000856 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000857 /* handle the long flag, but only for %ld and %lu.
858 others can be added when necessary. */
859 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
860 longflag = 1;
861 ++f;
862 }
863 /* handle the size_t flag. */
864 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
865 size_tflag = 1;
866 ++f;
867 }
868
869 switch (*f) {
870 case 'c':
871 *s++ = va_arg(vargs, int);
872 break;
873 case 'd':
Walter Dörwald346737f2007-05-31 10:44:43 +0000874 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000875 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000876 sprintf(realbuffer, fmt, va_arg(vargs, long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000877 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000878 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000879 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000880 sprintf(realbuffer, fmt, va_arg(vargs, int));
881 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000882 break;
883 case 'u':
Walter Dörwald346737f2007-05-31 10:44:43 +0000884 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000885 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000886 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000887 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000888 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000889 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000890 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
891 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000892 break;
893 case 'i':
Walter Dörwald346737f2007-05-31 10:44:43 +0000894 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
895 sprintf(realbuffer, fmt, va_arg(vargs, int));
896 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000897 break;
898 case 'x':
Walter Dörwald346737f2007-05-31 10:44:43 +0000899 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
900 sprintf(realbuffer, fmt, va_arg(vargs, int));
901 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000902 break;
903 case 's':
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000904 {
905 /* Parameter must be UTF-8 encoded.
906 In case of encoding errors, use
907 the replacement character. */
908 PyObject *u;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000909 p = va_arg(vargs, char*);
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000910 u = PyUnicode_DecodeUTF8(p, strlen(p),
911 "replace");
912 if (!u)
913 goto fail;
914 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
915 PyUnicode_GET_SIZE(u));
916 s += PyUnicode_GET_SIZE(u);
917 Py_DECREF(u);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000918 break;
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000919 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000920 case 'U':
921 {
922 PyObject *obj = va_arg(vargs, PyObject *);
Walter Dörwald5c2fab62007-05-24 19:51:02 +0000923 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
924 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
925 s += size;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000926 break;
927 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000928 case 'V':
929 {
930 PyObject *obj = va_arg(vargs, PyObject *);
931 const char *str = va_arg(vargs, const char *);
932 if (obj) {
933 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
934 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
935 s += size;
936 } else {
937 appendstring(str);
938 }
939 break;
940 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000941 case 'S':
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000942 case 'R':
943 {
Guido van Rossum755114a2007-06-13 01:04:27 +0000944 Py_UNICODE *ucopy;
945 Py_ssize_t usize;
946 Py_ssize_t upos;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000947 /* unused, since we already have the result */
948 (void) va_arg(vargs, PyObject *);
Guido van Rossum755114a2007-06-13 01:04:27 +0000949 ucopy = PyUnicode_AS_UNICODE(*callresult);
950 usize = PyUnicode_GET_SIZE(*callresult);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000951 for (upos = 0; upos<usize;)
952 *s++ = ucopy[upos++];
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000953 /* We're done with the unicode()/repr() => forget it */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000954 Py_DECREF(*callresult);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000955 /* switch to next unicode()/repr() result */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000956 ++callresult;
957 break;
958 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000959 case 'p':
960 sprintf(buffer, "%p", va_arg(vargs, void*));
961 /* %p is ill-defined: ensure leading 0x. */
962 if (buffer[1] == 'X')
963 buffer[1] = 'x';
964 else if (buffer[1] != 'x') {
965 memmove(buffer+2, buffer, strlen(buffer)+1);
966 buffer[0] = '0';
967 buffer[1] = 'x';
968 }
969 appendstring(buffer);
970 break;
971 case '%':
972 *s++ = '%';
973 break;
974 default:
975 appendstring(p);
976 goto end;
977 }
978 } else
979 *s++ = *f;
980 }
981
982 end:
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000983 if (callresults)
Christian Heimesb186d002008-03-18 15:15:01 +0000984 PyObject_Free(callresults);
Walter Dörwald346737f2007-05-31 10:44:43 +0000985 if (abuffer)
Christian Heimesb186d002008-03-18 15:15:01 +0000986 PyObject_Free(abuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000987 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
988 return string;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000989 fail:
990 if (callresults) {
991 PyObject **callresult2 = callresults;
Guido van Rossum307fa8c2007-07-16 20:46:27 +0000992 while (callresult2 < callresult) {
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000993 Py_DECREF(*callresult2);
994 ++callresult2;
995 }
Christian Heimesb186d002008-03-18 15:15:01 +0000996 PyObject_Free(callresults);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000997 }
Walter Dörwald346737f2007-05-31 10:44:43 +0000998 if (abuffer)
Christian Heimesb186d002008-03-18 15:15:01 +0000999 PyObject_Free(abuffer);
Walter Dörwald7569dfe2007-05-19 21:49:49 +00001000 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001001}
1002
1003#undef appendstring
1004
1005PyObject *
1006PyUnicode_FromFormat(const char *format, ...)
1007{
1008 PyObject* ret;
1009 va_list vargs;
1010
1011#ifdef HAVE_STDARG_PROTOTYPES
1012 va_start(vargs, format);
1013#else
1014 va_start(vargs);
1015#endif
1016 ret = PyUnicode_FromFormatV(format, vargs);
1017 va_end(vargs);
1018 return ret;
1019}
1020
Martin v. Löwis18e16552006-02-15 17:27:45 +00001021Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
1022 wchar_t *w,
1023 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001024{
1025 if (unicode == NULL) {
1026 PyErr_BadInternalCall();
1027 return -1;
1028 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001029
1030 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001031 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001032 size = PyUnicode_GET_SIZE(unicode) + 1;
1033
Guido van Rossumd57fd912000-03-10 22:53:23 +00001034#ifdef HAVE_USABLE_WCHAR_T
1035 memcpy(w, unicode->str, size * sizeof(wchar_t));
1036#else
1037 {
1038 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001039 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001040 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +00001041 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001042 *w++ = *u++;
1043 }
1044#endif
1045
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001046 if (size > PyUnicode_GET_SIZE(unicode))
1047 return PyUnicode_GET_SIZE(unicode);
1048 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00001049 return size;
1050}
1051
1052#endif
1053
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001054PyObject *PyUnicode_FromOrdinal(int ordinal)
1055{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001056 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001057
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001058 if (ordinal < 0 || ordinal > 0x10ffff) {
1059 PyErr_SetString(PyExc_ValueError,
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001060 "chr() arg not in range(0x110000)");
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001061 return NULL;
1062 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001063
1064#ifndef Py_UNICODE_WIDE
1065 if (ordinal > 0xffff) {
1066 ordinal -= 0x10000;
1067 s[0] = 0xD800 | (ordinal >> 10);
1068 s[1] = 0xDC00 | (ordinal & 0x3FF);
1069 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001070 }
1071#endif
1072
Hye-Shik Chang40574832004-04-06 07:24:51 +00001073 s[0] = (Py_UNICODE)ordinal;
1074 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001075}
1076
Guido van Rossumd57fd912000-03-10 22:53:23 +00001077PyObject *PyUnicode_FromObject(register PyObject *obj)
1078{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001079 /* XXX Perhaps we should make this API an alias of
Thomas Heller519a0422007-11-15 20:48:54 +00001080 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001081 if (PyUnicode_CheckExact(obj)) {
1082 Py_INCREF(obj);
1083 return obj;
1084 }
1085 if (PyUnicode_Check(obj)) {
1086 /* For a Unicode subtype that's not a Unicode object,
1087 return a true Unicode object with the same data. */
1088 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1089 PyUnicode_GET_SIZE(obj));
1090 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001091 PyErr_Format(PyExc_TypeError,
1092 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001093 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001094 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001095}
1096
1097PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1098 const char *encoding,
1099 const char *errors)
1100{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001101 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001102 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001103 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001104
Guido van Rossumd57fd912000-03-10 22:53:23 +00001105 if (obj == NULL) {
1106 PyErr_BadInternalCall();
1107 return NULL;
1108 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001109
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001110 if (PyUnicode_Check(obj)) {
1111 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001112 "decoding str is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001113 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001114 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001115
1116 /* Coerce object */
Christian Heimes72b710a2008-05-26 13:28:38 +00001117 if (PyBytes_Check(obj)) {
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001118 s = PyBytes_AS_STRING(obj);
1119 len = PyBytes_GET_SIZE(obj);
1120 }
1121 else if (PyByteArray_Check(obj)) {
1122 s = PyByteArray_AS_STRING(obj);
1123 len = PyByteArray_GET_SIZE(obj);
1124 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001125 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1126 /* Overwrite the error message with something more useful in
1127 case of a TypeError. */
1128 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001129 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001130 "coercing to str: need string or buffer, "
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001131 "%.80s found",
Christian Heimes90aa7642007-12-19 02:45:37 +00001132 Py_TYPE(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001133 goto onError;
1134 }
Tim Petersced69f82003-09-16 20:30:58 +00001135
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001136 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001137 if (len == 0) {
1138 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001139 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001140 }
Tim Petersced69f82003-09-16 20:30:58 +00001141 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001142 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001143
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001144 return v;
1145
1146 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001147 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001148}
1149
1150PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001151 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001152 const char *encoding,
1153 const char *errors)
1154{
1155 PyObject *buffer = NULL, *unicode;
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001156 Py_buffer info;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001157 char lower[20]; /* Enough for any encoding name we recognize */
1158 char *l;
1159 const char *e;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001160
1161 if (encoding == NULL)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001162 encoding = PyUnicode_GetDefaultEncoding();
1163
1164 /* Convert encoding to lower case and replace '_' with '-' in order to
1165 catch e.g. UTF_8 */
1166 e = encoding;
1167 l = lower;
1168 while (*e && l < &lower[(sizeof lower) - 2]) {
1169 if (ISUPPER(*e)) {
1170 *l++ = TOLOWER(*e++);
1171 }
1172 else if (*e == '_') {
1173 *l++ = '-';
1174 e++;
1175 }
1176 else {
1177 *l++ = *e++;
1178 }
1179 }
1180 *l = '\0';
Fred Drakee4315f52000-05-09 19:53:39 +00001181
1182 /* Shortcuts for common default encodings */
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001183 if (strcmp(lower, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001184 return PyUnicode_DecodeUTF8(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001185 else if ((strcmp(lower, "latin-1") == 0) ||
1186 (strcmp(lower, "iso-8859-1") == 0))
Fred Drakee4315f52000-05-09 19:53:39 +00001187 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001188#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001189 else if (strcmp(lower, "mbcs") == 0)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001190 return PyUnicode_DecodeMBCS(s, size, errors);
1191#endif
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001192 else if (strcmp(lower, "ascii") == 0)
Fred Drakee4315f52000-05-09 19:53:39 +00001193 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001194 else if (strcmp(lower, "utf-16") == 0)
1195 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1196 else if (strcmp(lower, "utf-32") == 0)
1197 return PyUnicode_DecodeUTF32(s, size, errors, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001198
1199 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001200 buffer = NULL;
1201 if (PyBuffer_FillInfo(&info, (void *)s, size, 1, PyBUF_SIMPLE) < 0)
1202 goto onError;
1203 buffer = PyMemoryView_FromMemory(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001204 if (buffer == NULL)
1205 goto onError;
1206 unicode = PyCodec_Decode(buffer, encoding, errors);
1207 if (unicode == NULL)
1208 goto onError;
1209 if (!PyUnicode_Check(unicode)) {
1210 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001211 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001212 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001213 Py_DECREF(unicode);
1214 goto onError;
1215 }
1216 Py_DECREF(buffer);
1217 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001218
Guido van Rossumd57fd912000-03-10 22:53:23 +00001219 onError:
1220 Py_XDECREF(buffer);
1221 return NULL;
1222}
1223
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001224PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1225 const char *encoding,
1226 const char *errors)
1227{
1228 PyObject *v;
1229
1230 if (!PyUnicode_Check(unicode)) {
1231 PyErr_BadArgument();
1232 goto onError;
1233 }
1234
1235 if (encoding == NULL)
1236 encoding = PyUnicode_GetDefaultEncoding();
1237
1238 /* Decode via the codec registry */
1239 v = PyCodec_Decode(unicode, encoding, errors);
1240 if (v == NULL)
1241 goto onError;
1242 return v;
1243
1244 onError:
1245 return NULL;
1246}
1247
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001248PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1249 const char *encoding,
1250 const char *errors)
1251{
1252 PyObject *v;
1253
1254 if (!PyUnicode_Check(unicode)) {
1255 PyErr_BadArgument();
1256 goto onError;
1257 }
1258
1259 if (encoding == NULL)
1260 encoding = PyUnicode_GetDefaultEncoding();
1261
1262 /* Decode via the codec registry */
1263 v = PyCodec_Decode(unicode, encoding, errors);
1264 if (v == NULL)
1265 goto onError;
1266 if (!PyUnicode_Check(v)) {
1267 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001268 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001269 Py_TYPE(v)->tp_name);
1270 Py_DECREF(v);
1271 goto onError;
1272 }
1273 return v;
1274
1275 onError:
1276 return NULL;
1277}
1278
Guido van Rossumd57fd912000-03-10 22:53:23 +00001279PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001280 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001281 const char *encoding,
1282 const char *errors)
1283{
1284 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001285
Guido van Rossumd57fd912000-03-10 22:53:23 +00001286 unicode = PyUnicode_FromUnicode(s, size);
1287 if (unicode == NULL)
1288 return NULL;
1289 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1290 Py_DECREF(unicode);
1291 return v;
1292}
1293
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001294PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1295 const char *encoding,
1296 const char *errors)
1297{
1298 PyObject *v;
1299
1300 if (!PyUnicode_Check(unicode)) {
1301 PyErr_BadArgument();
1302 goto onError;
1303 }
1304
1305 if (encoding == NULL)
1306 encoding = PyUnicode_GetDefaultEncoding();
1307
1308 /* Encode via the codec registry */
1309 v = PyCodec_Encode(unicode, encoding, errors);
1310 if (v == NULL)
1311 goto onError;
1312 return v;
1313
1314 onError:
1315 return NULL;
1316}
1317
Guido van Rossumd57fd912000-03-10 22:53:23 +00001318PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1319 const char *encoding,
1320 const char *errors)
1321{
1322 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001323
Guido van Rossumd57fd912000-03-10 22:53:23 +00001324 if (!PyUnicode_Check(unicode)) {
1325 PyErr_BadArgument();
1326 goto onError;
1327 }
Fred Drakee4315f52000-05-09 19:53:39 +00001328
Tim Petersced69f82003-09-16 20:30:58 +00001329 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001330 encoding = PyUnicode_GetDefaultEncoding();
1331
1332 /* Shortcuts for common default encodings */
1333 if (errors == NULL) {
1334 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001335 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001336 else if (strcmp(encoding, "latin-1") == 0)
1337 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001338#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1339 else if (strcmp(encoding, "mbcs") == 0)
1340 return PyUnicode_AsMBCSString(unicode);
1341#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001342 else if (strcmp(encoding, "ascii") == 0)
1343 return PyUnicode_AsASCIIString(unicode);
1344 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001345
1346 /* Encode via the codec registry */
1347 v = PyCodec_Encode(unicode, encoding, errors);
1348 if (v == NULL)
1349 goto onError;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001350 if (PyByteArray_Check(v)) {
1351 char msg[100];
1352 PyOS_snprintf(msg, sizeof(msg),
1353 "encoder %s returned buffer instead of bytes",
1354 encoding);
1355 if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
1356 v = NULL;
1357 goto onError;
1358 }
1359 v = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1360 }
1361 else if (!PyBytes_Check(v)) {
1362 PyErr_Format(PyExc_TypeError,
1363 "encoder did not return a bytes object (type=%.400s)",
1364 Py_TYPE(v)->tp_name);
1365 v = NULL;
1366 }
1367 return v;
1368
1369 onError:
1370 return NULL;
1371}
1372
1373PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1374 const char *encoding,
1375 const char *errors)
1376{
1377 PyObject *v;
1378
1379 if (!PyUnicode_Check(unicode)) {
1380 PyErr_BadArgument();
1381 goto onError;
1382 }
1383
1384 if (encoding == NULL)
1385 encoding = PyUnicode_GetDefaultEncoding();
1386
1387 /* Encode via the codec registry */
1388 v = PyCodec_Encode(unicode, encoding, errors);
1389 if (v == NULL)
1390 goto onError;
1391 if (!PyUnicode_Check(v)) {
1392 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001393 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001394 Py_TYPE(v)->tp_name);
1395 Py_DECREF(v);
1396 goto onError;
1397 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001398 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001399
Guido van Rossumd57fd912000-03-10 22:53:23 +00001400 onError:
1401 return NULL;
1402}
1403
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001404PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1405 const char *errors)
1406{
1407 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001408 if (v)
1409 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001410 if (errors != NULL)
1411 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001412 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001413 PyUnicode_GET_SIZE(unicode),
1414 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001415 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001416 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001417 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001418 return v;
1419}
1420
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001421PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001422PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001423 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001424 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1425}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001426
Christian Heimes5894ba72007-11-04 11:43:14 +00001427PyObject*
1428PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1429{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001430 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1431 can be undefined. If it is case, decode using UTF-8. The following assumes
1432 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1433 bootstrapping process where the codecs aren't ready yet.
1434 */
1435 if (Py_FileSystemDefaultEncoding) {
1436#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001437 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001438 return PyUnicode_DecodeMBCS(s, size, "replace");
1439 }
1440#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001441 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001442 return PyUnicode_DecodeUTF8(s, size, "replace");
1443 }
1444#endif
1445 return PyUnicode_Decode(s, size,
1446 Py_FileSystemDefaultEncoding,
1447 "replace");
1448 }
1449 else {
1450 return PyUnicode_DecodeUTF8(s, size, "replace");
1451 }
1452}
1453
Martin v. Löwis5b222132007-06-10 09:51:05 +00001454char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001455_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001456{
Christian Heimesf3863112007-11-22 07:46:41 +00001457 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001458 if (!PyUnicode_Check(unicode)) {
1459 PyErr_BadArgument();
1460 return NULL;
1461 }
Christian Heimesf3863112007-11-22 07:46:41 +00001462 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1463 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001464 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001465 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001466 *psize = PyBytes_GET_SIZE(bytes);
1467 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001468}
1469
1470char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001471_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001472{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001473 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001474}
1475
Guido van Rossumd57fd912000-03-10 22:53:23 +00001476Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1477{
1478 if (!PyUnicode_Check(unicode)) {
1479 PyErr_BadArgument();
1480 goto onError;
1481 }
1482 return PyUnicode_AS_UNICODE(unicode);
1483
1484 onError:
1485 return NULL;
1486}
1487
Martin v. Löwis18e16552006-02-15 17:27:45 +00001488Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001489{
1490 if (!PyUnicode_Check(unicode)) {
1491 PyErr_BadArgument();
1492 goto onError;
1493 }
1494 return PyUnicode_GET_SIZE(unicode);
1495
1496 onError:
1497 return -1;
1498}
1499
Thomas Wouters78890102000-07-22 19:25:51 +00001500const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001501{
1502 return unicode_default_encoding;
1503}
1504
1505int PyUnicode_SetDefaultEncoding(const char *encoding)
1506{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001507 if (strcmp(encoding, unicode_default_encoding) != 0) {
1508 PyErr_Format(PyExc_ValueError,
1509 "Can only set default encoding to %s",
1510 unicode_default_encoding);
1511 return -1;
1512 }
Fred Drakee4315f52000-05-09 19:53:39 +00001513 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001514}
1515
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001516/* error handling callback helper:
1517 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001518 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001519 and adjust various state variables.
1520 return 0 on success, -1 on error
1521*/
1522
1523static
1524int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1525 const char *encoding, const char *reason,
Walter Dörwalda651d3d2007-08-30 15:29:21 +00001526 const char **input, const char **inend, Py_ssize_t *startinpos,
1527 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001528 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001529{
Benjamin Peterson142957c2008-07-04 19:55:29 +00001530 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001531
1532 PyObject *restuple = NULL;
1533 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001534 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001535 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001536 Py_ssize_t requiredsize;
1537 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001538 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001539 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001540 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001541 int res = -1;
1542
1543 if (*errorHandler == NULL) {
1544 *errorHandler = PyCodec_LookupError(errors);
1545 if (*errorHandler == NULL)
1546 goto onError;
1547 }
1548
1549 if (*exceptionObject == NULL) {
1550 *exceptionObject = PyUnicodeDecodeError_Create(
Walter Dörwalde78178e2007-07-30 13:31:40 +00001551 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001552 if (*exceptionObject == NULL)
1553 goto onError;
1554 }
1555 else {
1556 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1557 goto onError;
1558 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1559 goto onError;
1560 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1561 goto onError;
1562 }
1563
1564 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1565 if (restuple == NULL)
1566 goto onError;
1567 if (!PyTuple_Check(restuple)) {
1568 PyErr_Format(PyExc_TypeError, &argparse[4]);
1569 goto onError;
1570 }
1571 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1572 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001573
1574 /* Copy back the bytes variables, which might have been modified by the
1575 callback */
1576 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1577 if (!inputobj)
1578 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00001579 if (!PyBytes_Check(inputobj)) {
Walter Dörwalde78178e2007-07-30 13:31:40 +00001580 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
1581 }
Christian Heimes72b710a2008-05-26 13:28:38 +00001582 *input = PyBytes_AS_STRING(inputobj);
1583 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001584 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001585 /* we can DECREF safely, as the exception has another reference,
1586 so the object won't go away. */
1587 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001588
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001589 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001590 newpos = insize+newpos;
1591 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001592 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001593 goto onError;
1594 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001595
1596 /* need more space? (at least enough for what we
1597 have+the replacement+the rest of the string (starting
1598 at the new input position), so we won't have to check space
1599 when there are no errors in the rest of the string) */
1600 repptr = PyUnicode_AS_UNICODE(repunicode);
1601 repsize = PyUnicode_GET_SIZE(repunicode);
1602 requiredsize = *outpos + repsize + insize-newpos;
1603 if (requiredsize > outsize) {
1604 if (requiredsize<2*outsize)
1605 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001606 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001607 goto onError;
1608 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1609 }
1610 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001611 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001612 Py_UNICODE_COPY(*outptr, repptr, repsize);
1613 *outptr += repsize;
1614 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001615
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001616 /* we made it! */
1617 res = 0;
1618
1619 onError:
1620 Py_XDECREF(restuple);
1621 return res;
1622}
1623
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001624/* --- UTF-7 Codec -------------------------------------------------------- */
1625
1626/* see RFC2152 for details */
1627
static
const char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional)

       The table is indexed by the 7-bit character code, 16 entries per
       row, and is only ever read. */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};
1646
/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

/* True when character c must be base64-encoded: it is outside 1..127,
   is marked 1 in utf7_special, or is marked 2 / 3 while the matching
   encodeWS / encodeO flag is set.  c is evaluated more than once. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* Map the low 6 bits of n to its base64 alphabet character. */
#define B64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
/* True when c is a character of the base64 alphabet. */
#define B64CHAR(c) \
    (ISALNUM(c) || (c) == '+' || (c) == '/')
/* Inverse of B64 for a character already known to satisfy B64CHAR. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* Emit one base64 character through `out` for every complete group of
   6 pending bits in `ch`, most significant group first.  `out` and
   `bits` must be lvalues; both are updated.  Fewer than 6 bits remain
   pending afterwards. */
#define ENCODE(out, ch, bits) \
    do { \
        while ((bits) >= 6) { \
            *(out)++ = B64((ch) >> ((bits) - 6)); \
            (bits) -= 6; \
        } \
    } while (0)

/* Emit one Py_UNICODE through `out` for every complete group of 16
   pending bits in `ch`.  Relies on the expansion site providing an
   `errmsg` variable and a `utf7Error` label.
   NOTE(review): the range tested below (0xDC00..0xDFFF) is the low
   surrogate range, while the inline comments speak of the high
   surrogate -- confirm the intended behaviour before changing it. */
#define DECODE(out, ch, bits, surrogate) \
    do { \
        while ((bits) >= 16) { \
            Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits) - 16)) & 0xffff); \
            (bits) -= 16; \
            if (surrogate) { \
                /* We have already generated an error for the high surrogate \
                   so let's not bother seeing if the low surrogate is correct or not */ \
                (surrogate) = 0; \
            } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
                /* This is a surrogate pair. Unfortunately we can't represent \
                   it in a 16-bit character */ \
                (surrogate) = 1; \
                errmsg = "code pairs are not supported"; \
                goto utf7Error; \
            } else { \
                *(out)++ = outCh; \
            } \
        } \
    } while (0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001689
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001690PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001691 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001692 const char *errors)
1693{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001694 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1695}
1696
1697PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
1698 Py_ssize_t size,
1699 const char *errors,
1700 Py_ssize_t *consumed)
1701{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001702 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001703 Py_ssize_t startinpos;
1704 Py_ssize_t endinpos;
1705 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001706 const char *e;
1707 PyUnicodeObject *unicode;
1708 Py_UNICODE *p;
1709 const char *errmsg = "";
1710 int inShift = 0;
1711 unsigned int bitsleft = 0;
1712 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001713 int surrogate = 0;
1714 PyObject *errorHandler = NULL;
1715 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001716
1717 unicode = _PyUnicode_New(size);
1718 if (!unicode)
1719 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001720 if (size == 0) {
1721 if (consumed)
1722 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001723 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001724 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001725
1726 p = unicode->str;
1727 e = s + size;
1728
1729 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001730 Py_UNICODE ch;
1731 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00001732 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001733
1734 if (inShift) {
1735 if ((ch == '-') || !B64CHAR(ch)) {
1736 inShift = 0;
1737 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001738
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001739 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1740 if (bitsleft >= 6) {
1741 /* The shift sequence has a partial character in it. If
1742 bitsleft < 6 then we could just classify it as padding
1743 but that is not the case here */
1744
1745 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001746 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001747 }
1748 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001749 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001750 here so indicate the potential of a misencoded character. */
1751
1752 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1753 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1754 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001755 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001756 }
1757
1758 if (ch == '-') {
1759 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001760 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001761 inShift = 1;
1762 }
1763 } else if (SPECIAL(ch,0,0)) {
1764 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001765 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001766 } else {
1767 *p++ = ch;
1768 }
1769 } else {
1770 charsleft = (charsleft << 6) | UB64(ch);
1771 bitsleft += 6;
1772 s++;
1773 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1774 }
1775 }
1776 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001777 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001778 s++;
1779 if (s < e && *s == '-') {
1780 s++;
1781 *p++ = '+';
1782 } else
1783 {
1784 inShift = 1;
1785 bitsleft = 0;
1786 }
1787 }
1788 else if (SPECIAL(ch,0,0)) {
Walter Dörwald2b65c752007-08-30 15:35:26 +00001789 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001790 errmsg = "unexpected special character";
1791 s++;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001792 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001793 }
1794 else {
1795 *p++ = ch;
1796 s++;
1797 }
1798 continue;
1799 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001800 outpos = p-PyUnicode_AS_UNICODE(unicode);
1801 endinpos = s-starts;
1802 if (unicode_decode_call_errorhandler(
1803 errors, &errorHandler,
1804 "utf7", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001805 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001806 (PyObject **)&unicode, &outpos, &p))
1807 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001808 }
1809
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001810 if (inShift && !consumed) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001811 outpos = p-PyUnicode_AS_UNICODE(unicode);
1812 endinpos = size;
1813 if (unicode_decode_call_errorhandler(
1814 errors, &errorHandler,
1815 "utf7", "unterminated shift sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00001816 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001817 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001818 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001819 if (s < e)
1820 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001821 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001822 if (consumed) {
1823 if(inShift)
1824 *consumed = startinpos;
1825 else
1826 *consumed = s-starts;
1827 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001828
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001829 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001830 goto onError;
1831
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001832 Py_XDECREF(errorHandler);
1833 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001834 return (PyObject *)unicode;
1835
1836onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001837 Py_XDECREF(errorHandler);
1838 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001839 Py_DECREF(unicode);
1840 return NULL;
1841}
1842
1843
1844PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001845 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001846 int encodeSetO,
1847 int encodeWhiteSpace,
1848 const char *errors)
1849{
Guido van Rossum98297ee2007-11-06 21:34:58 +00001850 PyObject *v, *result;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001851 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001852 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001853 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001854 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001855 unsigned int bitsleft = 0;
1856 unsigned long charsleft = 0;
1857 char * out;
1858 char * start;
1859
1860 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00001861 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001862
Christian Heimes9c4756e2008-05-26 13:22:05 +00001863 v = PyByteArray_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001864 if (v == NULL)
1865 return NULL;
1866
Christian Heimes9c4756e2008-05-26 13:22:05 +00001867 start = out = PyByteArray_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001868 for (;i < size; ++i) {
1869 Py_UNICODE ch = s[i];
1870
1871 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001872 if (ch == '+') {
1873 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001874 *out++ = '-';
1875 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1876 charsleft = ch;
1877 bitsleft = 16;
1878 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001879 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001880 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001881 } else {
1882 *out++ = (char) ch;
1883 }
1884 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001885 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1886 *out++ = B64(charsleft << (6-bitsleft));
1887 charsleft = 0;
1888 bitsleft = 0;
1889 /* Characters not in the BASE64 set implicitly unshift the sequence
1890 so no '-' is required, except if the character is itself a '-' */
1891 if (B64CHAR(ch) || ch == '-') {
1892 *out++ = '-';
1893 }
1894 inShift = 0;
1895 *out++ = (char) ch;
1896 } else {
1897 bitsleft += 16;
1898 charsleft = (charsleft << 16) | ch;
1899 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1900
1901 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001902 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001903 or '-' then the shift sequence will be terminated implicitly and we
1904 don't have to insert a '-'. */
1905
1906 if (bitsleft == 0) {
1907 if (i + 1 < size) {
1908 Py_UNICODE ch2 = s[i+1];
1909
1910 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001911
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001912 } else if (B64CHAR(ch2) || ch2 == '-') {
1913 *out++ = '-';
1914 inShift = 0;
1915 } else {
1916 inShift = 0;
1917 }
1918
1919 }
1920 else {
1921 *out++ = '-';
1922 inShift = 0;
1923 }
1924 }
Tim Petersced69f82003-09-16 20:30:58 +00001925 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001926 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001927 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001928 if (bitsleft) {
1929 *out++= B64(charsleft << (6-bitsleft) );
1930 *out++ = '-';
1931 }
1932
Christian Heimes72b710a2008-05-26 13:28:38 +00001933 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), out - start);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001934 Py_DECREF(v);
1935 return result;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001936}
1937
1938#undef SPECIAL
1939#undef B64
1940#undef B64CHAR
1941#undef UB64
1942#undef ENCODE
1943#undef DECODE
1944
Guido van Rossumd57fd912000-03-10 22:53:23 +00001945/* --- UTF-8 Codec -------------------------------------------------------- */
1946
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
       illegal prefix.  see RFC 2279 for details

       The table is indexed by the first byte of a sequence, 16 entries
       per row, and is only ever read. */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1968
Guido van Rossumd57fd912000-03-10 22:53:23 +00001969PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001970 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001971 const char *errors)
1972{
Walter Dörwald69652032004-09-07 20:24:22 +00001973 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1974}
1975
1976PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001977 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001978 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001979 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001980{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001981 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001982 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001983 Py_ssize_t startinpos;
1984 Py_ssize_t endinpos;
1985 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001986 const char *e;
1987 PyUnicodeObject *unicode;
1988 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001989 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001990 PyObject *errorHandler = NULL;
1991 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001992
1993 /* Note: size will always be longer than the resulting Unicode
1994 character count */
1995 unicode = _PyUnicode_New(size);
1996 if (!unicode)
1997 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001998 if (size == 0) {
1999 if (consumed)
2000 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002001 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002002 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002003
2004 /* Unpack UTF-8 encoded data */
2005 p = unicode->str;
2006 e = s + size;
2007
2008 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002009 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002010
2011 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002012 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002013 s++;
2014 continue;
2015 }
2016
2017 n = utf8_code_length[ch];
2018
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002019 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00002020 if (consumed)
2021 break;
2022 else {
2023 errmsg = "unexpected end of data";
2024 startinpos = s-starts;
2025 endinpos = size;
2026 goto utf8Error;
2027 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002028 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002029
2030 switch (n) {
2031
2032 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002033 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002034 startinpos = s-starts;
2035 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002036 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002037
2038 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002039 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002040 startinpos = s-starts;
2041 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002042 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002043
2044 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002045 if ((s[1] & 0xc0) != 0x80) {
2046 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002047 startinpos = s-starts;
2048 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002049 goto utf8Error;
2050 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002051 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002052 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002053 startinpos = s-starts;
2054 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002055 errmsg = "illegal encoding";
2056 goto utf8Error;
2057 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002058 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002059 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002060 break;
2061
2062 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00002063 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002064 (s[2] & 0xc0) != 0x80) {
2065 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002066 startinpos = s-starts;
2067 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002068 goto utf8Error;
2069 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002070 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002071 if (ch < 0x0800) {
2072 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00002073 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002074
2075 XXX For wide builds (UCS-4) we should probably try
2076 to recombine the surrogates into a single code
2077 unit.
2078 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002079 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002080 startinpos = s-starts;
2081 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002082 goto utf8Error;
2083 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002084 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002085 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002086 break;
2087
2088 case 4:
2089 if ((s[1] & 0xc0) != 0x80 ||
2090 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002091 (s[3] & 0xc0) != 0x80) {
2092 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002093 startinpos = s-starts;
2094 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002095 goto utf8Error;
2096 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002097 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2098 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2099 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002100 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002101 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002102 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002103 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002104 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002105 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002106 startinpos = s-starts;
2107 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002108 goto utf8Error;
2109 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00002110#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002111 *p++ = (Py_UNICODE)ch;
2112#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002113 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002114
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002115 /* translate from 10000..10FFFF to 0..FFFF */
2116 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002117
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002118 /* high surrogate = top 10 bits added to D800 */
2119 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002120
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002121 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002122 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002123#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002124 break;
2125
2126 default:
2127 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002128 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002129 startinpos = s-starts;
2130 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002131 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002132 }
2133 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002134 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002135
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002136 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002137 outpos = p-PyUnicode_AS_UNICODE(unicode);
2138 if (unicode_decode_call_errorhandler(
2139 errors, &errorHandler,
2140 "utf8", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002141 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002142 (PyObject **)&unicode, &outpos, &p))
2143 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002144 }
Walter Dörwald69652032004-09-07 20:24:22 +00002145 if (consumed)
2146 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002147
2148 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002149 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002150 goto onError;
2151
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002152 Py_XDECREF(errorHandler);
2153 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002154 return (PyObject *)unicode;
2155
2156onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002157 Py_XDECREF(errorHandler);
2158 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002159 Py_DECREF(unicode);
2160 return NULL;
2161}
2162
Tim Peters602f7402002-04-27 18:03:26 +00002163/* Allocation strategy: if the string is short, convert into a stack buffer
2164 and allocate exactly as much space needed at the end. Else allocate the
2165 maximum possible needed (4 result bytes per Unicode character), and return
2166 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002167*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002168PyObject *
2169PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002170 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00002171 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002172{
Tim Peters602f7402002-04-27 18:03:26 +00002173#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002174
Guido van Rossum98297ee2007-11-06 21:34:58 +00002175 Py_ssize_t i; /* index into s of next input byte */
2176 PyObject *result; /* result string object */
2177 char *p; /* next free byte in output buffer */
2178 Py_ssize_t nallocated; /* number of result bytes allocated */
2179 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002180 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002181
Tim Peters602f7402002-04-27 18:03:26 +00002182 assert(s != NULL);
2183 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002184
Tim Peters602f7402002-04-27 18:03:26 +00002185 if (size <= MAX_SHORT_UNICHARS) {
2186 /* Write into the stack buffer; nallocated can't overflow.
2187 * At the end, we'll allocate exactly as much heap space as it
2188 * turns out we need.
2189 */
2190 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002191 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002192 p = stackbuf;
2193 }
2194 else {
2195 /* Overallocate on the heap, and give the excess back at the end. */
2196 nallocated = size * 4;
2197 if (nallocated / 4 != size) /* overflow! */
2198 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002199 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002200 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002201 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002202 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002203 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002204
Tim Peters602f7402002-04-27 18:03:26 +00002205 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002206 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002207
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002208 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002209 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002210 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002211
Guido van Rossumd57fd912000-03-10 22:53:23 +00002212 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002213 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002214 *p++ = (char)(0xc0 | (ch >> 6));
2215 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002216 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002217 else {
Tim Peters602f7402002-04-27 18:03:26 +00002218 /* Encode UCS2 Unicode ordinals */
2219 if (ch < 0x10000) {
2220 /* Special case: check for high surrogate */
2221 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2222 Py_UCS4 ch2 = s[i];
2223 /* Check for low surrogate and combine the two to
2224 form a UCS4 value */
2225 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002226 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002227 i++;
2228 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002229 }
Tim Peters602f7402002-04-27 18:03:26 +00002230 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002231 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002232 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002233 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2234 *p++ = (char)(0x80 | (ch & 0x3f));
2235 continue;
2236 }
2237encodeUCS4:
2238 /* Encode UCS4 Unicode ordinals */
2239 *p++ = (char)(0xf0 | (ch >> 18));
2240 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2241 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2242 *p++ = (char)(0x80 | (ch & 0x3f));
2243 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002244 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002245
Guido van Rossum98297ee2007-11-06 21:34:58 +00002246 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002247 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002248 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002249 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002250 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002251 }
2252 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002253 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00002254 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002255 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002256 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002257 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002258 return result;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002259
Tim Peters602f7402002-04-27 18:03:26 +00002260#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002261}
2262
Guido van Rossumd57fd912000-03-10 22:53:23 +00002263PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2264{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002265 if (!PyUnicode_Check(unicode)) {
2266 PyErr_BadArgument();
2267 return NULL;
2268 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002269 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2270 PyUnicode_GET_SIZE(unicode),
2271 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002272}
2273
Walter Dörwald41980ca2007-08-16 21:55:45 +00002274/* --- UTF-32 Codec ------------------------------------------------------- */
2275
2276PyObject *
2277PyUnicode_DecodeUTF32(const char *s,
2278 Py_ssize_t size,
2279 const char *errors,
2280 int *byteorder)
2281{
2282 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2283}
2284
2285PyObject *
2286PyUnicode_DecodeUTF32Stateful(const char *s,
2287 Py_ssize_t size,
2288 const char *errors,
2289 int *byteorder,
2290 Py_ssize_t *consumed)
2291{
2292 const char *starts = s;
2293 Py_ssize_t startinpos;
2294 Py_ssize_t endinpos;
2295 Py_ssize_t outpos;
2296 PyUnicodeObject *unicode;
2297 Py_UNICODE *p;
2298#ifndef Py_UNICODE_WIDE
2299 int i, pairs;
2300#else
2301 const int pairs = 0;
2302#endif
2303 const unsigned char *q, *e;
2304 int bo = 0; /* assume native ordering by default */
2305 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002306 /* Offsets from q for retrieving bytes in the right order. */
2307#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2308 int iorder[] = {0, 1, 2, 3};
2309#else
2310 int iorder[] = {3, 2, 1, 0};
2311#endif
2312 PyObject *errorHandler = NULL;
2313 PyObject *exc = NULL;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002314 /* On narrow builds we split characters outside the BMP into two
2315 codepoints => count how much extra space we need. */
2316#ifndef Py_UNICODE_WIDE
2317 for (i = pairs = 0; i < size/4; i++)
2318 if (((Py_UCS4 *)s)[i] >= 0x10000)
2319 pairs++;
2320#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00002321
2322 /* This might be one to much, because of a BOM */
2323 unicode = _PyUnicode_New((size+3)/4+pairs);
2324 if (!unicode)
2325 return NULL;
2326 if (size == 0)
2327 return (PyObject *)unicode;
2328
2329 /* Unpack UTF-32 encoded data */
2330 p = unicode->str;
2331 q = (unsigned char *)s;
2332 e = q + size;
2333
2334 if (byteorder)
2335 bo = *byteorder;
2336
2337 /* Check for BOM marks (U+FEFF) in the input and adjust current
2338 byte order setting accordingly. In native mode, the leading BOM
2339 mark is skipped, in all other modes, it is copied to the output
2340 stream as-is (giving a ZWNBSP character). */
2341 if (bo == 0) {
2342 if (size >= 4) {
2343 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2344 (q[iorder[1]] << 8) | q[iorder[0]];
2345#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2346 if (bom == 0x0000FEFF) {
2347 q += 4;
2348 bo = -1;
2349 }
2350 else if (bom == 0xFFFE0000) {
2351 q += 4;
2352 bo = 1;
2353 }
2354#else
2355 if (bom == 0x0000FEFF) {
2356 q += 4;
2357 bo = 1;
2358 }
2359 else if (bom == 0xFFFE0000) {
2360 q += 4;
2361 bo = -1;
2362 }
2363#endif
2364 }
2365 }
2366
2367 if (bo == -1) {
2368 /* force LE */
2369 iorder[0] = 0;
2370 iorder[1] = 1;
2371 iorder[2] = 2;
2372 iorder[3] = 3;
2373 }
2374 else if (bo == 1) {
2375 /* force BE */
2376 iorder[0] = 3;
2377 iorder[1] = 2;
2378 iorder[2] = 1;
2379 iorder[3] = 0;
2380 }
2381
2382 while (q < e) {
2383 Py_UCS4 ch;
2384 /* remaining bytes at the end? (size should be divisible by 4) */
2385 if (e-q<4) {
2386 if (consumed)
2387 break;
2388 errmsg = "truncated data";
2389 startinpos = ((const char *)q)-starts;
2390 endinpos = ((const char *)e)-starts;
2391 goto utf32Error;
2392 /* The remaining input chars are ignored if the callback
2393 chooses to skip the input */
2394 }
2395 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2396 (q[iorder[1]] << 8) | q[iorder[0]];
2397
2398 if (ch >= 0x110000)
2399 {
2400 errmsg = "codepoint not in range(0x110000)";
2401 startinpos = ((const char *)q)-starts;
2402 endinpos = startinpos+4;
2403 goto utf32Error;
2404 }
2405#ifndef Py_UNICODE_WIDE
2406 if (ch >= 0x10000)
2407 {
2408 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2409 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2410 }
2411 else
2412#endif
2413 *p++ = ch;
2414 q += 4;
2415 continue;
2416 utf32Error:
2417 outpos = p-PyUnicode_AS_UNICODE(unicode);
2418 if (unicode_decode_call_errorhandler(
2419 errors, &errorHandler,
2420 "utf32", errmsg,
2421 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2422 (PyObject **)&unicode, &outpos, &p))
2423 goto onError;
2424 }
2425
2426 if (byteorder)
2427 *byteorder = bo;
2428
2429 if (consumed)
2430 *consumed = (const char *)q-starts;
2431
2432 /* Adjust length */
2433 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2434 goto onError;
2435
2436 Py_XDECREF(errorHandler);
2437 Py_XDECREF(exc);
2438 return (PyObject *)unicode;
2439
2440onError:
2441 Py_DECREF(unicode);
2442 Py_XDECREF(errorHandler);
2443 Py_XDECREF(exc);
2444 return NULL;
2445}
2446
2447PyObject *
2448PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2449 Py_ssize_t size,
2450 const char *errors,
2451 int byteorder)
2452{
Guido van Rossum98297ee2007-11-06 21:34:58 +00002453 PyObject *v, *result;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002454 unsigned char *p;
2455#ifndef Py_UNICODE_WIDE
2456 int i, pairs;
2457#else
2458 const int pairs = 0;
2459#endif
2460 /* Offsets from p for storing byte pairs in the right order. */
2461#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2462 int iorder[] = {0, 1, 2, 3};
2463#else
2464 int iorder[] = {3, 2, 1, 0};
2465#endif
2466
2467#define STORECHAR(CH) \
2468 do { \
2469 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2470 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2471 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2472 p[iorder[0]] = (CH) & 0xff; \
2473 p += 4; \
2474 } while(0)
2475
2476 /* In narrow builds we can output surrogate pairs as one codepoint,
2477 so we need less space. */
2478#ifndef Py_UNICODE_WIDE
2479 for (i = pairs = 0; i < size-1; i++)
2480 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2481 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2482 pairs++;
2483#endif
Christian Heimes9c4756e2008-05-26 13:22:05 +00002484 v = PyByteArray_FromStringAndSize(NULL,
Walter Dörwald41980ca2007-08-16 21:55:45 +00002485 4 * (size - pairs + (byteorder == 0)));
2486 if (v == NULL)
2487 return NULL;
2488
Christian Heimes9c4756e2008-05-26 13:22:05 +00002489 p = (unsigned char *)PyByteArray_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002490 if (byteorder == 0)
2491 STORECHAR(0xFEFF);
2492 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002493 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002494
2495 if (byteorder == -1) {
2496 /* force LE */
2497 iorder[0] = 0;
2498 iorder[1] = 1;
2499 iorder[2] = 2;
2500 iorder[3] = 3;
2501 }
2502 else if (byteorder == 1) {
2503 /* force BE */
2504 iorder[0] = 3;
2505 iorder[1] = 2;
2506 iorder[2] = 1;
2507 iorder[3] = 0;
2508 }
2509
2510 while (size-- > 0) {
2511 Py_UCS4 ch = *s++;
2512#ifndef Py_UNICODE_WIDE
2513 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2514 Py_UCS4 ch2 = *s;
2515 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2516 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2517 s++;
2518 size--;
2519 }
2520 }
2521#endif
2522 STORECHAR(ch);
2523 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002524
2525 done:
Christian Heimes72b710a2008-05-26 13:28:38 +00002526 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
Guido van Rossum98297ee2007-11-06 21:34:58 +00002527 Py_DECREF(v);
2528 return result;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002529#undef STORECHAR
2530}
2531
2532PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2533{
2534 if (!PyUnicode_Check(unicode)) {
2535 PyErr_BadArgument();
2536 return NULL;
2537 }
2538 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2539 PyUnicode_GET_SIZE(unicode),
2540 NULL,
2541 0);
2542}
2543
Guido van Rossumd57fd912000-03-10 22:53:23 +00002544/* --- UTF-16 Codec ------------------------------------------------------- */
2545
Tim Peters772747b2001-08-09 22:21:55 +00002546PyObject *
2547PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002548 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002549 const char *errors,
2550 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002551{
Walter Dörwald69652032004-09-07 20:24:22 +00002552 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2553}
2554
2555PyObject *
2556PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002557 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002558 const char *errors,
2559 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002560 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002561{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002562 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002563 Py_ssize_t startinpos;
2564 Py_ssize_t endinpos;
2565 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002566 PyUnicodeObject *unicode;
2567 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002568 const unsigned char *q, *e;
2569 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002570 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002571 /* Offsets from q for retrieving byte pairs in the right order. */
2572#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2573 int ihi = 1, ilo = 0;
2574#else
2575 int ihi = 0, ilo = 1;
2576#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002577 PyObject *errorHandler = NULL;
2578 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002579
2580 /* Note: size will always be longer than the resulting Unicode
2581 character count */
2582 unicode = _PyUnicode_New(size);
2583 if (!unicode)
2584 return NULL;
2585 if (size == 0)
2586 return (PyObject *)unicode;
2587
2588 /* Unpack UTF-16 encoded data */
2589 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002590 q = (unsigned char *)s;
2591 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002592
2593 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002594 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002595
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002596 /* Check for BOM marks (U+FEFF) in the input and adjust current
2597 byte order setting accordingly. In native mode, the leading BOM
2598 mark is skipped, in all other modes, it is copied to the output
2599 stream as-is (giving a ZWNBSP character). */
2600 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002601 if (size >= 2) {
2602 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002603#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002604 if (bom == 0xFEFF) {
2605 q += 2;
2606 bo = -1;
2607 }
2608 else if (bom == 0xFFFE) {
2609 q += 2;
2610 bo = 1;
2611 }
Tim Petersced69f82003-09-16 20:30:58 +00002612#else
Walter Dörwald69652032004-09-07 20:24:22 +00002613 if (bom == 0xFEFF) {
2614 q += 2;
2615 bo = 1;
2616 }
2617 else if (bom == 0xFFFE) {
2618 q += 2;
2619 bo = -1;
2620 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002621#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002622 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002623 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002624
Tim Peters772747b2001-08-09 22:21:55 +00002625 if (bo == -1) {
2626 /* force LE */
2627 ihi = 1;
2628 ilo = 0;
2629 }
2630 else if (bo == 1) {
2631 /* force BE */
2632 ihi = 0;
2633 ilo = 1;
2634 }
2635
2636 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002637 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002638 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002639 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002640 if (consumed)
2641 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002642 errmsg = "truncated data";
2643 startinpos = ((const char *)q)-starts;
2644 endinpos = ((const char *)e)-starts;
2645 goto utf16Error;
2646 /* The remaining input chars are ignored if the callback
2647 chooses to skip the input */
2648 }
2649 ch = (q[ihi] << 8) | q[ilo];
2650
Tim Peters772747b2001-08-09 22:21:55 +00002651 q += 2;
2652
Guido van Rossumd57fd912000-03-10 22:53:23 +00002653 if (ch < 0xD800 || ch > 0xDFFF) {
2654 *p++ = ch;
2655 continue;
2656 }
2657
2658 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002659 if (q >= e) {
2660 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002661 startinpos = (((const char *)q)-2)-starts;
2662 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002663 goto utf16Error;
2664 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002665 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002666 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2667 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002668 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002669#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002670 *p++ = ch;
2671 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002672#else
2673 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002674#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002675 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002676 }
2677 else {
2678 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002679 startinpos = (((const char *)q)-4)-starts;
2680 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002681 goto utf16Error;
2682 }
2683
Guido van Rossumd57fd912000-03-10 22:53:23 +00002684 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002685 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002686 startinpos = (((const char *)q)-2)-starts;
2687 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002688 /* Fall through to report the error */
2689
2690 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002691 outpos = p-PyUnicode_AS_UNICODE(unicode);
2692 if (unicode_decode_call_errorhandler(
2693 errors, &errorHandler,
2694 "utf16", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002695 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002696 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002697 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002698 }
2699
2700 if (byteorder)
2701 *byteorder = bo;
2702
Walter Dörwald69652032004-09-07 20:24:22 +00002703 if (consumed)
2704 *consumed = (const char *)q-starts;
2705
Guido van Rossumd57fd912000-03-10 22:53:23 +00002706 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002707 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002708 goto onError;
2709
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002710 Py_XDECREF(errorHandler);
2711 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002712 return (PyObject *)unicode;
2713
2714onError:
2715 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002716 Py_XDECREF(errorHandler);
2717 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002718 return NULL;
2719}
2720
Tim Peters772747b2001-08-09 22:21:55 +00002721PyObject *
2722PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002723 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002724 const char *errors,
2725 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002726{
Guido van Rossum98297ee2007-11-06 21:34:58 +00002727 PyObject *v, *result;
Tim Peters772747b2001-08-09 22:21:55 +00002728 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002729#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00002730 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002731#else
2732 const int pairs = 0;
2733#endif
Tim Peters772747b2001-08-09 22:21:55 +00002734 /* Offsets from p for storing byte pairs in the right order. */
2735#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2736 int ihi = 1, ilo = 0;
2737#else
2738 int ihi = 0, ilo = 1;
2739#endif
2740
2741#define STORECHAR(CH) \
2742 do { \
2743 p[ihi] = ((CH) >> 8) & 0xff; \
2744 p[ilo] = (CH) & 0xff; \
2745 p += 2; \
2746 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002747
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002748#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002749 for (i = pairs = 0; i < size; i++)
2750 if (s[i] >= 0x10000)
2751 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002752#endif
Christian Heimes9c4756e2008-05-26 13:22:05 +00002753 v = PyByteArray_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002754 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002755 if (v == NULL)
2756 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002757
Christian Heimes9c4756e2008-05-26 13:22:05 +00002758 p = (unsigned char *)PyByteArray_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002759 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002760 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002761 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002762 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00002763
2764 if (byteorder == -1) {
2765 /* force LE */
2766 ihi = 1;
2767 ilo = 0;
2768 }
2769 else if (byteorder == 1) {
2770 /* force BE */
2771 ihi = 0;
2772 ilo = 1;
2773 }
2774
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002775 while (size-- > 0) {
2776 Py_UNICODE ch = *s++;
2777 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002778#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002779 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002780 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2781 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002782 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002783#endif
Tim Peters772747b2001-08-09 22:21:55 +00002784 STORECHAR(ch);
2785 if (ch2)
2786 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002787 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002788
2789 done:
Christian Heimes72b710a2008-05-26 13:28:38 +00002790 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
Guido van Rossum98297ee2007-11-06 21:34:58 +00002791 Py_DECREF(v);
2792 return result;
Tim Peters772747b2001-08-09 22:21:55 +00002793#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002794}
2795
2796PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2797{
2798 if (!PyUnicode_Check(unicode)) {
2799 PyErr_BadArgument();
2800 return NULL;
2801 }
2802 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2803 PyUnicode_GET_SIZE(unicode),
2804 NULL,
2805 0);
2806}
2807
2808/* --- Unicode Escape Codec ----------------------------------------------- */
2809
Fredrik Lundh06d12682001-01-24 07:59:11 +00002810static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002811
Guido van Rossumd57fd912000-03-10 22:53:23 +00002812PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002813 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002814 const char *errors)
2815{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002816 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002817 Py_ssize_t startinpos;
2818 Py_ssize_t endinpos;
2819 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002820 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002821 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002822 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002823 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002824 char* message;
2825 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002826 PyObject *errorHandler = NULL;
2827 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002828
Guido van Rossumd57fd912000-03-10 22:53:23 +00002829 /* Escaped strings will always be longer than the resulting
2830 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002831 length after conversion to the true value.
2832 (but if the error callback returns a long replacement string
2833 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002834 v = _PyUnicode_New(size);
2835 if (v == NULL)
2836 goto onError;
2837 if (size == 0)
2838 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002839
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002840 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002841 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002842
Guido van Rossumd57fd912000-03-10 22:53:23 +00002843 while (s < end) {
2844 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002845 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002846 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002847
2848 /* Non-escape characters are interpreted as Unicode ordinals */
2849 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002850 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002851 continue;
2852 }
2853
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002854 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002855 /* \ - Escapes */
2856 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002857 c = *s++;
2858 if (s > end)
2859 c = '\0'; /* Invalid after \ */
2860 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002861
2862 /* \x escapes */
2863 case '\n': break;
2864 case '\\': *p++ = '\\'; break;
2865 case '\'': *p++ = '\''; break;
2866 case '\"': *p++ = '\"'; break;
2867 case 'b': *p++ = '\b'; break;
2868 case 'f': *p++ = '\014'; break; /* FF */
2869 case 't': *p++ = '\t'; break;
2870 case 'n': *p++ = '\n'; break;
2871 case 'r': *p++ = '\r'; break;
2872 case 'v': *p++ = '\013'; break; /* VT */
2873 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2874
2875 /* \OOO (octal) escapes */
2876 case '0': case '1': case '2': case '3':
2877 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002878 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002879 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002880 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002881 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002882 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002883 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002884 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002885 break;
2886
Fredrik Lundhccc74732001-02-18 22:13:49 +00002887 /* hex escapes */
2888 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002889 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002890 digits = 2;
2891 message = "truncated \\xXX escape";
2892 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002893
Fredrik Lundhccc74732001-02-18 22:13:49 +00002894 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002895 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002896 digits = 4;
2897 message = "truncated \\uXXXX escape";
2898 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002899
Fredrik Lundhccc74732001-02-18 22:13:49 +00002900 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002901 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002902 digits = 8;
2903 message = "truncated \\UXXXXXXXX escape";
2904 hexescape:
2905 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002906 outpos = p-PyUnicode_AS_UNICODE(v);
2907 if (s+digits>end) {
2908 endinpos = size;
2909 if (unicode_decode_call_errorhandler(
2910 errors, &errorHandler,
2911 "unicodeescape", "end of string in escape sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002912 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002913 (PyObject **)&v, &outpos, &p))
2914 goto onError;
2915 goto nextByte;
2916 }
2917 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002918 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00002919 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002920 endinpos = (s+i+1)-starts;
2921 if (unicode_decode_call_errorhandler(
2922 errors, &errorHandler,
2923 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002924 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002925 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002926 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002927 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002928 }
2929 chr = (chr<<4) & ~0xF;
2930 if (c >= '0' && c <= '9')
2931 chr += c - '0';
2932 else if (c >= 'a' && c <= 'f')
2933 chr += 10 + c - 'a';
2934 else
2935 chr += 10 + c - 'A';
2936 }
2937 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002938 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002939 /* _decoding_error will have already written into the
2940 target buffer. */
2941 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002942 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002943 /* when we get here, chr is a 32-bit unicode character */
2944 if (chr <= 0xffff)
2945 /* UCS-2 character */
2946 *p++ = (Py_UNICODE) chr;
2947 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002948 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002949 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002950#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002951 *p++ = chr;
2952#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002953 chr -= 0x10000L;
2954 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002955 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002956#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002957 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002958 endinpos = s-starts;
2959 outpos = p-PyUnicode_AS_UNICODE(v);
2960 if (unicode_decode_call_errorhandler(
2961 errors, &errorHandler,
2962 "unicodeescape", "illegal Unicode character",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002963 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002964 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002965 goto onError;
2966 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002967 break;
2968
2969 /* \N{name} */
2970 case 'N':
2971 message = "malformed \\N character escape";
2972 if (ucnhash_CAPI == NULL) {
2973 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002974 PyObject *m, *api;
Christian Heimes072c0f12008-01-03 23:01:04 +00002975 m = PyImport_ImportModuleNoBlock("unicodedata");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002976 if (m == NULL)
2977 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002978 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002979 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002980 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002981 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002982 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002983 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002984 if (ucnhash_CAPI == NULL)
2985 goto ucnhashError;
2986 }
2987 if (*s == '{') {
2988 const char *start = s+1;
2989 /* look for the closing brace */
2990 while (*s != '}' && s < end)
2991 s++;
2992 if (s > start && s < end && *s == '}') {
2993 /* found a name. look it up in the unicode database */
2994 message = "unknown Unicode character name";
2995 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002996 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002997 goto store;
2998 }
2999 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003000 endinpos = s-starts;
3001 outpos = p-PyUnicode_AS_UNICODE(v);
3002 if (unicode_decode_call_errorhandler(
3003 errors, &errorHandler,
3004 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003005 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003006 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003007 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003008 break;
3009
3010 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003011 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003012 message = "\\ at end of string";
3013 s--;
3014 endinpos = s-starts;
3015 outpos = p-PyUnicode_AS_UNICODE(v);
3016 if (unicode_decode_call_errorhandler(
3017 errors, &errorHandler,
3018 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003019 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003020 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003021 goto onError;
3022 }
3023 else {
3024 *p++ = '\\';
3025 *p++ = (unsigned char)s[-1];
3026 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003027 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003028 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003029 nextByte:
3030 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003031 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003032 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003033 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003034 Py_XDECREF(errorHandler);
3035 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003036 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003037
Fredrik Lundhccc74732001-02-18 22:13:49 +00003038ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003039 PyErr_SetString(
3040 PyExc_UnicodeError,
3041 "\\N escapes not supported (can't load unicodedata module)"
3042 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003043 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003044 Py_XDECREF(errorHandler);
3045 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003046 return NULL;
3047
Fredrik Lundhccc74732001-02-18 22:13:49 +00003048onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003049 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003050 Py_XDECREF(errorHandler);
3051 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003052 return NULL;
3053}
3054
3055/* Return a Unicode-Escape string version of the Unicode object.
3056
3057 If quotes is true, the string is enclosed in u"" or u'' quotes as
3058 appropriate.
3059
3060*/
3061
Thomas Wouters477c8d52006-05-27 19:21:47 +00003062Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
3063 Py_ssize_t size,
3064 Py_UNICODE ch)
3065{
3066 /* like wcschr, but doesn't stop at NULL characters */
3067
3068 while (size-- > 0) {
3069 if (*s == ch)
3070 return s;
3071 s++;
3072 }
3073
3074 return NULL;
3075}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003076
/* Lookup table mapping a nibble value (0..15) to its lowercase hexadecimal
   digit.  Both the characters and the pointer are const so that the table
   cannot be re-pointed or modified by accident. */
static const char * const hexdigits = "0123456789abcdef";
3078
3079PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
3080 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003081{
Guido van Rossum98297ee2007-11-06 21:34:58 +00003082 PyObject *repr, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003083 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003084
Thomas Wouters89f507f2006-12-13 04:49:30 +00003085 /* XXX(nnorwitz): rather than over-allocating, it would be
3086 better to choose a different scheme. Perhaps scan the
3087 first N-chars of the string and allocate based on that size.
3088 */
3089 /* Initial allocation is based on the longest-possible unichr
3090 escape.
3091
3092 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3093 unichr, so in this case it's the longest unichr escape. In
3094 narrow (UTF-16) builds this is five chars per source unichr
3095 since there are two unichrs in the surrogate pair, so in narrow
3096 (UTF-16) builds it's not the longest unichr escape.
3097
3098 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3099 so in the narrow (UTF-16) build case it's the longest unichr
3100 escape.
3101 */
3102
Christian Heimes9c4756e2008-05-26 13:22:05 +00003103 repr = PyByteArray_FromStringAndSize(NULL,
Thomas Wouters89f507f2006-12-13 04:49:30 +00003104#ifdef Py_UNICODE_WIDE
3105 + 10*size
3106#else
3107 + 6*size
3108#endif
3109 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003110 if (repr == NULL)
3111 return NULL;
3112
Christian Heimes9c4756e2008-05-26 13:22:05 +00003113 p = PyByteArray_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003114
Guido van Rossumd57fd912000-03-10 22:53:23 +00003115 while (size-- > 0) {
3116 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003117
Walter Dörwald79e913e2007-05-12 11:08:06 +00003118 /* Escape backslashes */
3119 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003120 *p++ = '\\';
3121 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003122 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003123 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003124
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003125#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003126 /* Map 21-bit characters to '\U00xxxxxx' */
3127 else if (ch >= 0x10000) {
3128 *p++ = '\\';
3129 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003130 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3131 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3132 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3133 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3134 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3135 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3136 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3137 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003138 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003139 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003140#else
3141 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003142 else if (ch >= 0xD800 && ch < 0xDC00) {
3143 Py_UNICODE ch2;
3144 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003145
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003146 ch2 = *s++;
3147 size--;
3148 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3149 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3150 *p++ = '\\';
3151 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003152 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3153 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3154 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3155 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3156 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3157 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3158 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3159 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003160 continue;
3161 }
3162 /* Fall through: isolated surrogates are copied as-is */
3163 s--;
3164 size++;
3165 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003166#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003167
Guido van Rossumd57fd912000-03-10 22:53:23 +00003168 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003169 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003170 *p++ = '\\';
3171 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003172 *p++ = hexdigits[(ch >> 12) & 0x000F];
3173 *p++ = hexdigits[(ch >> 8) & 0x000F];
3174 *p++ = hexdigits[(ch >> 4) & 0x000F];
3175 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003176 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003177
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003178 /* Map special whitespace to '\t', \n', '\r' */
3179 else if (ch == '\t') {
3180 *p++ = '\\';
3181 *p++ = 't';
3182 }
3183 else if (ch == '\n') {
3184 *p++ = '\\';
3185 *p++ = 'n';
3186 }
3187 else if (ch == '\r') {
3188 *p++ = '\\';
3189 *p++ = 'r';
3190 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003191
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003192 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003193 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003194 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003195 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003196 *p++ = hexdigits[(ch >> 4) & 0x000F];
3197 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003198 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003199
Guido van Rossumd57fd912000-03-10 22:53:23 +00003200 /* Copy everything else as-is */
3201 else
3202 *p++ = (char) ch;
3203 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003204
Christian Heimes72b710a2008-05-26 13:28:38 +00003205 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(repr),
Christian Heimes9c4756e2008-05-26 13:22:05 +00003206 p - PyByteArray_AS_STRING(repr));
Guido van Rossum98297ee2007-11-06 21:34:58 +00003207 Py_DECREF(repr);
3208 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003209}
3210
Guido van Rossumd57fd912000-03-10 22:53:23 +00003211PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3212{
Walter Dörwald79e913e2007-05-12 11:08:06 +00003213 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003214 if (!PyUnicode_Check(unicode)) {
3215 PyErr_BadArgument();
3216 return NULL;
3217 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003218 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3219 PyUnicode_GET_SIZE(unicode));
3220
3221 if (!s)
3222 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00003223 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(s),
Christian Heimes9c4756e2008-05-26 13:22:05 +00003224 PyByteArray_GET_SIZE(s));
Walter Dörwald79e913e2007-05-12 11:08:06 +00003225 Py_DECREF(s);
3226 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003227}
3228
3229/* --- Raw Unicode Escape Codec ------------------------------------------- */
3230
3231PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003232 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003233 const char *errors)
3234{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003235 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003236 Py_ssize_t startinpos;
3237 Py_ssize_t endinpos;
3238 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003239 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003240 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003241 const char *end;
3242 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003243 PyObject *errorHandler = NULL;
3244 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003245
Guido van Rossumd57fd912000-03-10 22:53:23 +00003246 /* Escaped strings will always be longer than the resulting
3247 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003248 length after conversion to the true value. (But decoding error
3249 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003250 v = _PyUnicode_New(size);
3251 if (v == NULL)
3252 goto onError;
3253 if (size == 0)
3254 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003255 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003256 end = s + size;
3257 while (s < end) {
3258 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00003259 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003260 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003261 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003262
3263 /* Non-escape characters are interpreted as Unicode ordinals */
3264 if (*s != '\\') {
3265 *p++ = (unsigned char)*s++;
3266 continue;
3267 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003268 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003269
3270 /* \u-escapes are only interpreted iff the number of leading
3271 backslashes if odd */
3272 bs = s;
3273 for (;s < end;) {
3274 if (*s != '\\')
3275 break;
3276 *p++ = (unsigned char)*s++;
3277 }
3278 if (((s - bs) & 1) == 0 ||
3279 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003280 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003281 continue;
3282 }
3283 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003284 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003285 s++;
3286
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003287 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003288 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003289 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003290 c = (unsigned char)*s;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003291 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003292 endinpos = s-starts;
3293 if (unicode_decode_call_errorhandler(
3294 errors, &errorHandler,
3295 "rawunicodeescape", "truncated \\uXXXX",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003296 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003297 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003298 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003299 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003300 }
3301 x = (x<<4) & ~0xF;
3302 if (c >= '0' && c <= '9')
3303 x += c - '0';
3304 else if (c >= 'a' && c <= 'f')
3305 x += 10 + c - 'a';
3306 else
3307 x += 10 + c - 'A';
3308 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00003309 if (x <= 0xffff)
3310 /* UCS-2 character */
3311 *p++ = (Py_UNICODE) x;
3312 else if (x <= 0x10ffff) {
3313 /* UCS-4 character. Either store directly, or as
3314 surrogate pair. */
3315#ifdef Py_UNICODE_WIDE
Christian Heimescc47b052008-03-25 14:56:36 +00003316 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003317#else
3318 x -= 0x10000L;
3319 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3320 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3321#endif
3322 } else {
3323 endinpos = s-starts;
3324 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003325 if (unicode_decode_call_errorhandler(
3326 errors, &errorHandler,
3327 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003328 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003329 (PyObject **)&v, &outpos, &p))
3330 goto onError;
3331 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003332 nextByte:
3333 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003334 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003335 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003336 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003337 Py_XDECREF(errorHandler);
3338 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003339 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003340
Guido van Rossumd57fd912000-03-10 22:53:23 +00003341 onError:
3342 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003343 Py_XDECREF(errorHandler);
3344 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003345 return NULL;
3346}
3347
3348PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003349 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003350{
Guido van Rossum98297ee2007-11-06 21:34:58 +00003351 PyObject *repr, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003352 char *p;
3353 char *q;
3354
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003355#ifdef Py_UNICODE_WIDE
Christian Heimes9c4756e2008-05-26 13:22:05 +00003356 repr = PyByteArray_FromStringAndSize(NULL, 10 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003357#else
Christian Heimes9c4756e2008-05-26 13:22:05 +00003358 repr = PyByteArray_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003359#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003360 if (repr == NULL)
3361 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003362 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003363 goto done;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003364
Christian Heimes9c4756e2008-05-26 13:22:05 +00003365 p = q = PyByteArray_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003366 while (size-- > 0) {
3367 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003368#ifdef Py_UNICODE_WIDE
3369 /* Map 32-bit characters to '\Uxxxxxxxx' */
3370 if (ch >= 0x10000) {
3371 *p++ = '\\';
3372 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003373 *p++ = hexdigits[(ch >> 28) & 0xf];
3374 *p++ = hexdigits[(ch >> 24) & 0xf];
3375 *p++ = hexdigits[(ch >> 20) & 0xf];
3376 *p++ = hexdigits[(ch >> 16) & 0xf];
3377 *p++ = hexdigits[(ch >> 12) & 0xf];
3378 *p++ = hexdigits[(ch >> 8) & 0xf];
3379 *p++ = hexdigits[(ch >> 4) & 0xf];
3380 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003381 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003382 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00003383#else
3384 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3385 if (ch >= 0xD800 && ch < 0xDC00) {
3386 Py_UNICODE ch2;
3387 Py_UCS4 ucs;
3388
3389 ch2 = *s++;
3390 size--;
3391 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3392 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3393 *p++ = '\\';
3394 *p++ = 'U';
3395 *p++ = hexdigits[(ucs >> 28) & 0xf];
3396 *p++ = hexdigits[(ucs >> 24) & 0xf];
3397 *p++ = hexdigits[(ucs >> 20) & 0xf];
3398 *p++ = hexdigits[(ucs >> 16) & 0xf];
3399 *p++ = hexdigits[(ucs >> 12) & 0xf];
3400 *p++ = hexdigits[(ucs >> 8) & 0xf];
3401 *p++ = hexdigits[(ucs >> 4) & 0xf];
3402 *p++ = hexdigits[ucs & 0xf];
3403 continue;
3404 }
3405 /* Fall through: isolated surrogates are copied as-is */
3406 s--;
3407 size++;
3408 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003409#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003410 /* Map 16-bit characters to '\uxxxx' */
3411 if (ch >= 256) {
3412 *p++ = '\\';
3413 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003414 *p++ = hexdigits[(ch >> 12) & 0xf];
3415 *p++ = hexdigits[(ch >> 8) & 0xf];
3416 *p++ = hexdigits[(ch >> 4) & 0xf];
3417 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003418 }
3419 /* Copy everything else as-is */
3420 else
3421 *p++ = (char) ch;
3422 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003423 size = p - q;
3424
3425 done:
Christian Heimes72b710a2008-05-26 13:28:38 +00003426 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(repr), size);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003427 Py_DECREF(repr);
3428 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003429}
3430
3431PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3432{
Walter Dörwald711005d2007-05-12 12:03:26 +00003433 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003434 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003435 PyErr_BadArgument();
3436 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003437 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003438 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3439 PyUnicode_GET_SIZE(unicode));
3440
3441 if (!s)
3442 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00003443 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(s),
Christian Heimes9c4756e2008-05-26 13:22:05 +00003444 PyByteArray_GET_SIZE(s));
Walter Dörwald711005d2007-05-12 12:03:26 +00003445 Py_DECREF(s);
3446 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003447}
3448
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003449/* --- Unicode Internal Codec ------------------------------------------- */
3450
3451PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003452 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003453 const char *errors)
3454{
3455 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003456 Py_ssize_t startinpos;
3457 Py_ssize_t endinpos;
3458 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003459 PyUnicodeObject *v;
3460 Py_UNICODE *p;
3461 const char *end;
3462 const char *reason;
3463 PyObject *errorHandler = NULL;
3464 PyObject *exc = NULL;
3465
Neal Norwitzd43069c2006-01-08 01:12:10 +00003466#ifdef Py_UNICODE_WIDE
3467 Py_UNICODE unimax = PyUnicode_GetMax();
3468#endif
3469
Thomas Wouters89f507f2006-12-13 04:49:30 +00003470 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003471 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3472 if (v == NULL)
3473 goto onError;
3474 if (PyUnicode_GetSize((PyObject *)v) == 0)
3475 return (PyObject *)v;
3476 p = PyUnicode_AS_UNICODE(v);
3477 end = s + size;
3478
3479 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00003480 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003481 /* We have to sanity check the raw data, otherwise doom looms for
3482 some malformed UCS-4 data. */
3483 if (
3484 #ifdef Py_UNICODE_WIDE
3485 *p > unimax || *p < 0 ||
3486 #endif
3487 end-s < Py_UNICODE_SIZE
3488 )
3489 {
3490 startinpos = s - starts;
3491 if (end-s < Py_UNICODE_SIZE) {
3492 endinpos = end-starts;
3493 reason = "truncated input";
3494 }
3495 else {
3496 endinpos = s - starts + Py_UNICODE_SIZE;
3497 reason = "illegal code point (> 0x10FFFF)";
3498 }
3499 outpos = p - PyUnicode_AS_UNICODE(v);
3500 if (unicode_decode_call_errorhandler(
3501 errors, &errorHandler,
3502 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003503 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003504 (PyObject **)&v, &outpos, &p)) {
3505 goto onError;
3506 }
3507 }
3508 else {
3509 p++;
3510 s += Py_UNICODE_SIZE;
3511 }
3512 }
3513
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003514 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003515 goto onError;
3516 Py_XDECREF(errorHandler);
3517 Py_XDECREF(exc);
3518 return (PyObject *)v;
3519
3520 onError:
3521 Py_XDECREF(v);
3522 Py_XDECREF(errorHandler);
3523 Py_XDECREF(exc);
3524 return NULL;
3525}
3526
Guido van Rossumd57fd912000-03-10 22:53:23 +00003527/* --- Latin-1 Codec ------------------------------------------------------ */
3528
3529PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003530 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003531 const char *errors)
3532{
3533 PyUnicodeObject *v;
3534 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003535
Guido van Rossumd57fd912000-03-10 22:53:23 +00003536 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003537 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003538 Py_UNICODE r = *(unsigned char*)s;
3539 return PyUnicode_FromUnicode(&r, 1);
3540 }
3541
Guido van Rossumd57fd912000-03-10 22:53:23 +00003542 v = _PyUnicode_New(size);
3543 if (v == NULL)
3544 goto onError;
3545 if (size == 0)
3546 return (PyObject *)v;
3547 p = PyUnicode_AS_UNICODE(v);
3548 while (size-- > 0)
3549 *p++ = (unsigned char)*s++;
3550 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003551
Guido van Rossumd57fd912000-03-10 22:53:23 +00003552 onError:
3553 Py_XDECREF(v);
3554 return NULL;
3555}
3556
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003557/* create or adjust a UnicodeEncodeError */
3558static void make_encode_exception(PyObject **exceptionObject,
3559 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003560 const Py_UNICODE *unicode, Py_ssize_t size,
3561 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003562 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003563{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003564 if (*exceptionObject == NULL) {
3565 *exceptionObject = PyUnicodeEncodeError_Create(
3566 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003567 }
3568 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003569 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3570 goto onError;
3571 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3572 goto onError;
3573 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3574 goto onError;
3575 return;
3576 onError:
3577 Py_DECREF(*exceptionObject);
3578 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003579 }
3580}
3581
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003582/* raises a UnicodeEncodeError */
3583static void raise_encode_exception(PyObject **exceptionObject,
3584 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003585 const Py_UNICODE *unicode, Py_ssize_t size,
3586 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003587 const char *reason)
3588{
3589 make_encode_exception(exceptionObject,
3590 encoding, unicode, size, startpos, endpos, reason);
3591 if (*exceptionObject != NULL)
3592 PyCodec_StrictErrors(*exceptionObject);
3593}
3594
3595/* error handling callback helper:
3596 build arguments, call the callback and check the arguments,
3597 put the result into newpos and return the replacement string, which
3598 has to be freed by the caller */
3599static PyObject *unicode_encode_call_errorhandler(const char *errors,
3600 PyObject **errorHandler,
3601 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003602 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3603 Py_ssize_t startpos, Py_ssize_t endpos,
3604 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003605{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003606 static char *argparse = "O!n;encoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003607
3608 PyObject *restuple;
3609 PyObject *resunicode;
3610
3611 if (*errorHandler == NULL) {
3612 *errorHandler = PyCodec_LookupError(errors);
3613 if (*errorHandler == NULL)
3614 return NULL;
3615 }
3616
3617 make_encode_exception(exceptionObject,
3618 encoding, unicode, size, startpos, endpos, reason);
3619 if (*exceptionObject == NULL)
3620 return NULL;
3621
3622 restuple = PyObject_CallFunctionObjArgs(
3623 *errorHandler, *exceptionObject, NULL);
3624 if (restuple == NULL)
3625 return NULL;
3626 if (!PyTuple_Check(restuple)) {
3627 PyErr_Format(PyExc_TypeError, &argparse[4]);
3628 Py_DECREF(restuple);
3629 return NULL;
3630 }
3631 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3632 &resunicode, newpos)) {
3633 Py_DECREF(restuple);
3634 return NULL;
3635 }
3636 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003637 *newpos = size+*newpos;
3638 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003639 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003640 Py_DECREF(restuple);
3641 return NULL;
3642 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003643 Py_INCREF(resunicode);
3644 Py_DECREF(restuple);
3645 return resunicode;
3646}
3647
3648static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003649 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003650 const char *errors,
3651 int limit)
3652{
3653 /* output object */
3654 PyObject *res;
3655 /* pointers to the beginning and end+1 of input */
3656 const Py_UNICODE *startp = p;
3657 const Py_UNICODE *endp = p + size;
3658 /* pointer to the beginning of the unencodable characters */
3659 /* const Py_UNICODE *badp = NULL; */
3660 /* pointer into the output */
3661 char *str;
3662 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003663 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003664 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3665 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003666 PyObject *errorHandler = NULL;
3667 PyObject *exc = NULL;
Guido van Rossum98297ee2007-11-06 21:34:58 +00003668 PyObject *result = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003669 /* the following variable is used for caching string comparisons
3670 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3671 int known_errorHandler = -1;
3672
3673 /* allocate enough for a simple encoding without
3674 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00003675 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00003676 return PyBytes_FromStringAndSize(NULL, 0);
Christian Heimes9c4756e2008-05-26 13:22:05 +00003677 res = PyByteArray_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003678 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003679 return NULL;
Christian Heimes9c4756e2008-05-26 13:22:05 +00003680 str = PyByteArray_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003681 ressize = size;
3682
3683 while (p<endp) {
3684 Py_UNICODE c = *p;
3685
3686 /* can we encode this? */
3687 if (c<limit) {
3688 /* no overflow check, because we know that the space is enough */
3689 *str++ = (char)c;
3690 ++p;
3691 }
3692 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003693 Py_ssize_t unicodepos = p-startp;
3694 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003695 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003696 Py_ssize_t repsize;
3697 Py_ssize_t newpos;
3698 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003699 Py_UNICODE *uni2;
3700 /* startpos for collecting unencodable chars */
3701 const Py_UNICODE *collstart = p;
3702 const Py_UNICODE *collend = p;
3703 /* find all unecodable characters */
3704 while ((collend < endp) && ((*collend)>=limit))
3705 ++collend;
3706 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3707 if (known_errorHandler==-1) {
3708 if ((errors==NULL) || (!strcmp(errors, "strict")))
3709 known_errorHandler = 1;
3710 else if (!strcmp(errors, "replace"))
3711 known_errorHandler = 2;
3712 else if (!strcmp(errors, "ignore"))
3713 known_errorHandler = 3;
3714 else if (!strcmp(errors, "xmlcharrefreplace"))
3715 known_errorHandler = 4;
3716 else
3717 known_errorHandler = 0;
3718 }
3719 switch (known_errorHandler) {
3720 case 1: /* strict */
3721 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3722 goto onError;
3723 case 2: /* replace */
3724 while (collstart++<collend)
3725 *str++ = '?'; /* fall through */
3726 case 3: /* ignore */
3727 p = collend;
3728 break;
3729 case 4: /* xmlcharrefreplace */
Christian Heimes9c4756e2008-05-26 13:22:05 +00003730 respos = str - PyByteArray_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003731 /* determine replacement size (temporarily (mis)uses p) */
3732 for (p = collstart, repsize = 0; p < collend; ++p) {
3733 if (*p<10)
3734 repsize += 2+1+1;
3735 else if (*p<100)
3736 repsize += 2+2+1;
3737 else if (*p<1000)
3738 repsize += 2+3+1;
3739 else if (*p<10000)
3740 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003741#ifndef Py_UNICODE_WIDE
3742 else
3743 repsize += 2+5+1;
3744#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003745 else if (*p<100000)
3746 repsize += 2+5+1;
3747 else if (*p<1000000)
3748 repsize += 2+6+1;
3749 else
3750 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003751#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003752 }
3753 requiredsize = respos+repsize+(endp-collend);
3754 if (requiredsize > ressize) {
3755 if (requiredsize<2*ressize)
3756 requiredsize = 2*ressize;
Christian Heimes9c4756e2008-05-26 13:22:05 +00003757 if (PyByteArray_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003758 goto onError;
Christian Heimes9c4756e2008-05-26 13:22:05 +00003759 str = PyByteArray_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003760 ressize = requiredsize;
3761 }
3762 /* generate replacement (temporarily (mis)uses p) */
3763 for (p = collstart; p < collend; ++p) {
3764 str += sprintf(str, "&#%d;", (int)*p);
3765 }
3766 p = collend;
3767 break;
3768 default:
3769 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3770 encoding, reason, startp, size, &exc,
3771 collstart-startp, collend-startp, &newpos);
3772 if (repunicode == NULL)
3773 goto onError;
3774 /* need more space? (at least enough for what we
3775 have+the replacement+the rest of the string, so
3776 we won't have to check space for encodable characters) */
Christian Heimes9c4756e2008-05-26 13:22:05 +00003777 respos = str - PyByteArray_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003778 repsize = PyUnicode_GET_SIZE(repunicode);
3779 requiredsize = respos+repsize+(endp-collend);
3780 if (requiredsize > ressize) {
3781 if (requiredsize<2*ressize)
3782 requiredsize = 2*ressize;
Christian Heimes9c4756e2008-05-26 13:22:05 +00003783 if (PyByteArray_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003784 Py_DECREF(repunicode);
3785 goto onError;
3786 }
Christian Heimes9c4756e2008-05-26 13:22:05 +00003787 str = PyByteArray_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003788 ressize = requiredsize;
3789 }
3790 /* check if there is anything unencodable in the replacement
3791 and copy it to the output */
3792 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3793 c = *uni2;
3794 if (c >= limit) {
3795 raise_encode_exception(&exc, encoding, startp, size,
3796 unicodepos, unicodepos+1, reason);
3797 Py_DECREF(repunicode);
3798 goto onError;
3799 }
3800 *str = (char)c;
3801 }
3802 p = startp + newpos;
3803 Py_DECREF(repunicode);
3804 }
3805 }
3806 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003807 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(res),
Christian Heimes9c4756e2008-05-26 13:22:05 +00003808 str - PyByteArray_AS_STRING(res));
Guido van Rossum98297ee2007-11-06 21:34:58 +00003809 onError:
3810 Py_DECREF(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003811 Py_XDECREF(errorHandler);
3812 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003813 return result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003814}
3815
Guido van Rossumd57fd912000-03-10 22:53:23 +00003816PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003817 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003818 const char *errors)
3819{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003820 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003821}
3822
3823PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3824{
3825 if (!PyUnicode_Check(unicode)) {
3826 PyErr_BadArgument();
3827 return NULL;
3828 }
3829 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3830 PyUnicode_GET_SIZE(unicode),
3831 NULL);
3832}
3833
3834/* --- 7-bit ASCII Codec -------------------------------------------------- */
3835
Guido van Rossumd57fd912000-03-10 22:53:23 +00003836PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003837 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003838 const char *errors)
3839{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003840 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003841 PyUnicodeObject *v;
3842 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003843 Py_ssize_t startinpos;
3844 Py_ssize_t endinpos;
3845 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003846 const char *e;
3847 PyObject *errorHandler = NULL;
3848 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003849
Guido van Rossumd57fd912000-03-10 22:53:23 +00003850 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003851 if (size == 1 && *(unsigned char*)s < 128) {
3852 Py_UNICODE r = *(unsigned char*)s;
3853 return PyUnicode_FromUnicode(&r, 1);
3854 }
Tim Petersced69f82003-09-16 20:30:58 +00003855
Guido van Rossumd57fd912000-03-10 22:53:23 +00003856 v = _PyUnicode_New(size);
3857 if (v == NULL)
3858 goto onError;
3859 if (size == 0)
3860 return (PyObject *)v;
3861 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003862 e = s + size;
3863 while (s < e) {
3864 register unsigned char c = (unsigned char)*s;
3865 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003866 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003867 ++s;
3868 }
3869 else {
3870 startinpos = s-starts;
3871 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003872 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003873 if (unicode_decode_call_errorhandler(
3874 errors, &errorHandler,
3875 "ascii", "ordinal not in range(128)",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003876 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003877 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003878 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003879 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003880 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00003881 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003882 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003883 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003884 Py_XDECREF(errorHandler);
3885 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003886 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003887
Guido van Rossumd57fd912000-03-10 22:53:23 +00003888 onError:
3889 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003890 Py_XDECREF(errorHandler);
3891 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003892 return NULL;
3893}
3894
Guido van Rossumd57fd912000-03-10 22:53:23 +00003895PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003896 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003897 const char *errors)
3898{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003899 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003900}
3901
3902PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3903{
3904 if (!PyUnicode_Check(unicode)) {
3905 PyErr_BadArgument();
3906 return NULL;
3907 }
3908 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3909 PyUnicode_GET_SIZE(unicode),
3910 NULL);
3911}
3912
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003913#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003914
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003915/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003916
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003917#if SIZEOF_INT < SIZEOF_SSIZE_T
3918#define NEED_RETRY
3919#endif
3920
3921/* XXX This code is limited to "true" double-byte encodings, as
3922 a) it assumes an incomplete character consists of a single byte, and
3923 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3924 encodings, see IsDBCSLeadByteEx documentation. */
3925
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead
   byte, 0 otherwise.

   The byte must satisfy IsDBCSLeadByte().  It is then accepted when
   CharPrev() cannot step back from it (it returns `curr` itself), when
   the previous character does not start with a lead byte, or when the
   previous character is exactly two bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
3936
3937/*
3938 * Decode MBCS string into unicode object. If 'final' is set, converts
3939 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3940 */
3941static int decode_mbcs(PyUnicodeObject **v,
3942 const char *s, /* MBCS string */
3943 int size, /* sizeof MBCS string */
3944 int final)
3945{
3946 Py_UNICODE *p;
3947 Py_ssize_t n = 0;
3948 int usize = 0;
3949
3950 assert(size >= 0);
3951
3952 /* Skip trailing lead-byte unless 'final' is set */
3953 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3954 --size;
3955
3956 /* First get the size of the result */
3957 if (size > 0) {
3958 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3959 if (usize == 0) {
3960 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3961 return -1;
3962 }
3963 }
3964
3965 if (*v == NULL) {
3966 /* Create unicode object */
3967 *v = _PyUnicode_New(usize);
3968 if (*v == NULL)
3969 return -1;
3970 }
3971 else {
3972 /* Extend unicode object */
3973 n = PyUnicode_GET_SIZE(*v);
3974 if (_PyUnicode_Resize(v, n + usize) < 0)
3975 return -1;
3976 }
3977
3978 /* Do the conversion */
3979 if (size > 0) {
3980 p = PyUnicode_AS_UNICODE(*v) + n;
3981 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3982 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3983 return -1;
3984 }
3985 }
3986
3987 return size;
3988}
3989
3990PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3991 Py_ssize_t size,
3992 const char *errors,
3993 Py_ssize_t *consumed)
3994{
3995 PyUnicodeObject *v = NULL;
3996 int done;
3997
3998 if (consumed)
3999 *consumed = 0;
4000
4001#ifdef NEED_RETRY
4002 retry:
4003 if (size > INT_MAX)
4004 done = decode_mbcs(&v, s, INT_MAX, 0);
4005 else
4006#endif
4007 done = decode_mbcs(&v, s, (int)size, !consumed);
4008
4009 if (done < 0) {
4010 Py_XDECREF(v);
4011 return NULL;
4012 }
4013
4014 if (consumed)
4015 *consumed += done;
4016
4017#ifdef NEED_RETRY
4018 if (size > INT_MAX) {
4019 s += done;
4020 size -= done;
4021 goto retry;
4022 }
4023#endif
4024
4025 return (PyObject *)v;
4026}
4027
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004028PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004029 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004030 const char *errors)
4031{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004032 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4033}
4034
4035/*
4036 * Convert unicode into string object (MBCS).
4037 * Returns 0 if succeed, -1 otherwise.
4038 */
4039static int encode_mbcs(PyObject **repr,
4040 const Py_UNICODE *p, /* unicode */
4041 int size) /* size of unicode */
4042{
4043 int mbcssize = 0;
4044 Py_ssize_t n = 0;
4045
4046 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004047
4048 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004049 if (size > 0) {
4050 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4051 if (mbcssize == 0) {
4052 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4053 return -1;
4054 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004055 }
4056
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004057 if (*repr == NULL) {
4058 /* Create string object */
Christian Heimes72b710a2008-05-26 13:28:38 +00004059 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004060 if (*repr == NULL)
4061 return -1;
4062 }
4063 else {
4064 /* Extend string object */
Christian Heimes72b710a2008-05-26 13:28:38 +00004065 n = PyBytes_Size(*repr);
4066 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004067 return -1;
4068 }
4069
4070 /* Do the conversion */
4071 if (size > 0) {
Christian Heimes72b710a2008-05-26 13:28:38 +00004072 char *s = PyBytes_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004073 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4074 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4075 return -1;
4076 }
4077 }
4078
4079 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004080}
4081
4082PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004083 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004084 const char *errors)
4085{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004086 PyObject *repr = NULL;
4087 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004088
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004089#ifdef NEED_RETRY
4090 retry:
4091 if (size > INT_MAX)
4092 ret = encode_mbcs(&repr, p, INT_MAX);
4093 else
4094#endif
4095 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004096
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004097 if (ret < 0) {
4098 Py_XDECREF(repr);
4099 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004100 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004101
4102#ifdef NEED_RETRY
4103 if (size > INT_MAX) {
4104 p += INT_MAX;
4105 size -= INT_MAX;
4106 goto retry;
4107 }
4108#endif
4109
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004110 return repr;
4111}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004112
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004113PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4114{
4115 if (!PyUnicode_Check(unicode)) {
4116 PyErr_BadArgument();
4117 return NULL;
4118 }
4119 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
4120 PyUnicode_GET_SIZE(unicode),
4121 NULL);
4122}
4123
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004124#undef NEED_RETRY
4125
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004126#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004127
Guido van Rossumd57fd912000-03-10 22:53:23 +00004128/* --- Character Mapping Codec -------------------------------------------- */
4129
Guido van Rossumd57fd912000-03-10 22:53:23 +00004130PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004131 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004132 PyObject *mapping,
4133 const char *errors)
4134{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004135 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004136 Py_ssize_t startinpos;
4137 Py_ssize_t endinpos;
4138 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004139 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004140 PyUnicodeObject *v;
4141 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004142 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004143 PyObject *errorHandler = NULL;
4144 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004145 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004146 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004147
Guido van Rossumd57fd912000-03-10 22:53:23 +00004148 /* Default to Latin-1 */
4149 if (mapping == NULL)
4150 return PyUnicode_DecodeLatin1(s, size, errors);
4151
4152 v = _PyUnicode_New(size);
4153 if (v == NULL)
4154 goto onError;
4155 if (size == 0)
4156 return (PyObject *)v;
4157 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004158 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004159 if (PyUnicode_CheckExact(mapping)) {
4160 mapstring = PyUnicode_AS_UNICODE(mapping);
4161 maplen = PyUnicode_GET_SIZE(mapping);
4162 while (s < e) {
4163 unsigned char ch = *s;
4164 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004165
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004166 if (ch < maplen)
4167 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004168
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004169 if (x == 0xfffe) {
4170 /* undefined mapping */
4171 outpos = p-PyUnicode_AS_UNICODE(v);
4172 startinpos = s-starts;
4173 endinpos = startinpos+1;
4174 if (unicode_decode_call_errorhandler(
4175 errors, &errorHandler,
4176 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00004177 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004178 (PyObject **)&v, &outpos, &p)) {
4179 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00004180 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004181 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00004182 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004183 *p++ = x;
4184 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004185 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004186 }
4187 else {
4188 while (s < e) {
4189 unsigned char ch = *s;
4190 PyObject *w, *x;
4191
4192 /* Get mapping (char ordinal -> integer, Unicode char or None) */
Christian Heimes217cfd12007-12-02 14:31:20 +00004193 w = PyLong_FromLong((long)ch);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004194 if (w == NULL)
4195 goto onError;
4196 x = PyObject_GetItem(mapping, w);
4197 Py_DECREF(w);
4198 if (x == NULL) {
4199 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4200 /* No mapping found means: mapping is undefined. */
4201 PyErr_Clear();
4202 x = Py_None;
4203 Py_INCREF(x);
4204 } else
4205 goto onError;
4206 }
4207
4208 /* Apply mapping */
Christian Heimes217cfd12007-12-02 14:31:20 +00004209 if (PyLong_Check(x)) {
4210 long value = PyLong_AS_LONG(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004211 if (value < 0 || value > 65535) {
4212 PyErr_SetString(PyExc_TypeError,
4213 "character mapping must be in range(65536)");
4214 Py_DECREF(x);
4215 goto onError;
4216 }
4217 *p++ = (Py_UNICODE)value;
4218 }
4219 else if (x == Py_None) {
4220 /* undefined mapping */
4221 outpos = p-PyUnicode_AS_UNICODE(v);
4222 startinpos = s-starts;
4223 endinpos = startinpos+1;
4224 if (unicode_decode_call_errorhandler(
4225 errors, &errorHandler,
4226 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00004227 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004228 (PyObject **)&v, &outpos, &p)) {
4229 Py_DECREF(x);
4230 goto onError;
4231 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00004232 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004233 continue;
4234 }
4235 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004236 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004237
4238 if (targetsize == 1)
4239 /* 1-1 mapping */
4240 *p++ = *PyUnicode_AS_UNICODE(x);
4241
4242 else if (targetsize > 1) {
4243 /* 1-n mapping */
4244 if (targetsize > extrachars) {
4245 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004246 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4247 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004248 (targetsize << 2);
4249 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00004250 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004251 if (_PyUnicode_Resize(&v,
4252 PyUnicode_GET_SIZE(v) + needed) < 0) {
4253 Py_DECREF(x);
4254 goto onError;
4255 }
4256 p = PyUnicode_AS_UNICODE(v) + oldpos;
4257 }
4258 Py_UNICODE_COPY(p,
4259 PyUnicode_AS_UNICODE(x),
4260 targetsize);
4261 p += targetsize;
4262 extrachars -= targetsize;
4263 }
4264 /* 1-0 mapping: skip the character */
4265 }
4266 else {
4267 /* wrong return value */
4268 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00004269 "character mapping must return integer, None or str");
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004270 Py_DECREF(x);
4271 goto onError;
4272 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004273 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004274 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004275 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004276 }
4277 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004278 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004279 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004280 Py_XDECREF(errorHandler);
4281 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004282 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004283
Guido van Rossumd57fd912000-03-10 22:53:23 +00004284 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004285 Py_XDECREF(errorHandler);
4286 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004287 Py_XDECREF(v);
4288 return NULL;
4289}
4290
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004291/* Charmap encoding: the lookup table */
4292
4293struct encoding_map{
4294 PyObject_HEAD
4295 unsigned char level1[32];
4296 int count2, count3;
4297 unsigned char level23[1];
4298};
4299
4300static PyObject*
4301encoding_map_size(PyObject *obj, PyObject* args)
4302{
4303 struct encoding_map *map = (struct encoding_map*)obj;
Christian Heimes217cfd12007-12-02 14:31:20 +00004304 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004305 128*map->count3);
4306}
4307
4308static PyMethodDef encoding_map_methods[] = {
4309 {"size", encoding_map_size, METH_NOARGS,
4310 PyDoc_STR("Return the size (in bytes) of this object") },
4311 { 0 }
4312};
4313
4314static void
4315encoding_map_dealloc(PyObject* o)
4316{
4317 PyObject_FREE(o);
4318}
4319
4320static PyTypeObject EncodingMapType = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004321 PyVarObject_HEAD_INIT(NULL, 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004322 "EncodingMap", /*tp_name*/
4323 sizeof(struct encoding_map), /*tp_basicsize*/
4324 0, /*tp_itemsize*/
4325 /* methods */
4326 encoding_map_dealloc, /*tp_dealloc*/
4327 0, /*tp_print*/
4328 0, /*tp_getattr*/
4329 0, /*tp_setattr*/
4330 0, /*tp_compare*/
4331 0, /*tp_repr*/
4332 0, /*tp_as_number*/
4333 0, /*tp_as_sequence*/
4334 0, /*tp_as_mapping*/
4335 0, /*tp_hash*/
4336 0, /*tp_call*/
4337 0, /*tp_str*/
4338 0, /*tp_getattro*/
4339 0, /*tp_setattro*/
4340 0, /*tp_as_buffer*/
4341 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4342 0, /*tp_doc*/
4343 0, /*tp_traverse*/
4344 0, /*tp_clear*/
4345 0, /*tp_richcompare*/
4346 0, /*tp_weaklistoffset*/
4347 0, /*tp_iter*/
4348 0, /*tp_iternext*/
4349 encoding_map_methods, /*tp_methods*/
4350 0, /*tp_members*/
4351 0, /*tp_getset*/
4352 0, /*tp_base*/
4353 0, /*tp_dict*/
4354 0, /*tp_descr_get*/
4355 0, /*tp_descr_set*/
4356 0, /*tp_dictoffset*/
4357 0, /*tp_init*/
4358 0, /*tp_alloc*/
4359 0, /*tp_new*/
4360 0, /*tp_free*/
4361 0, /*tp_is_gc*/
4362};
4363
4364PyObject*
4365PyUnicode_BuildEncodingMap(PyObject* string)
4366{
4367 Py_UNICODE *decode;
4368 PyObject *result;
4369 struct encoding_map *mresult;
4370 int i;
4371 int need_dict = 0;
4372 unsigned char level1[32];
4373 unsigned char level2[512];
4374 unsigned char *mlevel1, *mlevel2, *mlevel3;
4375 int count2 = 0, count3 = 0;
4376
4377 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4378 PyErr_BadArgument();
4379 return NULL;
4380 }
4381 decode = PyUnicode_AS_UNICODE(string);
4382 memset(level1, 0xFF, sizeof level1);
4383 memset(level2, 0xFF, sizeof level2);
4384
4385 /* If there isn't a one-to-one mapping of NULL to \0,
4386 or if there are non-BMP characters, we need to use
4387 a mapping dictionary. */
4388 if (decode[0] != 0)
4389 need_dict = 1;
4390 for (i = 1; i < 256; i++) {
4391 int l1, l2;
4392 if (decode[i] == 0
4393 #ifdef Py_UNICODE_WIDE
4394 || decode[i] > 0xFFFF
4395 #endif
4396 ) {
4397 need_dict = 1;
4398 break;
4399 }
4400 if (decode[i] == 0xFFFE)
4401 /* unmapped character */
4402 continue;
4403 l1 = decode[i] >> 11;
4404 l2 = decode[i] >> 7;
4405 if (level1[l1] == 0xFF)
4406 level1[l1] = count2++;
4407 if (level2[l2] == 0xFF)
4408 level2[l2] = count3++;
4409 }
4410
4411 if (count2 >= 0xFF || count3 >= 0xFF)
4412 need_dict = 1;
4413
4414 if (need_dict) {
4415 PyObject *result = PyDict_New();
4416 PyObject *key, *value;
4417 if (!result)
4418 return NULL;
4419 for (i = 0; i < 256; i++) {
4420 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00004421 key = PyLong_FromLong(decode[i]);
4422 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004423 if (!key || !value)
4424 goto failed1;
4425 if (PyDict_SetItem(result, key, value) == -1)
4426 goto failed1;
4427 Py_DECREF(key);
4428 Py_DECREF(value);
4429 }
4430 return result;
4431 failed1:
4432 Py_XDECREF(key);
4433 Py_XDECREF(value);
4434 Py_DECREF(result);
4435 return NULL;
4436 }
4437
4438 /* Create a three-level trie */
4439 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4440 16*count2 + 128*count3 - 1);
4441 if (!result)
4442 return PyErr_NoMemory();
4443 PyObject_Init(result, &EncodingMapType);
4444 mresult = (struct encoding_map*)result;
4445 mresult->count2 = count2;
4446 mresult->count3 = count3;
4447 mlevel1 = mresult->level1;
4448 mlevel2 = mresult->level23;
4449 mlevel3 = mresult->level23 + 16*count2;
4450 memcpy(mlevel1, level1, 32);
4451 memset(mlevel2, 0xFF, 16*count2);
4452 memset(mlevel3, 0, 128*count3);
4453 count3 = 0;
4454 for (i = 1; i < 256; i++) {
4455 int o1, o2, o3, i2, i3;
4456 if (decode[i] == 0xFFFE)
4457 /* unmapped character */
4458 continue;
4459 o1 = decode[i]>>11;
4460 o2 = (decode[i]>>7) & 0xF;
4461 i2 = 16*mlevel1[o1] + o2;
4462 if (mlevel2[i2] == 0xFF)
4463 mlevel2[i2] = count3++;
4464 o3 = decode[i] & 0x7F;
4465 i3 = 128*mlevel2[i2] + o3;
4466 mlevel3[i3] = i;
4467 }
4468 return result;
4469}
4470
4471static int
4472encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4473{
4474 struct encoding_map *map = (struct encoding_map*)mapping;
4475 int l1 = c>>11;
4476 int l2 = (c>>7) & 0xF;
4477 int l3 = c & 0x7F;
4478 int i;
4479
4480#ifdef Py_UNICODE_WIDE
4481 if (c > 0xFFFF) {
4482 return -1;
4483 }
4484#endif
4485 if (c == 0)
4486 return 0;
4487 /* level 1*/
4488 i = map->level1[l1];
4489 if (i == 0xFF) {
4490 return -1;
4491 }
4492 /* level 2*/
4493 i = map->level23[16*i+l2];
4494 if (i == 0xFF) {
4495 return -1;
4496 }
4497 /* level 3 */
4498 i = map->level23[16*map->count2 + 128*i + l3];
4499 if (i == 0) {
4500 return -1;
4501 }
4502 return i;
4503}
4504
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004505/* Lookup the character ch in the mapping. If the character
4506 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004507 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004508static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004509{
Christian Heimes217cfd12007-12-02 14:31:20 +00004510 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004511 PyObject *x;
4512
4513 if (w == NULL)
4514 return NULL;
4515 x = PyObject_GetItem(mapping, w);
4516 Py_DECREF(w);
4517 if (x == NULL) {
4518 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4519 /* No mapping found means: mapping is undefined. */
4520 PyErr_Clear();
4521 x = Py_None;
4522 Py_INCREF(x);
4523 return x;
4524 } else
4525 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004526 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004527 else if (x == Py_None)
4528 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00004529 else if (PyLong_Check(x)) {
4530 long value = PyLong_AS_LONG(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004531 if (value < 0 || value > 255) {
4532 PyErr_SetString(PyExc_TypeError,
4533 "character mapping must be in range(256)");
4534 Py_DECREF(x);
4535 return NULL;
4536 }
4537 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004538 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004539 else if (PyBytes_Check(x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004540 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004541 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004542 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00004543 PyErr_Format(PyExc_TypeError,
Christian Heimesf3863112007-11-22 07:46:41 +00004544 "character mapping must return integer, bytes or None, not %.400s",
Walter Dörwald580ceed2007-05-09 10:39:19 +00004545 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004546 Py_DECREF(x);
4547 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004548 }
4549}
4550
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004551static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00004552charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004553{
Christian Heimes72b710a2008-05-26 13:28:38 +00004554 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004555 /* exponentially overallocate to minimize reallocations */
4556 if (requiredsize < 2*outsize)
4557 requiredsize = 2*outsize;
Christian Heimes72b710a2008-05-26 13:28:38 +00004558 if (_PyBytes_Resize(outobj, requiredsize))
Walter Dörwald827b0552007-05-12 13:23:53 +00004559 return -1;
Walter Dörwald827b0552007-05-12 13:23:53 +00004560 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004561}
4562
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the replacement was written to the output */
    enc_FAILED,     /* the character has no mapping; nothing written */
    enc_EXCEPTION   /* a Python exception is set */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004566/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00004567 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004568 space is available. Return a new reference to the object that
4569 was put in the output buffer, or Py_None, if the mapping was undefined
4570 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004571 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004572static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004573charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004574 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004575{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004576 PyObject *rep;
4577 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00004578 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004579
Christian Heimes90aa7642007-12-19 02:45:37 +00004580 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004581 int res = encoding_map_lookup(c, mapping);
4582 Py_ssize_t requiredsize = *outpos+1;
4583 if (res == -1)
4584 return enc_FAILED;
Guido van Rossum98297ee2007-11-06 21:34:58 +00004585 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004586 if (charmapencode_resize(outobj, outpos, requiredsize))
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004587 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00004588 outstart = PyBytes_AS_STRING(*outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004589 outstart[(*outpos)++] = (char)res;
4590 return enc_SUCCESS;
4591 }
4592
4593 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004594 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004595 return enc_EXCEPTION;
4596 else if (rep==Py_None) {
4597 Py_DECREF(rep);
4598 return enc_FAILED;
4599 } else {
Christian Heimes217cfd12007-12-02 14:31:20 +00004600 if (PyLong_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004601 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004602 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004603 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004604 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004605 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004606 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004607 outstart = PyBytes_AS_STRING(*outobj);
Christian Heimes217cfd12007-12-02 14:31:20 +00004608 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004609 }
4610 else {
Christian Heimes72b710a2008-05-26 13:28:38 +00004611 const char *repchars = PyBytes_AS_STRING(rep);
4612 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004613 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004614 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004615 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004616 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004617 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004618 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004619 outstart = PyBytes_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004620 memcpy(outstart + *outpos, repchars, repsize);
4621 *outpos += repsize;
4622 }
4623 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004624 Py_DECREF(rep);
4625 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004626}
4627
4628/* handle an error in PyUnicode_EncodeCharmap
4629 Return 0 on success, -1 on error */
4630static
4631int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004632 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004633 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004634 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004635 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004636{
4637 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004638 Py_ssize_t repsize;
4639 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004640 Py_UNICODE *uni2;
4641 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004642 Py_ssize_t collstartpos = *inpos;
4643 Py_ssize_t collendpos = *inpos+1;
4644 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004645 char *encoding = "charmap";
4646 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004647 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004648
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004649 /* find all unencodable characters */
4650 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004651 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00004652 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004653 int res = encoding_map_lookup(p[collendpos], mapping);
4654 if (res != -1)
4655 break;
4656 ++collendpos;
4657 continue;
4658 }
4659
4660 rep = charmapencode_lookup(p[collendpos], mapping);
4661 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004662 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004663 else if (rep!=Py_None) {
4664 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004665 break;
4666 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004667 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004668 ++collendpos;
4669 }
4670 /* cache callback name lookup
4671 * (if not done yet, i.e. it's the first error) */
4672 if (*known_errorHandler==-1) {
4673 if ((errors==NULL) || (!strcmp(errors, "strict")))
4674 *known_errorHandler = 1;
4675 else if (!strcmp(errors, "replace"))
4676 *known_errorHandler = 2;
4677 else if (!strcmp(errors, "ignore"))
4678 *known_errorHandler = 3;
4679 else if (!strcmp(errors, "xmlcharrefreplace"))
4680 *known_errorHandler = 4;
4681 else
4682 *known_errorHandler = 0;
4683 }
4684 switch (*known_errorHandler) {
4685 case 1: /* strict */
4686 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4687 return -1;
4688 case 2: /* replace */
4689 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4690 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004691 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004692 return -1;
4693 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004694 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004695 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4696 return -1;
4697 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004698 }
4699 /* fall through */
4700 case 3: /* ignore */
4701 *inpos = collendpos;
4702 break;
4703 case 4: /* xmlcharrefreplace */
4704 /* generate replacement (temporarily (mis)uses p) */
4705 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4706 char buffer[2+29+1+1];
4707 char *cp;
4708 sprintf(buffer, "&#%d;", (int)p[collpos]);
4709 for (cp = buffer; *cp; ++cp) {
4710 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004711 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004712 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004713 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004714 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4715 return -1;
4716 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004717 }
4718 }
4719 *inpos = collendpos;
4720 break;
4721 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004722 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004723 encoding, reason, p, size, exceptionObject,
4724 collstartpos, collendpos, &newpos);
4725 if (repunicode == NULL)
4726 return -1;
4727 /* generate replacement */
4728 repsize = PyUnicode_GET_SIZE(repunicode);
4729 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4730 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004731 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004732 return -1;
4733 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004734 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004735 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004736 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4737 return -1;
4738 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004739 }
4740 *inpos = newpos;
4741 Py_DECREF(repunicode);
4742 }
4743 return 0;
4744}
4745
Guido van Rossumd57fd912000-03-10 22:53:23 +00004746PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004747 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004748 PyObject *mapping,
4749 const char *errors)
4750{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004751 /* output object */
4752 PyObject *res = NULL;
4753 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004754 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004755 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004756 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004757 PyObject *errorHandler = NULL;
4758 PyObject *exc = NULL;
4759 /* the following variable is used for caching string comparisons
4760 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4761 * 3=ignore, 4=xmlcharrefreplace */
4762 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004763
4764 /* Default to Latin-1 */
4765 if (mapping == NULL)
4766 return PyUnicode_EncodeLatin1(p, size, errors);
4767
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004768 /* allocate enough for a simple encoding without
4769 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00004770 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004771 if (res == NULL)
4772 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004773 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004774 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004775
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004776 while (inpos<size) {
4777 /* try to encode it */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004778 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004779 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004780 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004781 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004782 if (charmap_encoding_error(p, size, &inpos, mapping,
4783 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004784 &known_errorHandler, &errorHandler, errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004785 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004786 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004787 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004788 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004789 else
4790 /* done with this character => adjust input position */
4791 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004792 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004793
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004794 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00004795 if (respos<PyBytes_GET_SIZE(res))
4796 _PyBytes_Resize(&res, respos);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004797
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004798 Py_XDECREF(exc);
4799 Py_XDECREF(errorHandler);
4800 return res;
4801
4802 onError:
4803 Py_XDECREF(res);
4804 Py_XDECREF(exc);
4805 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004806 return NULL;
4807}
4808
4809PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4810 PyObject *mapping)
4811{
4812 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4813 PyErr_BadArgument();
4814 return NULL;
4815 }
4816 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4817 PyUnicode_GET_SIZE(unicode),
4818 mapping,
4819 NULL);
4820}
4821
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004822/* create or adjust a UnicodeTranslateError */
4823static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004824 const Py_UNICODE *unicode, Py_ssize_t size,
4825 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004826 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004827{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004828 if (*exceptionObject == NULL) {
4829 *exceptionObject = PyUnicodeTranslateError_Create(
4830 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004831 }
4832 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004833 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4834 goto onError;
4835 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4836 goto onError;
4837 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4838 goto onError;
4839 return;
4840 onError:
4841 Py_DECREF(*exceptionObject);
4842 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004843 }
4844}
4845
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004846/* raises a UnicodeTranslateError */
4847static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004848 const Py_UNICODE *unicode, Py_ssize_t size,
4849 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004850 const char *reason)
4851{
4852 make_translate_exception(exceptionObject,
4853 unicode, size, startpos, endpos, reason);
4854 if (*exceptionObject != NULL)
4855 PyCodec_StrictErrors(*exceptionObject);
4856}
4857
4858/* error handling callback helper:
4859 build arguments, call the callback and check the arguments,
4860 put the result into newpos and return the replacement string, which
4861 has to be freed by the caller */
4862static PyObject *unicode_translate_call_errorhandler(const char *errors,
4863 PyObject **errorHandler,
4864 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004865 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4866 Py_ssize_t startpos, Py_ssize_t endpos,
4867 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004868{
Benjamin Peterson142957c2008-07-04 19:55:29 +00004869 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004870
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004871 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004872 PyObject *restuple;
4873 PyObject *resunicode;
4874
4875 if (*errorHandler == NULL) {
4876 *errorHandler = PyCodec_LookupError(errors);
4877 if (*errorHandler == NULL)
4878 return NULL;
4879 }
4880
4881 make_translate_exception(exceptionObject,
4882 unicode, size, startpos, endpos, reason);
4883 if (*exceptionObject == NULL)
4884 return NULL;
4885
4886 restuple = PyObject_CallFunctionObjArgs(
4887 *errorHandler, *exceptionObject, NULL);
4888 if (restuple == NULL)
4889 return NULL;
4890 if (!PyTuple_Check(restuple)) {
4891 PyErr_Format(PyExc_TypeError, &argparse[4]);
4892 Py_DECREF(restuple);
4893 return NULL;
4894 }
4895 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004896 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004897 Py_DECREF(restuple);
4898 return NULL;
4899 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004900 if (i_newpos<0)
4901 *newpos = size+i_newpos;
4902 else
4903 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004904 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004905 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004906 Py_DECREF(restuple);
4907 return NULL;
4908 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004909 Py_INCREF(resunicode);
4910 Py_DECREF(restuple);
4911 return resunicode;
4912}
4913
4914/* Lookup the character ch in the mapping and put the result in result,
4915 which must be decrefed by the caller.
4916 Return 0 on success, -1 on error */
4917static
4918int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4919{
Christian Heimes217cfd12007-12-02 14:31:20 +00004920 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004921 PyObject *x;
4922
4923 if (w == NULL)
4924 return -1;
4925 x = PyObject_GetItem(mapping, w);
4926 Py_DECREF(w);
4927 if (x == NULL) {
4928 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4929 /* No mapping found means: use 1:1 mapping. */
4930 PyErr_Clear();
4931 *result = NULL;
4932 return 0;
4933 } else
4934 return -1;
4935 }
4936 else if (x == Py_None) {
4937 *result = x;
4938 return 0;
4939 }
Christian Heimes217cfd12007-12-02 14:31:20 +00004940 else if (PyLong_Check(x)) {
4941 long value = PyLong_AS_LONG(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004942 long max = PyUnicode_GetMax();
4943 if (value < 0 || value > max) {
4944 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00004945 "character mapping must be in range(0x%x)", max+1);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004946 Py_DECREF(x);
4947 return -1;
4948 }
4949 *result = x;
4950 return 0;
4951 }
4952 else if (PyUnicode_Check(x)) {
4953 *result = x;
4954 return 0;
4955 }
4956 else {
4957 /* wrong return value */
4958 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00004959 "character mapping must return integer, None or str");
Walter Dörwald150523e2003-08-15 16:52:19 +00004960 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004961 return -1;
4962 }
4963}
4964/* ensure that *outobj is at least requiredsize characters long,
4965if not reallocate and adjust various state variables.
4966Return 0 on success, -1 on error */
4967static
Walter Dörwald4894c302003-10-24 14:25:28 +00004968int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004969 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004970{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004971 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004972 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004973 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004974 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004975 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004976 if (requiredsize < 2 * oldsize)
4977 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004978 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004979 return -1;
4980 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004981 }
4982 return 0;
4983}
4984/* lookup the character, put the result in the output string and adjust
4985 various state variables. Return a new reference to the object that
4986 was put in the output buffer in *result, or Py_None, if the mapping was
4987 undefined (in which case no character was written).
4988 The called must decref result.
4989 Return 0 on success, -1 on error. */
4990static
Walter Dörwald4894c302003-10-24 14:25:28 +00004991int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004992 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004993 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004994{
Walter Dörwald4894c302003-10-24 14:25:28 +00004995 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004996 return -1;
4997 if (*res==NULL) {
4998 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004999 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005000 }
5001 else if (*res==Py_None)
5002 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00005003 else if (PyLong_Check(*res)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005004 /* no overflow check, because we know that the space is enough */
Christian Heimes217cfd12007-12-02 14:31:20 +00005005 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005006 }
5007 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005008 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005009 if (repsize==1) {
5010 /* no overflow check, because we know that the space is enough */
5011 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5012 }
5013 else if (repsize!=0) {
5014 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005015 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00005016 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00005017 repsize - 1;
5018 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005019 return -1;
5020 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5021 *outp += repsize;
5022 }
5023 }
5024 else
5025 return -1;
5026 return 0;
5027}
5028
5029PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005030 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005031 PyObject *mapping,
5032 const char *errors)
5033{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005034 /* output object */
5035 PyObject *res = NULL;
5036 /* pointers to the beginning and end+1 of input */
5037 const Py_UNICODE *startp = p;
5038 const Py_UNICODE *endp = p + size;
5039 /* pointer into the output */
5040 Py_UNICODE *str;
5041 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005042 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005043 char *reason = "character maps to <undefined>";
5044 PyObject *errorHandler = NULL;
5045 PyObject *exc = NULL;
5046 /* the following variable is used for caching string comparisons
5047 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5048 * 3=ignore, 4=xmlcharrefreplace */
5049 int known_errorHandler = -1;
5050
Guido van Rossumd57fd912000-03-10 22:53:23 +00005051 if (mapping == NULL) {
5052 PyErr_BadArgument();
5053 return NULL;
5054 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005055
5056 /* allocate enough for a simple 1:1 translation without
5057 replacements, if we need more, we'll resize */
5058 res = PyUnicode_FromUnicode(NULL, size);
5059 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00005060 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005061 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005062 return res;
5063 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005064
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005065 while (p<endp) {
5066 /* try to encode it */
5067 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00005068 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005069 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005070 goto onError;
5071 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005072 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005073 if (x!=Py_None) /* it worked => adjust input pointer */
5074 ++p;
5075 else { /* untranslatable character */
5076 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005077 Py_ssize_t repsize;
5078 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005079 Py_UNICODE *uni2;
5080 /* startpos for collecting untranslatable chars */
5081 const Py_UNICODE *collstart = p;
5082 const Py_UNICODE *collend = p+1;
5083 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005084
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005085 /* find all untranslatable characters */
5086 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00005087 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005088 goto onError;
5089 Py_XDECREF(x);
5090 if (x!=Py_None)
5091 break;
5092 ++collend;
5093 }
5094 /* cache callback name lookup
5095 * (if not done yet, i.e. it's the first error) */
5096 if (known_errorHandler==-1) {
5097 if ((errors==NULL) || (!strcmp(errors, "strict")))
5098 known_errorHandler = 1;
5099 else if (!strcmp(errors, "replace"))
5100 known_errorHandler = 2;
5101 else if (!strcmp(errors, "ignore"))
5102 known_errorHandler = 3;
5103 else if (!strcmp(errors, "xmlcharrefreplace"))
5104 known_errorHandler = 4;
5105 else
5106 known_errorHandler = 0;
5107 }
5108 switch (known_errorHandler) {
5109 case 1: /* strict */
5110 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
5111 goto onError;
5112 case 2: /* replace */
5113 /* No need to check for space, this is a 1:1 replacement */
5114 for (coll = collstart; coll<collend; ++coll)
5115 *str++ = '?';
5116 /* fall through */
5117 case 3: /* ignore */
5118 p = collend;
5119 break;
5120 case 4: /* xmlcharrefreplace */
5121 /* generate replacement (temporarily (mis)uses p) */
5122 for (p = collstart; p < collend; ++p) {
5123 char buffer[2+29+1+1];
5124 char *cp;
5125 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00005126 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005127 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5128 goto onError;
5129 for (cp = buffer; *cp; ++cp)
5130 *str++ = *cp;
5131 }
5132 p = collend;
5133 break;
5134 default:
5135 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5136 reason, startp, size, &exc,
5137 collstart-startp, collend-startp, &newpos);
5138 if (repunicode == NULL)
5139 goto onError;
5140 /* generate replacement */
5141 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00005142 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005143 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5144 Py_DECREF(repunicode);
5145 goto onError;
5146 }
5147 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5148 *str++ = *uni2;
5149 p = startp + newpos;
5150 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005151 }
5152 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005153 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005154 /* Resize if we allocated to much */
5155 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005156 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00005157 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005158 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005159 }
5160 Py_XDECREF(exc);
5161 Py_XDECREF(errorHandler);
5162 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005163
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005164 onError:
5165 Py_XDECREF(res);
5166 Py_XDECREF(exc);
5167 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005168 return NULL;
5169}
5170
5171PyObject *PyUnicode_Translate(PyObject *str,
5172 PyObject *mapping,
5173 const char *errors)
5174{
5175 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005176
Guido van Rossumd57fd912000-03-10 22:53:23 +00005177 str = PyUnicode_FromObject(str);
5178 if (str == NULL)
5179 goto onError;
5180 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
5181 PyUnicode_GET_SIZE(str),
5182 mapping,
5183 errors);
5184 Py_DECREF(str);
5185 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005186
Guido van Rossumd57fd912000-03-10 22:53:23 +00005187 onError:
5188 Py_XDECREF(str);
5189 return NULL;
5190}
Tim Petersced69f82003-09-16 20:30:58 +00005191
Guido van Rossum9e896b32000-04-05 20:11:21 +00005192/* --- Decimal Encoder ---------------------------------------------------- */
5193
5194int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005195 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00005196 char *output,
5197 const char *errors)
5198{
5199 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005200 PyObject *errorHandler = NULL;
5201 PyObject *exc = NULL;
5202 const char *encoding = "decimal";
5203 const char *reason = "invalid decimal Unicode string";
5204 /* the following variable is used for caching string comparisons
5205 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5206 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005207
5208 if (output == NULL) {
5209 PyErr_BadArgument();
5210 return -1;
5211 }
5212
5213 p = s;
5214 end = s + length;
5215 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005216 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005217 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005218 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005219 Py_ssize_t repsize;
5220 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005221 Py_UNICODE *uni2;
5222 Py_UNICODE *collstart;
5223 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005224
Guido van Rossum9e896b32000-04-05 20:11:21 +00005225 if (Py_UNICODE_ISSPACE(ch)) {
5226 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005227 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005228 continue;
5229 }
5230 decimal = Py_UNICODE_TODECIMAL(ch);
5231 if (decimal >= 0) {
5232 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005233 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005234 continue;
5235 }
Guido van Rossumba477042000-04-06 18:18:10 +00005236 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00005237 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005238 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005239 continue;
5240 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005241 /* All other characters are considered unencodable */
5242 collstart = p;
5243 collend = p+1;
5244 while (collend < end) {
5245 if ((0 < *collend && *collend < 256) ||
5246 !Py_UNICODE_ISSPACE(*collend) ||
5247 Py_UNICODE_TODECIMAL(*collend))
5248 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005249 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005250 /* cache callback name lookup
5251 * (if not done yet, i.e. it's the first error) */
5252 if (known_errorHandler==-1) {
5253 if ((errors==NULL) || (!strcmp(errors, "strict")))
5254 known_errorHandler = 1;
5255 else if (!strcmp(errors, "replace"))
5256 known_errorHandler = 2;
5257 else if (!strcmp(errors, "ignore"))
5258 known_errorHandler = 3;
5259 else if (!strcmp(errors, "xmlcharrefreplace"))
5260 known_errorHandler = 4;
5261 else
5262 known_errorHandler = 0;
5263 }
5264 switch (known_errorHandler) {
5265 case 1: /* strict */
5266 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5267 goto onError;
5268 case 2: /* replace */
5269 for (p = collstart; p < collend; ++p)
5270 *output++ = '?';
5271 /* fall through */
5272 case 3: /* ignore */
5273 p = collend;
5274 break;
5275 case 4: /* xmlcharrefreplace */
5276 /* generate replacement (temporarily (mis)uses p) */
5277 for (p = collstart; p < collend; ++p)
5278 output += sprintf(output, "&#%d;", (int)*p);
5279 p = collend;
5280 break;
5281 default:
5282 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5283 encoding, reason, s, length, &exc,
5284 collstart-s, collend-s, &newpos);
5285 if (repunicode == NULL)
5286 goto onError;
5287 /* generate replacement */
5288 repsize = PyUnicode_GET_SIZE(repunicode);
5289 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5290 Py_UNICODE ch = *uni2;
5291 if (Py_UNICODE_ISSPACE(ch))
5292 *output++ = ' ';
5293 else {
5294 decimal = Py_UNICODE_TODECIMAL(ch);
5295 if (decimal >= 0)
5296 *output++ = '0' + decimal;
5297 else if (0 < ch && ch < 256)
5298 *output++ = (char)ch;
5299 else {
5300 Py_DECREF(repunicode);
5301 raise_encode_exception(&exc, encoding,
5302 s, length, collstart-s, collend-s, reason);
5303 goto onError;
5304 }
5305 }
5306 }
5307 p = s + newpos;
5308 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005309 }
5310 }
5311 /* 0-terminate the output string */
5312 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005313 Py_XDECREF(exc);
5314 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005315 return 0;
5316
5317 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005318 Py_XDECREF(exc);
5319 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005320 return -1;
5321}
5322
Guido van Rossumd57fd912000-03-10 22:53:23 +00005323/* --- Helpers ------------------------------------------------------------ */
5324
Eric Smith8c663262007-08-25 02:26:07 +00005325#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005326#include "stringlib/fastsearch.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005327#include "stringlib/count.h"
Christian Heimes9cd17752007-11-18 19:35:23 +00005328/* Include _ParseTupleFinds from find.h */
5329#define FROM_UNICODE
Thomas Wouters477c8d52006-05-27 19:21:47 +00005330#include "stringlib/find.h"
5331#include "stringlib/partition.h"
5332
Eric Smith5807c412008-05-11 21:00:57 +00005333#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
5334#include "stringlib/localeutil.h"
5335
/* Helper macro to fix up start/end slice values in place.
 *
 * Operates on lvalues named `start` and `end` that must exist in the
 * enclosing scope, and on `obj`, a pointer to something with a `length`
 * member:
 *   - a negative start has length added to it and is then clamped to 0;
 *   - an end greater than length is clamped to length;
 *   - a negative end has length added to it and is then clamped to 0.
 * start is not clamped against length and may end up greater than end;
 * callers have to cope with that.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to
 * exactly one statement and can safely be used as the body of an
 * if/else.  Invoke it with a trailing semicolon.
 */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
5348
Martin v. Löwis18e16552006-02-15 17:27:45 +00005349Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005350 PyObject *substr,
5351 Py_ssize_t start,
5352 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005353{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005354 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005355 PyUnicodeObject* str_obj;
5356 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005357
Thomas Wouters477c8d52006-05-27 19:21:47 +00005358 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5359 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005360 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005361 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5362 if (!sub_obj) {
5363 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005364 return -1;
5365 }
Tim Petersced69f82003-09-16 20:30:58 +00005366
Thomas Wouters477c8d52006-05-27 19:21:47 +00005367 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005368
Thomas Wouters477c8d52006-05-27 19:21:47 +00005369 result = stringlib_count(
5370 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5371 );
5372
5373 Py_DECREF(sub_obj);
5374 Py_DECREF(str_obj);
5375
Guido van Rossumd57fd912000-03-10 22:53:23 +00005376 return result;
5377}
5378
Martin v. Löwis18e16552006-02-15 17:27:45 +00005379Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005380 PyObject *sub,
5381 Py_ssize_t start,
5382 Py_ssize_t end,
5383 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005384{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005385 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005386
Guido van Rossumd57fd912000-03-10 22:53:23 +00005387 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005388 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005389 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005390 sub = PyUnicode_FromObject(sub);
5391 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00005392 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005393 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005394 }
Tim Petersced69f82003-09-16 20:30:58 +00005395
Thomas Wouters477c8d52006-05-27 19:21:47 +00005396 if (direction > 0)
5397 result = stringlib_find_slice(
5398 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5399 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5400 start, end
5401 );
5402 else
5403 result = stringlib_rfind_slice(
5404 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5405 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5406 start, end
5407 );
5408
Guido van Rossumd57fd912000-03-10 22:53:23 +00005409 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005410 Py_DECREF(sub);
5411
Guido van Rossumd57fd912000-03-10 22:53:23 +00005412 return result;
5413}
5414
Tim Petersced69f82003-09-16 20:30:58 +00005415static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005416int tailmatch(PyUnicodeObject *self,
5417 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005418 Py_ssize_t start,
5419 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005420 int direction)
5421{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005422 if (substring->length == 0)
5423 return 1;
5424
Thomas Wouters477c8d52006-05-27 19:21:47 +00005425 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005426
5427 end -= substring->length;
5428 if (end < start)
5429 return 0;
5430
5431 if (direction > 0) {
5432 if (Py_UNICODE_MATCH(self, end, substring))
5433 return 1;
5434 } else {
5435 if (Py_UNICODE_MATCH(self, start, substring))
5436 return 1;
5437 }
5438
5439 return 0;
5440}
5441
Martin v. Löwis18e16552006-02-15 17:27:45 +00005442Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005443 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005444 Py_ssize_t start,
5445 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005446 int direction)
5447{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005448 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005449
Guido van Rossumd57fd912000-03-10 22:53:23 +00005450 str = PyUnicode_FromObject(str);
5451 if (str == NULL)
5452 return -1;
5453 substr = PyUnicode_FromObject(substr);
5454 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005455 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005456 return -1;
5457 }
Tim Petersced69f82003-09-16 20:30:58 +00005458
Guido van Rossumd57fd912000-03-10 22:53:23 +00005459 result = tailmatch((PyUnicodeObject *)str,
5460 (PyUnicodeObject *)substr,
5461 start, end, direction);
5462 Py_DECREF(str);
5463 Py_DECREF(substr);
5464 return result;
5465}
5466
Guido van Rossumd57fd912000-03-10 22:53:23 +00005467/* Apply fixfct filter to the Unicode object self and return a
5468 reference to the modified object */
5469
Tim Petersced69f82003-09-16 20:30:58 +00005470static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005471PyObject *fixup(PyUnicodeObject *self,
5472 int (*fixfct)(PyUnicodeObject *s))
5473{
5474
5475 PyUnicodeObject *u;
5476
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005477 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005478 if (u == NULL)
5479 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005480
5481 Py_UNICODE_COPY(u->str, self->str, self->length);
5482
Tim Peters7a29bd52001-09-12 03:03:31 +00005483 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005484 /* fixfct should return TRUE if it modified the buffer. If
5485 FALSE, return a reference to the original buffer instead
5486 (to save space, not time) */
5487 Py_INCREF(self);
5488 Py_DECREF(u);
5489 return (PyObject*) self;
5490 }
5491 return (PyObject*) u;
5492}
5493
Tim Petersced69f82003-09-16 20:30:58 +00005494static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005495int fixupper(PyUnicodeObject *self)
5496{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005497 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005498 Py_UNICODE *s = self->str;
5499 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005500
Guido van Rossumd57fd912000-03-10 22:53:23 +00005501 while (len-- > 0) {
5502 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005503
Guido van Rossumd57fd912000-03-10 22:53:23 +00005504 ch = Py_UNICODE_TOUPPER(*s);
5505 if (ch != *s) {
5506 status = 1;
5507 *s = ch;
5508 }
5509 s++;
5510 }
5511
5512 return status;
5513}
5514
Tim Petersced69f82003-09-16 20:30:58 +00005515static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005516int fixlower(PyUnicodeObject *self)
5517{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005518 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005519 Py_UNICODE *s = self->str;
5520 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005521
Guido van Rossumd57fd912000-03-10 22:53:23 +00005522 while (len-- > 0) {
5523 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005524
Guido van Rossumd57fd912000-03-10 22:53:23 +00005525 ch = Py_UNICODE_TOLOWER(*s);
5526 if (ch != *s) {
5527 status = 1;
5528 *s = ch;
5529 }
5530 s++;
5531 }
5532
5533 return status;
5534}
5535
Tim Petersced69f82003-09-16 20:30:58 +00005536static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005537int fixswapcase(PyUnicodeObject *self)
5538{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005539 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005540 Py_UNICODE *s = self->str;
5541 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005542
Guido van Rossumd57fd912000-03-10 22:53:23 +00005543 while (len-- > 0) {
5544 if (Py_UNICODE_ISUPPER(*s)) {
5545 *s = Py_UNICODE_TOLOWER(*s);
5546 status = 1;
5547 } else if (Py_UNICODE_ISLOWER(*s)) {
5548 *s = Py_UNICODE_TOUPPER(*s);
5549 status = 1;
5550 }
5551 s++;
5552 }
5553
5554 return status;
5555}
5556
Tim Petersced69f82003-09-16 20:30:58 +00005557static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005558int fixcapitalize(PyUnicodeObject *self)
5559{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005560 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005561 Py_UNICODE *s = self->str;
5562 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005563
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005564 if (len == 0)
5565 return 0;
5566 if (Py_UNICODE_ISLOWER(*s)) {
5567 *s = Py_UNICODE_TOUPPER(*s);
5568 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005569 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005570 s++;
5571 while (--len > 0) {
5572 if (Py_UNICODE_ISUPPER(*s)) {
5573 *s = Py_UNICODE_TOLOWER(*s);
5574 status = 1;
5575 }
5576 s++;
5577 }
5578 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005579}
5580
5581static
5582int fixtitle(PyUnicodeObject *self)
5583{
5584 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5585 register Py_UNICODE *e;
5586 int previous_is_cased;
5587
5588 /* Shortcut for single character strings */
5589 if (PyUnicode_GET_SIZE(self) == 1) {
5590 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5591 if (*p != ch) {
5592 *p = ch;
5593 return 1;
5594 }
5595 else
5596 return 0;
5597 }
Tim Petersced69f82003-09-16 20:30:58 +00005598
Guido van Rossumd57fd912000-03-10 22:53:23 +00005599 e = p + PyUnicode_GET_SIZE(self);
5600 previous_is_cased = 0;
5601 for (; p < e; p++) {
5602 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005603
Guido van Rossumd57fd912000-03-10 22:53:23 +00005604 if (previous_is_cased)
5605 *p = Py_UNICODE_TOLOWER(ch);
5606 else
5607 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005608
5609 if (Py_UNICODE_ISLOWER(ch) ||
5610 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005611 Py_UNICODE_ISTITLE(ch))
5612 previous_is_cased = 1;
5613 else
5614 previous_is_cased = 0;
5615 }
5616 return 1;
5617}
5618
Tim Peters8ce9f162004-08-27 01:49:32 +00005619PyObject *
5620PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005621{
Skip Montanaro6543b452004-09-16 03:28:13 +00005622 const Py_UNICODE blank = ' ';
5623 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005624 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005625 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00005626 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5627 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005628 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
5629 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00005630 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005631 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005632
Tim Peters05eba1f2004-08-27 21:32:02 +00005633 fseq = PySequence_Fast(seq, "");
5634 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005635 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005636 }
5637
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005638 /* NOTE: the following code can't call back into Python code,
5639 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00005640 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005641
Tim Peters05eba1f2004-08-27 21:32:02 +00005642 seqlen = PySequence_Fast_GET_SIZE(fseq);
5643 /* If empty sequence, return u"". */
5644 if (seqlen == 0) {
5645 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5646 goto Done;
5647 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005648 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005649 /* If singleton sequence with an exact Unicode, return that. */
5650 if (seqlen == 1) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005651 item = items[0];
Tim Peters05eba1f2004-08-27 21:32:02 +00005652 if (PyUnicode_CheckExact(item)) {
5653 Py_INCREF(item);
5654 res = (PyUnicodeObject *)item;
5655 goto Done;
5656 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005657 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005658 else {
5659 /* Set up sep and seplen */
5660 if (separator == NULL) {
5661 sep = &blank;
5662 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005663 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005664 else {
5665 if (!PyUnicode_Check(separator)) {
5666 PyErr_Format(PyExc_TypeError,
5667 "separator: expected str instance,"
5668 " %.80s found",
5669 Py_TYPE(separator)->tp_name);
5670 goto onError;
5671 }
5672 sep = PyUnicode_AS_UNICODE(separator);
5673 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005674 }
5675 }
5676
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005677 /* There are at least two things to join, or else we have a subclass
5678 * of str in the sequence.
5679 * Do a pre-pass to figure out the total amount of space we'll
5680 * need (sz), and see whether all argument are strings.
5681 */
5682 sz = 0;
5683 for (i = 0; i < seqlen; i++) {
5684 const Py_ssize_t old_sz = sz;
5685 item = items[i];
Guido van Rossum98297ee2007-11-06 21:34:58 +00005686 if (!PyUnicode_Check(item)) {
5687 PyErr_Format(PyExc_TypeError,
5688 "sequence item %zd: expected str instance,"
5689 " %.80s found",
Christian Heimes90aa7642007-12-19 02:45:37 +00005690 i, Py_TYPE(item)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005691 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005692 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005693 sz += PyUnicode_GET_SIZE(item);
5694 if (i != 0)
5695 sz += seplen;
5696 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
5697 PyErr_SetString(PyExc_OverflowError,
5698 "join() result is too long for a Python string");
5699 goto onError;
5700 }
5701 }
Tim Petersced69f82003-09-16 20:30:58 +00005702
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005703 res = _PyUnicode_New(sz);
5704 if (res == NULL)
5705 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00005706
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005707 /* Catenate everything. */
5708 res_p = PyUnicode_AS_UNICODE(res);
5709 for (i = 0; i < seqlen; ++i) {
5710 Py_ssize_t itemlen;
5711 item = items[i];
5712 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005713 /* Copy item, and maybe the separator. */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005714 if (i) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005715 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005716 res_p += seplen;
5717 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005718 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5719 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00005720 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005721
Tim Peters8ce9f162004-08-27 01:49:32 +00005722 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00005723 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005724 return (PyObject *)res;
5725
5726 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00005727 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005728 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005729 return NULL;
5730}
5731
Tim Petersced69f82003-09-16 20:30:58 +00005732static
5733PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005734 Py_ssize_t left,
5735 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005736 Py_UNICODE fill)
5737{
5738 PyUnicodeObject *u;
5739
5740 if (left < 0)
5741 left = 0;
5742 if (right < 0)
5743 right = 0;
5744
Tim Peters7a29bd52001-09-12 03:03:31 +00005745 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005746 Py_INCREF(self);
5747 return self;
5748 }
5749
5750 u = _PyUnicode_New(left + self->length + right);
5751 if (u) {
5752 if (left)
5753 Py_UNICODE_FILL(u->str, fill, left);
5754 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5755 if (right)
5756 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5757 }
5758
5759 return u;
5760}
5761
/* Append data[left:right] as a new unicode object to `list`.
 *
 * Relies on names from the enclosing function: a PyObject *str scratch
 * variable, the PyObject *list being built, and an onError label that
 * is jumped to when creating or appending the slice fails.  The
 * temporary reference held in str is always released.
 *
 * Wrapped in do { } while (0) so the macro expands to exactly one
 * statement; invoke it with a trailing semicolon.
 */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
5772
5773static
5774PyObject *split_whitespace(PyUnicodeObject *self,
5775 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005776 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005777{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005778 register Py_ssize_t i;
5779 register Py_ssize_t j;
5780 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005781 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005782 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005783
5784 for (i = j = 0; i < len; ) {
5785 /* find a token */
Christian Heimes190d79e2008-01-30 11:58:22 +00005786 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005787 i++;
5788 j = i;
Christian Heimes190d79e2008-01-30 11:58:22 +00005789 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005790 i++;
5791 if (j < i) {
5792 if (maxcount-- <= 0)
5793 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005794 SPLIT_APPEND(buf, j, i);
5795 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005796 i++;
5797 j = i;
5798 }
5799 }
5800 if (j < len) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005801 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005802 }
5803 return list;
5804
5805 onError:
5806 Py_DECREF(list);
5807 return NULL;
5808}
5809
5810PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005811 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005812{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005813 register Py_ssize_t i;
5814 register Py_ssize_t j;
5815 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005816 PyObject *list;
5817 PyObject *str;
5818 Py_UNICODE *data;
5819
5820 string = PyUnicode_FromObject(string);
5821 if (string == NULL)
5822 return NULL;
5823 data = PyUnicode_AS_UNICODE(string);
5824 len = PyUnicode_GET_SIZE(string);
5825
Guido van Rossumd57fd912000-03-10 22:53:23 +00005826 list = PyList_New(0);
5827 if (!list)
5828 goto onError;
5829
5830 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005831 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005832
Guido van Rossumd57fd912000-03-10 22:53:23 +00005833 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005834 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005835 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005836
5837 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005838 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005839 if (i < len) {
5840 if (data[i] == '\r' && i + 1 < len &&
5841 data[i+1] == '\n')
5842 i += 2;
5843 else
5844 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005845 if (keepends)
5846 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005847 }
Guido van Rossum86662912000-04-11 15:38:46 +00005848 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005849 j = i;
5850 }
5851 if (j < len) {
5852 SPLIT_APPEND(data, j, len);
5853 }
5854
5855 Py_DECREF(string);
5856 return list;
5857
5858 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005859 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005860 Py_DECREF(string);
5861 return NULL;
5862}
5863
Tim Petersced69f82003-09-16 20:30:58 +00005864static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005865PyObject *split_char(PyUnicodeObject *self,
5866 PyObject *list,
5867 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005868 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005869{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005870 register Py_ssize_t i;
5871 register Py_ssize_t j;
5872 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005873 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005874 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005875
5876 for (i = j = 0; i < len; ) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005877 if (buf[i] == ch) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005878 if (maxcount-- <= 0)
5879 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005880 SPLIT_APPEND(buf, j, i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005881 i = j = i + 1;
5882 } else
5883 i++;
5884 }
5885 if (j <= len) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005886 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005887 }
5888 return list;
5889
5890 onError:
5891 Py_DECREF(list);
5892 return NULL;
5893}
5894
Tim Petersced69f82003-09-16 20:30:58 +00005895static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005896PyObject *split_substring(PyUnicodeObject *self,
5897 PyObject *list,
5898 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005899 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005900{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005901 register Py_ssize_t i;
5902 register Py_ssize_t j;
5903 Py_ssize_t len = self->length;
5904 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005905 PyObject *str;
5906
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005907 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005908 if (Py_UNICODE_MATCH(self, i, substring)) {
5909 if (maxcount-- <= 0)
5910 break;
5911 SPLIT_APPEND(self->str, j, i);
5912 i = j = i + sublen;
5913 } else
5914 i++;
5915 }
5916 if (j <= len) {
5917 SPLIT_APPEND(self->str, j, len);
5918 }
5919 return list;
5920
5921 onError:
5922 Py_DECREF(list);
5923 return NULL;
5924}
5925
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005926static
5927PyObject *rsplit_whitespace(PyUnicodeObject *self,
5928 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005929 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005930{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005931 register Py_ssize_t i;
5932 register Py_ssize_t j;
5933 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005934 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005935 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005936
5937 for (i = j = len - 1; i >= 0; ) {
5938 /* find a token */
Christian Heimes190d79e2008-01-30 11:58:22 +00005939 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005940 i--;
5941 j = i;
Christian Heimes190d79e2008-01-30 11:58:22 +00005942 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005943 i--;
5944 if (j > i) {
5945 if (maxcount-- <= 0)
5946 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005947 SPLIT_APPEND(buf, i + 1, j + 1);
5948 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005949 i--;
5950 j = i;
5951 }
5952 }
5953 if (j >= 0) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005954 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005955 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005956 if (PyList_Reverse(list) < 0)
5957 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005958 return list;
5959
5960 onError:
5961 Py_DECREF(list);
5962 return NULL;
5963}
5964
5965static
5966PyObject *rsplit_char(PyUnicodeObject *self,
5967 PyObject *list,
5968 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005969 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005970{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005971 register Py_ssize_t i;
5972 register Py_ssize_t j;
5973 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005974 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005975 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005976
5977 for (i = j = len - 1; i >= 0; ) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005978 if (buf[i] == ch) {
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005979 if (maxcount-- <= 0)
5980 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005981 SPLIT_APPEND(buf, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005982 j = i = i - 1;
5983 } else
5984 i--;
5985 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005986 if (j >= -1) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005987 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005988 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005989 if (PyList_Reverse(list) < 0)
5990 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005991 return list;
5992
5993 onError:
5994 Py_DECREF(list);
5995 return NULL;
5996}
5997
5998static
5999PyObject *rsplit_substring(PyUnicodeObject *self,
6000 PyObject *list,
6001 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006002 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006003{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006004 register Py_ssize_t i;
6005 register Py_ssize_t j;
6006 Py_ssize_t len = self->length;
6007 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006008 PyObject *str;
6009
6010 for (i = len - sublen, j = len; i >= 0; ) {
6011 if (Py_UNICODE_MATCH(self, i, substring)) {
6012 if (maxcount-- <= 0)
6013 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006014 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006015 j = i;
6016 i -= sublen;
6017 } else
6018 i--;
6019 }
6020 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006021 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006022 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006023 if (PyList_Reverse(list) < 0)
6024 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006025 return list;
6026
6027 onError:
6028 Py_DECREF(list);
6029 return NULL;
6030}
6031
Guido van Rossumd57fd912000-03-10 22:53:23 +00006032#undef SPLIT_APPEND
6033
6034static
6035PyObject *split(PyUnicodeObject *self,
6036 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006037 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006038{
6039 PyObject *list;
6040
6041 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006042 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006043
6044 list = PyList_New(0);
6045 if (!list)
6046 return NULL;
6047
6048 if (substring == NULL)
6049 return split_whitespace(self,list,maxcount);
6050
6051 else if (substring->length == 1)
6052 return split_char(self,list,substring->str[0],maxcount);
6053
6054 else if (substring->length == 0) {
6055 Py_DECREF(list);
6056 PyErr_SetString(PyExc_ValueError, "empty separator");
6057 return NULL;
6058 }
6059 else
6060 return split_substring(self,list,substring,maxcount);
6061}
6062
Tim Petersced69f82003-09-16 20:30:58 +00006063static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006064PyObject *rsplit(PyUnicodeObject *self,
6065 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006066 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006067{
6068 PyObject *list;
6069
6070 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006071 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006072
6073 list = PyList_New(0);
6074 if (!list)
6075 return NULL;
6076
6077 if (substring == NULL)
6078 return rsplit_whitespace(self,list,maxcount);
6079
6080 else if (substring->length == 1)
6081 return rsplit_char(self,list,substring->str[0],maxcount);
6082
6083 else if (substring->length == 0) {
6084 Py_DECREF(list);
6085 PyErr_SetString(PyExc_ValueError, "empty separator");
6086 return NULL;
6087 }
6088 else
6089 return rsplit_substring(self,list,substring,maxcount);
6090}
6091
6092static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006093PyObject *replace(PyUnicodeObject *self,
6094 PyUnicodeObject *str1,
6095 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006096 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006097{
6098 PyUnicodeObject *u;
6099
6100 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006101 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006102
Thomas Wouters477c8d52006-05-27 19:21:47 +00006103 if (str1->length == str2->length) {
6104 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006105 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006106 if (str1->length == 1) {
6107 /* replace characters */
6108 Py_UNICODE u1, u2;
6109 if (!findchar(self->str, self->length, str1->str[0]))
6110 goto nothing;
6111 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6112 if (!u)
6113 return NULL;
6114 Py_UNICODE_COPY(u->str, self->str, self->length);
6115 u1 = str1->str[0];
6116 u2 = str2->str[0];
6117 for (i = 0; i < u->length; i++)
6118 if (u->str[i] == u1) {
6119 if (--maxcount < 0)
6120 break;
6121 u->str[i] = u2;
6122 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006123 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006124 i = fastsearch(
6125 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00006126 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006127 if (i < 0)
6128 goto nothing;
6129 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6130 if (!u)
6131 return NULL;
6132 Py_UNICODE_COPY(u->str, self->str, self->length);
6133 while (i <= self->length - str1->length)
6134 if (Py_UNICODE_MATCH(self, i, str1)) {
6135 if (--maxcount < 0)
6136 break;
6137 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6138 i += str1->length;
6139 } else
6140 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006141 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006142 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006143
6144 Py_ssize_t n, i, j, e;
6145 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006146 Py_UNICODE *p;
6147
6148 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006149 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006150 if (n > maxcount)
6151 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006152 if (n == 0)
6153 goto nothing;
6154 /* new_size = self->length + n * (str2->length - str1->length)); */
6155 delta = (str2->length - str1->length);
6156 if (delta == 0) {
6157 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006158 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006159 product = n * (str2->length - str1->length);
6160 if ((product / (str2->length - str1->length)) != n) {
6161 PyErr_SetString(PyExc_OverflowError,
6162 "replace string is too long");
6163 return NULL;
6164 }
6165 new_size = self->length + product;
6166 if (new_size < 0) {
6167 PyErr_SetString(PyExc_OverflowError,
6168 "replace string is too long");
6169 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006170 }
6171 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006172 u = _PyUnicode_New(new_size);
6173 if (!u)
6174 return NULL;
6175 i = 0;
6176 p = u->str;
6177 e = self->length - str1->length;
6178 if (str1->length > 0) {
6179 while (n-- > 0) {
6180 /* look for next match */
6181 j = i;
6182 while (j <= e) {
6183 if (Py_UNICODE_MATCH(self, j, str1))
6184 break;
6185 j++;
6186 }
6187 if (j > i) {
6188 if (j > e)
6189 break;
6190 /* copy unchanged part [i:j] */
6191 Py_UNICODE_COPY(p, self->str+i, j-i);
6192 p += j - i;
6193 }
6194 /* copy substitution string */
6195 if (str2->length > 0) {
6196 Py_UNICODE_COPY(p, str2->str, str2->length);
6197 p += str2->length;
6198 }
6199 i = j + str1->length;
6200 }
6201 if (i < self->length)
6202 /* copy tail [i:] */
6203 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6204 } else {
6205 /* interleave */
6206 while (n > 0) {
6207 Py_UNICODE_COPY(p, str2->str, str2->length);
6208 p += str2->length;
6209 if (--n <= 0)
6210 break;
6211 *p++ = self->str[i++];
6212 }
6213 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6214 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006215 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006216 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006217
6218nothing:
6219 /* nothing to replace; return original string (when possible) */
6220 if (PyUnicode_CheckExact(self)) {
6221 Py_INCREF(self);
6222 return (PyObject *) self;
6223 }
6224 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006225}
6226
6227/* --- Unicode Object Methods --------------------------------------------- */
6228
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006229PyDoc_STRVAR(title__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006230"S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006231\n\
6232Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006233characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006234
6235static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006236unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006237{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006238 return fixup(self, fixtitle);
6239}
6240
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006241PyDoc_STRVAR(capitalize__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006242"S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006243\n\
6244Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006245have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006246
6247static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006248unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006249{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006250 return fixup(self, fixcapitalize);
6251}
6252
6253#if 0
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006254PyDoc_STRVAR(capwords__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006255"S.capwords() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006256\n\
6257Apply .capitalize() to all words in S and return the result with\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006258normalized whitespace (all whitespace strings are replaced by ' ').");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006259
6260static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006261unicode_capwords(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006262{
6263 PyObject *list;
6264 PyObject *item;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006265 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006266
Guido van Rossumd57fd912000-03-10 22:53:23 +00006267 /* Split into words */
6268 list = split(self, NULL, -1);
6269 if (!list)
6270 return NULL;
6271
6272 /* Capitalize each word */
6273 for (i = 0; i < PyList_GET_SIZE(list); i++) {
6274 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
6275 fixcapitalize);
6276 if (item == NULL)
6277 goto onError;
6278 Py_DECREF(PyList_GET_ITEM(list, i));
6279 PyList_SET_ITEM(list, i, item);
6280 }
6281
6282 /* Join the words to form a new string */
6283 item = PyUnicode_Join(NULL, list);
6284
6285onError:
6286 Py_DECREF(list);
6287 return (PyObject *)item;
6288}
6289#endif
6290
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006291/* Argument converter. Coerces to a single unicode character */
6292
6293static int
6294convert_uc(PyObject *obj, void *addr)
6295{
6296 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6297 PyObject *uniobj;
6298 Py_UNICODE *unistr;
6299
6300 uniobj = PyUnicode_FromObject(obj);
6301 if (uniobj == NULL) {
6302 PyErr_SetString(PyExc_TypeError,
6303 "The fill character cannot be converted to Unicode");
6304 return 0;
6305 }
6306 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6307 PyErr_SetString(PyExc_TypeError,
6308 "The fill character must be exactly one character long");
6309 Py_DECREF(uniobj);
6310 return 0;
6311 }
6312 unistr = PyUnicode_AS_UNICODE(uniobj);
6313 *fillcharloc = unistr[0];
6314 Py_DECREF(uniobj);
6315 return 1;
6316}
6317
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006318PyDoc_STRVAR(center__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006319"S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006320\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006321Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006322done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006323
6324static PyObject *
6325unicode_center(PyUnicodeObject *self, PyObject *args)
6326{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006327 Py_ssize_t marg, left;
6328 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006329 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006330
Thomas Woutersde017742006-02-16 19:34:37 +00006331 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006332 return NULL;
6333
Tim Peters7a29bd52001-09-12 03:03:31 +00006334 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006335 Py_INCREF(self);
6336 return (PyObject*) self;
6337 }
6338
6339 marg = width - self->length;
6340 left = marg / 2 + (marg & width & 1);
6341
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006342 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006343}
6344
Marc-André Lemburge5034372000-08-08 08:04:29 +00006345#if 0
6346
6347/* This code should go into some future Unicode collation support
6348 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006349 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006350
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006351/* speedy UTF-16 code point order comparison */
6352/* gleaned from: */
6353/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6354
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006355static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006356{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006357 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006358 0, 0, 0, 0, 0, 0, 0, 0,
6359 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006360 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006361};
6362
Guido van Rossumd57fd912000-03-10 22:53:23 +00006363static int
6364unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6365{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006366 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006367
Guido van Rossumd57fd912000-03-10 22:53:23 +00006368 Py_UNICODE *s1 = str1->str;
6369 Py_UNICODE *s2 = str2->str;
6370
6371 len1 = str1->length;
6372 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006373
Guido van Rossumd57fd912000-03-10 22:53:23 +00006374 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006375 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006376
6377 c1 = *s1++;
6378 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006379
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006380 if (c1 > (1<<11) * 26)
6381 c1 += utf16Fixup[c1>>11];
6382 if (c2 > (1<<11) * 26)
6383 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006384 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006385
6386 if (c1 != c2)
6387 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006388
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006389 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006390 }
6391
6392 return (len1 < len2) ? -1 : (len1 != len2);
6393}
6394
Marc-André Lemburge5034372000-08-08 08:04:29 +00006395#else
6396
6397static int
6398unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6399{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006400 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006401
6402 Py_UNICODE *s1 = str1->str;
6403 Py_UNICODE *s2 = str2->str;
6404
6405 len1 = str1->length;
6406 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006407
Marc-André Lemburge5034372000-08-08 08:04:29 +00006408 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006409 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006410
Fredrik Lundh45714e92001-06-26 16:39:36 +00006411 c1 = *s1++;
6412 c2 = *s2++;
6413
6414 if (c1 != c2)
6415 return (c1 < c2) ? -1 : 1;
6416
Marc-André Lemburge5034372000-08-08 08:04:29 +00006417 len1--; len2--;
6418 }
6419
6420 return (len1 < len2) ? -1 : (len1 != len2);
6421}
6422
6423#endif
6424
Guido van Rossumd57fd912000-03-10 22:53:23 +00006425int PyUnicode_Compare(PyObject *left,
6426 PyObject *right)
6427{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006428 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6429 return unicode_compare((PyUnicodeObject *)left,
6430 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006431 PyErr_Format(PyExc_TypeError,
6432 "Can't compare %.100s and %.100s",
6433 left->ob_type->tp_name,
6434 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006435 return -1;
6436}
6437
Martin v. Löwis5b222132007-06-10 09:51:05 +00006438int
6439PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6440{
6441 int i;
6442 Py_UNICODE *id;
6443 assert(PyUnicode_Check(uni));
6444 id = PyUnicode_AS_UNICODE(uni);
6445 /* Compare Unicode string and source character set string */
6446 for (i = 0; id[i] && str[i]; i++)
6447 if (id[i] != str[i])
6448 return ((int)id[i] < (int)str[i]) ? -1 : 1;
6449 if (id[i])
6450 return 1; /* uni is longer */
6451 if (str[i])
6452 return -1; /* str is longer */
6453 return 0;
6454}
6455
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006456PyObject *PyUnicode_RichCompare(PyObject *left,
6457 PyObject *right,
6458 int op)
6459{
6460 int result;
6461
6462 result = PyUnicode_Compare(left, right);
6463 if (result == -1 && PyErr_Occurred())
6464 goto onError;
6465
6466 /* Convert the return value to a Boolean */
6467 switch (op) {
6468 case Py_EQ:
6469 result = (result == 0);
6470 break;
6471 case Py_NE:
6472 result = (result != 0);
6473 break;
6474 case Py_LE:
6475 result = (result <= 0);
6476 break;
6477 case Py_GE:
6478 result = (result >= 0);
6479 break;
6480 case Py_LT:
6481 result = (result == -1);
6482 break;
6483 case Py_GT:
6484 result = (result == 1);
6485 break;
6486 }
6487 return PyBool_FromLong(result);
6488
6489 onError:
6490
6491 /* Standard case
6492
6493 Type errors mean that PyUnicode_FromObject() could not convert
6494 one of the arguments (usually the right hand side) to Unicode,
6495 ie. we can't handle the comparison request. However, it is
6496 possible that the other object knows a comparison method, which
6497 is why we return Py_NotImplemented to give the other object a
6498 chance.
6499
6500 */
6501 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6502 PyErr_Clear();
6503 Py_INCREF(Py_NotImplemented);
6504 return Py_NotImplemented;
6505 }
6506 if (op != Py_EQ && op != Py_NE)
6507 return NULL;
6508
6509 /* Equality comparison.
6510
6511 This is a special case: we silence any PyExc_UnicodeDecodeError
6512 and instead turn it into a PyErr_UnicodeWarning.
6513
6514 */
6515 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6516 return NULL;
6517 PyErr_Clear();
Skip Montanaro46fc3372007-08-12 11:44:53 +00006518 if (PyErr_WarnEx(PyExc_UnicodeWarning,
6519 (op == Py_EQ) ?
Benjamin Peterson142957c2008-07-04 19:55:29 +00006520 "equal comparison "
6521 "failed to convert both arguments to str - "
Skip Montanaro46fc3372007-08-12 11:44:53 +00006522 "interpreting them as being unequal"
6523 :
6524 "Unicode unequal comparison "
Benjamin Peterson142957c2008-07-04 19:55:29 +00006525 "failed to convert both arguments to str - "
Skip Montanaro46fc3372007-08-12 11:44:53 +00006526 "interpreting them as being unequal",
6527 1) < 0)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006528 return NULL;
6529 result = (op == Py_NE);
6530 return PyBool_FromLong(result);
6531}
6532
Guido van Rossum403d68b2000-03-13 15:55:09 +00006533int PyUnicode_Contains(PyObject *container,
6534 PyObject *element)
6535{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006536 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006537 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006538
6539 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006540 sub = PyUnicode_FromObject(element);
6541 if (!sub) {
Walter Dörwald26e0f512007-06-12 16:51:31 +00006542 PyErr_Format(PyExc_TypeError,
6543 "'in <string>' requires string as left operand, not %s",
6544 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006545 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006546 }
6547
Thomas Wouters477c8d52006-05-27 19:21:47 +00006548 str = PyUnicode_FromObject(container);
6549 if (!str) {
6550 Py_DECREF(sub);
6551 return -1;
6552 }
6553
6554 result = stringlib_contains_obj(str, sub);
6555
6556 Py_DECREF(str);
6557 Py_DECREF(sub);
6558
Guido van Rossum403d68b2000-03-13 15:55:09 +00006559 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006560}
6561
Guido van Rossumd57fd912000-03-10 22:53:23 +00006562/* Concat to string or Unicode object giving a new Unicode object. */
6563
6564PyObject *PyUnicode_Concat(PyObject *left,
6565 PyObject *right)
6566{
6567 PyUnicodeObject *u = NULL, *v = NULL, *w;
6568
6569 /* Coerce the two arguments */
6570 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6571 if (u == NULL)
6572 goto onError;
6573 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6574 if (v == NULL)
6575 goto onError;
6576
6577 /* Shortcuts */
6578 if (v == unicode_empty) {
6579 Py_DECREF(v);
6580 return (PyObject *)u;
6581 }
6582 if (u == unicode_empty) {
6583 Py_DECREF(u);
6584 return (PyObject *)v;
6585 }
6586
6587 /* Concat the two Unicode strings */
6588 w = _PyUnicode_New(u->length + v->length);
6589 if (w == NULL)
6590 goto onError;
6591 Py_UNICODE_COPY(w->str, u->str, u->length);
6592 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6593
6594 Py_DECREF(u);
6595 Py_DECREF(v);
6596 return (PyObject *)w;
6597
6598onError:
6599 Py_XDECREF(u);
6600 Py_XDECREF(v);
6601 return NULL;
6602}
6603
Walter Dörwald1ab83302007-05-18 17:15:44 +00006604void
6605PyUnicode_Append(PyObject **pleft, PyObject *right)
6606{
6607 PyObject *new;
6608 if (*pleft == NULL)
6609 return;
6610 if (right == NULL || !PyUnicode_Check(*pleft)) {
6611 Py_DECREF(*pleft);
6612 *pleft = NULL;
6613 return;
6614 }
6615 new = PyUnicode_Concat(*pleft, right);
6616 Py_DECREF(*pleft);
6617 *pleft = new;
6618}
6619
6620void
6621PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6622{
6623 PyUnicode_Append(pleft, right);
6624 Py_XDECREF(right);
6625}
6626
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006627PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006628"S.count(sub[, start[, end]]) -> int\n\
6629\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006630Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006631string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006632interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006633
6634static PyObject *
6635unicode_count(PyUnicodeObject *self, PyObject *args)
6636{
6637 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006638 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006639 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006640 PyObject *result;
6641
Guido van Rossumb8872e62000-05-09 14:14:27 +00006642 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6643 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006644 return NULL;
6645
6646 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006647 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006648 if (substring == NULL)
6649 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006650
Thomas Wouters477c8d52006-05-27 19:21:47 +00006651 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006652
Christian Heimes217cfd12007-12-02 14:31:20 +00006653 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006654 stringlib_count(self->str + start, end - start,
6655 substring->str, substring->length)
6656 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006657
6658 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006659
Guido van Rossumd57fd912000-03-10 22:53:23 +00006660 return result;
6661}
6662
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006663PyDoc_STRVAR(encode__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006664"S.encode([encoding[, errors]]) -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006665\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00006666Encode S using the codec registered for encoding. encoding defaults\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006667to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006668handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006669a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6670'xmlcharrefreplace' as well as any other name registered with\n\
6671codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006672
6673static PyObject *
6674unicode_encode(PyUnicodeObject *self, PyObject *args)
6675{
6676 char *encoding = NULL;
6677 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006678 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00006679
Guido van Rossumd57fd912000-03-10 22:53:23 +00006680 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6681 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00006682 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006683 if (v == NULL)
6684 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00006685 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006686 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006687 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006688 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00006689 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006690 Py_DECREF(v);
6691 return NULL;
6692 }
6693 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006694
6695 onError:
6696 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006697}
6698
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006699PyDoc_STRVAR(expandtabs__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006700"S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006701\n\
6702Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006703If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006704
6705static PyObject*
6706unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6707{
6708 Py_UNICODE *e;
6709 Py_UNICODE *p;
6710 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006711 Py_UNICODE *qe;
6712 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006713 PyUnicodeObject *u;
6714 int tabsize = 8;
6715
6716 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6717 return NULL;
6718
Thomas Wouters7e474022000-07-16 12:04:32 +00006719 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006720 i = 0; /* chars up to and including most recent \n or \r */
6721 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6722 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006723 for (p = self->str; p < e; p++)
6724 if (*p == '\t') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006725 if (tabsize > 0) {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006726 incr = tabsize - (j % tabsize); /* cannot overflow */
6727 if (j > PY_SSIZE_T_MAX - incr)
6728 goto overflow1;
6729 j += incr;
6730 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006731 }
6732 else {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006733 if (j > PY_SSIZE_T_MAX - 1)
6734 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006735 j++;
6736 if (*p == '\n' || *p == '\r') {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006737 if (i > PY_SSIZE_T_MAX - j)
6738 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006739 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006740 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006741 }
6742 }
6743
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006744 if (i > PY_SSIZE_T_MAX - j)
6745 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006746
Guido van Rossumd57fd912000-03-10 22:53:23 +00006747 /* Second pass: create output string and fill it */
6748 u = _PyUnicode_New(i + j);
6749 if (!u)
6750 return NULL;
6751
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006752 j = 0; /* same as in first pass */
6753 q = u->str; /* next output char */
6754 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006755
6756 for (p = self->str; p < e; p++)
6757 if (*p == '\t') {
6758 if (tabsize > 0) {
6759 i = tabsize - (j % tabsize);
6760 j += i;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006761 while (i--) {
6762 if (q >= qe)
6763 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006764 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006765 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006766 }
6767 }
6768 else {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006769 if (q >= qe)
6770 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006771 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006772 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006773 if (*p == '\n' || *p == '\r')
6774 j = 0;
6775 }
6776
6777 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006778
6779 overflow2:
6780 Py_DECREF(u);
6781 overflow1:
6782 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6783 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006784}
6785
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006786PyDoc_STRVAR(find__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006787"S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006788\n\
6789Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00006790such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006791arguments start and end are interpreted as in slice notation.\n\
6792\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006793Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006794
6795static PyObject *
6796unicode_find(PyUnicodeObject *self, PyObject *args)
6797{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006798 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00006799 Py_ssize_t start;
6800 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006801 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006802
Christian Heimes9cd17752007-11-18 19:35:23 +00006803 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006804 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006805
Thomas Wouters477c8d52006-05-27 19:21:47 +00006806 result = stringlib_find_slice(
6807 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6808 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6809 start, end
6810 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006811
6812 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006813
Christian Heimes217cfd12007-12-02 14:31:20 +00006814 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006815}
6816
6817static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006818unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006819{
6820 if (index < 0 || index >= self->length) {
6821 PyErr_SetString(PyExc_IndexError, "string index out of range");
6822 return NULL;
6823 }
6824
6825 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6826}
6827
Guido van Rossumc2504932007-09-18 19:42:40 +00006828/* Believe it or not, this produces the same value for ASCII strings
6829 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006830static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00006831unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006832{
Guido van Rossumc2504932007-09-18 19:42:40 +00006833 Py_ssize_t len;
6834 Py_UNICODE *p;
6835 long x;
6836
6837 if (self->hash != -1)
6838 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00006839 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00006840 p = self->str;
6841 x = *p << 7;
6842 while (--len >= 0)
6843 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00006844 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00006845 if (x == -1)
6846 x = -2;
6847 self->hash = x;
6848 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006849}
6850
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006851PyDoc_STRVAR(index__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006852"S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006853\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006854Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006855
6856static PyObject *
6857unicode_index(PyUnicodeObject *self, PyObject *args)
6858{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006859 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006860 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00006861 Py_ssize_t start;
6862 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006863
Christian Heimes9cd17752007-11-18 19:35:23 +00006864 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006865 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006866
Thomas Wouters477c8d52006-05-27 19:21:47 +00006867 result = stringlib_find_slice(
6868 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6869 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6870 start, end
6871 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006872
6873 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006874
Guido van Rossumd57fd912000-03-10 22:53:23 +00006875 if (result < 0) {
6876 PyErr_SetString(PyExc_ValueError, "substring not found");
6877 return NULL;
6878 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006879
Christian Heimes217cfd12007-12-02 14:31:20 +00006880 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006881}
6882
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006883PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006884"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006885\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006886Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006887at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006888
6889static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006890unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006891{
6892 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6893 register const Py_UNICODE *e;
6894 int cased;
6895
Guido van Rossumd57fd912000-03-10 22:53:23 +00006896 /* Shortcut for single character strings */
6897 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006898 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006899
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006900 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006901 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006902 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006903
Guido van Rossumd57fd912000-03-10 22:53:23 +00006904 e = p + PyUnicode_GET_SIZE(self);
6905 cased = 0;
6906 for (; p < e; p++) {
6907 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006908
Guido van Rossumd57fd912000-03-10 22:53:23 +00006909 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006910 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006911 else if (!cased && Py_UNICODE_ISLOWER(ch))
6912 cased = 1;
6913 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006914 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006915}
6916
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006917PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006918"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006919\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006920Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006921at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006922
6923static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006924unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006925{
6926 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6927 register const Py_UNICODE *e;
6928 int cased;
6929
Guido van Rossumd57fd912000-03-10 22:53:23 +00006930 /* Shortcut for single character strings */
6931 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006932 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006933
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006934 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006935 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006936 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006937
Guido van Rossumd57fd912000-03-10 22:53:23 +00006938 e = p + PyUnicode_GET_SIZE(self);
6939 cased = 0;
6940 for (; p < e; p++) {
6941 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006942
Guido van Rossumd57fd912000-03-10 22:53:23 +00006943 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006944 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006945 else if (!cased && Py_UNICODE_ISUPPER(ch))
6946 cased = 1;
6947 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006948 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006949}
6950
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006951PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006952"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006953\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006954Return True if S is a titlecased string and there is at least one\n\
6955character in S, i.e. upper- and titlecase characters may only\n\
6956follow uncased characters and lowercase characters only cased ones.\n\
6957Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006958
6959static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006960unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006961{
6962 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6963 register const Py_UNICODE *e;
6964 int cased, previous_is_cased;
6965
Guido van Rossumd57fd912000-03-10 22:53:23 +00006966 /* Shortcut for single character strings */
6967 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006968 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6969 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006970
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006971 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006972 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006973 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006974
Guido van Rossumd57fd912000-03-10 22:53:23 +00006975 e = p + PyUnicode_GET_SIZE(self);
6976 cased = 0;
6977 previous_is_cased = 0;
6978 for (; p < e; p++) {
6979 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006980
Guido van Rossumd57fd912000-03-10 22:53:23 +00006981 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6982 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006983 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006984 previous_is_cased = 1;
6985 cased = 1;
6986 }
6987 else if (Py_UNICODE_ISLOWER(ch)) {
6988 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006989 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006990 previous_is_cased = 1;
6991 cased = 1;
6992 }
6993 else
6994 previous_is_cased = 0;
6995 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006996 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006997}
6998
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006999PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007000"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007001\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007002Return True if all characters in S are whitespace\n\
7003and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007004
7005static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007006unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007007{
7008 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7009 register const Py_UNICODE *e;
7010
Guido van Rossumd57fd912000-03-10 22:53:23 +00007011 /* Shortcut for single character strings */
7012 if (PyUnicode_GET_SIZE(self) == 1 &&
7013 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007014 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007015
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007016 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007017 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007018 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007019
Guido van Rossumd57fd912000-03-10 22:53:23 +00007020 e = p + PyUnicode_GET_SIZE(self);
7021 for (; p < e; p++) {
7022 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007023 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007024 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007025 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007026}
7027
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007028PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007029"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007030\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007031Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007032and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007033
7034static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007035unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007036{
7037 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7038 register const Py_UNICODE *e;
7039
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007040 /* Shortcut for single character strings */
7041 if (PyUnicode_GET_SIZE(self) == 1 &&
7042 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007043 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007044
7045 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007046 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007047 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007048
7049 e = p + PyUnicode_GET_SIZE(self);
7050 for (; p < e; p++) {
7051 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007052 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007053 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007054 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007055}
7056
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007057PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007058"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007059\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007060Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007061and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007062
7063static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007064unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007065{
7066 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7067 register const Py_UNICODE *e;
7068
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007069 /* Shortcut for single character strings */
7070 if (PyUnicode_GET_SIZE(self) == 1 &&
7071 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007072 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007073
7074 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007075 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007076 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007077
7078 e = p + PyUnicode_GET_SIZE(self);
7079 for (; p < e; p++) {
7080 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007081 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007082 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007083 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007084}
7085
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007086PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007087"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007088\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007089Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007090False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007091
7092static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007093unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007094{
7095 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7096 register const Py_UNICODE *e;
7097
Guido van Rossumd57fd912000-03-10 22:53:23 +00007098 /* Shortcut for single character strings */
7099 if (PyUnicode_GET_SIZE(self) == 1 &&
7100 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007101 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007102
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007103 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007104 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007105 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007106
Guido van Rossumd57fd912000-03-10 22:53:23 +00007107 e = p + PyUnicode_GET_SIZE(self);
7108 for (; p < e; p++) {
7109 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007110 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007111 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007112 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007113}
7114
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007115PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007116"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007117\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007118Return True if all characters in S are digits\n\
7119and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007120
7121static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007122unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007123{
7124 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7125 register const Py_UNICODE *e;
7126
Guido van Rossumd57fd912000-03-10 22:53:23 +00007127 /* Shortcut for single character strings */
7128 if (PyUnicode_GET_SIZE(self) == 1 &&
7129 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007130 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007131
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007132 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007133 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007134 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007135
Guido van Rossumd57fd912000-03-10 22:53:23 +00007136 e = p + PyUnicode_GET_SIZE(self);
7137 for (; p < e; p++) {
7138 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007139 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007140 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007141 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007142}
7143
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007144PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007145"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007146\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007147Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007148False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007149
7150static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007151unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007152{
7153 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7154 register const Py_UNICODE *e;
7155
Guido van Rossumd57fd912000-03-10 22:53:23 +00007156 /* Shortcut for single character strings */
7157 if (PyUnicode_GET_SIZE(self) == 1 &&
7158 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007159 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007160
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007161 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007162 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007163 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007164
Guido van Rossumd57fd912000-03-10 22:53:23 +00007165 e = p + PyUnicode_GET_SIZE(self);
7166 for (; p < e; p++) {
7167 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007168 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007169 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007170 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007171}
7172
Martin v. Löwis47383402007-08-15 07:32:56 +00007173int
7174PyUnicode_IsIdentifier(PyObject *self)
7175{
7176 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7177 register const Py_UNICODE *e;
7178
7179 /* Special case for empty strings */
7180 if (PyUnicode_GET_SIZE(self) == 0)
7181 return 0;
7182
7183 /* PEP 3131 says that the first character must be in
7184 XID_Start and subsequent characters in XID_Continue,
7185 and for the ASCII range, the 2.x rules apply (i.e
7186 start with letters and underscore, continue with
7187 letters, digits, underscore). However, given the current
7188 definition of XID_Start and XID_Continue, it is sufficient
7189 to check just for these, except that _ must be allowed
7190 as starting an identifier. */
7191 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7192 return 0;
7193
7194 e = p + PyUnicode_GET_SIZE(self);
7195 for (p++; p < e; p++) {
7196 if (!_PyUnicode_IsXidContinue(*p))
7197 return 0;
7198 }
7199 return 1;
7200}
7201
7202PyDoc_STRVAR(isidentifier__doc__,
7203"S.isidentifier() -> bool\n\
7204\n\
7205Return True if S is a valid identifier according\n\
7206to the language definition.");
7207
7208static PyObject*
7209unicode_isidentifier(PyObject *self)
7210{
7211 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7212}
7213
Georg Brandl559e5d72008-06-11 18:37:52 +00007214PyDoc_STRVAR(isprintable__doc__,
7215"S.isprintable() -> bool\n\
7216\n\
7217Return True if all characters in S are considered\n\
7218printable in repr() or S is empty, False otherwise.");
7219
7220static PyObject*
7221unicode_isprintable(PyObject *self)
7222{
7223 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7224 register const Py_UNICODE *e;
7225
7226 /* Shortcut for single character strings */
7227 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7228 Py_RETURN_TRUE;
7229 }
7230
7231 e = p + PyUnicode_GET_SIZE(self);
7232 for (; p < e; p++) {
7233 if (!Py_UNICODE_ISPRINTABLE(*p)) {
7234 Py_RETURN_FALSE;
7235 }
7236 }
7237 Py_RETURN_TRUE;
7238}
7239
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007240PyDoc_STRVAR(join__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007241"S.join(sequence) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007242\n\
7243Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007244sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007245
7246static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007247unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007248{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007249 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007250}
7251
Martin v. Löwis18e16552006-02-15 17:27:45 +00007252static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007253unicode_length(PyUnicodeObject *self)
7254{
7255 return self->length;
7256}
7257
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007258PyDoc_STRVAR(ljust__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007259"S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007260\n\
7261Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007262done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007263
7264static PyObject *
7265unicode_ljust(PyUnicodeObject *self, PyObject *args)
7266{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007267 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007268 Py_UNICODE fillchar = ' ';
7269
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007270 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007271 return NULL;
7272
Tim Peters7a29bd52001-09-12 03:03:31 +00007273 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007274 Py_INCREF(self);
7275 return (PyObject*) self;
7276 }
7277
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007278 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007279}
7280
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007281PyDoc_STRVAR(lower__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007282"S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007283\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007284Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007285
7286static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007287unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007288{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007289 return fixup(self, fixlower);
7290}
7291
/* Strip modes shared by the strip helpers; the values double as
   indexes into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by strip mode */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip mode i: the format string minus its "|O:" prefix */
#define STRIPNAME(i) (stripformat[(i)] + 3)
7300
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007301/* externally visible for str.strip(unicode) */
7302PyObject *
7303_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7304{
7305 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007306 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007307 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007308 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7309 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007310
Thomas Wouters477c8d52006-05-27 19:21:47 +00007311 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7312
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007313 i = 0;
7314 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007315 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7316 i++;
7317 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007318 }
7319
7320 j = len;
7321 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007322 do {
7323 j--;
7324 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7325 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007326 }
7327
7328 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007329 Py_INCREF(self);
7330 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007331 }
7332 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00007333 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007334}
7335
Guido van Rossumd57fd912000-03-10 22:53:23 +00007336
7337static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007338do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007339{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007340 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007341 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007342
7343 i = 0;
7344 if (striptype != RIGHTSTRIP) {
7345 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7346 i++;
7347 }
7348 }
7349
7350 j = len;
7351 if (striptype != LEFTSTRIP) {
7352 do {
7353 j--;
7354 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7355 j++;
7356 }
7357
7358 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7359 Py_INCREF(self);
7360 return (PyObject*)self;
7361 }
7362 else
7363 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007364}
7365
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007366
7367static PyObject *
7368do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7369{
7370 PyObject *sep = NULL;
7371
7372 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7373 return NULL;
7374
7375 if (sep != NULL && sep != Py_None) {
7376 if (PyUnicode_Check(sep))
7377 return _PyUnicode_XStrip(self, striptype, sep);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007378 else {
7379 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00007380 "%s arg must be None or str",
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007381 STRIPNAME(striptype));
7382 return NULL;
7383 }
7384 }
7385
7386 return do_strip(self, striptype);
7387}
7388
7389
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007390PyDoc_STRVAR(strip__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007391"S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007392\n\
7393Return a copy of the string S with leading and trailing\n\
7394whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007395If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007396
7397static PyObject *
7398unicode_strip(PyUnicodeObject *self, PyObject *args)
7399{
7400 if (PyTuple_GET_SIZE(args) == 0)
7401 return do_strip(self, BOTHSTRIP); /* Common case */
7402 else
7403 return do_argstrip(self, BOTHSTRIP, args);
7404}
7405
7406
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007407PyDoc_STRVAR(lstrip__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007408"S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007409\n\
7410Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007411If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007412
7413static PyObject *
7414unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7415{
7416 if (PyTuple_GET_SIZE(args) == 0)
7417 return do_strip(self, LEFTSTRIP); /* Common case */
7418 else
7419 return do_argstrip(self, LEFTSTRIP, args);
7420}
7421
7422
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007423PyDoc_STRVAR(rstrip__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007424"S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007425\n\
7426Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007427If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007428
7429static PyObject *
7430unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7431{
7432 if (PyTuple_GET_SIZE(args) == 0)
7433 return do_strip(self, RIGHTSTRIP); /* Common case */
7434 else
7435 return do_argstrip(self, RIGHTSTRIP, args);
7436}
7437
7438
Guido van Rossumd57fd912000-03-10 22:53:23 +00007439static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007440unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007441{
7442 PyUnicodeObject *u;
7443 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007444 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007445 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007446
7447 if (len < 0)
7448 len = 0;
7449
Tim Peters7a29bd52001-09-12 03:03:31 +00007450 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007451 /* no repeat, return original string */
7452 Py_INCREF(str);
7453 return (PyObject*) str;
7454 }
Tim Peters8f422462000-09-09 06:13:41 +00007455
7456 /* ensure # of chars needed doesn't overflow int and # of bytes
7457 * needed doesn't overflow size_t
7458 */
7459 nchars = len * str->length;
7460 if (len && nchars / len != str->length) {
7461 PyErr_SetString(PyExc_OverflowError,
7462 "repeated string is too long");
7463 return NULL;
7464 }
7465 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7466 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7467 PyErr_SetString(PyExc_OverflowError,
7468 "repeated string is too long");
7469 return NULL;
7470 }
7471 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007472 if (!u)
7473 return NULL;
7474
7475 p = u->str;
7476
Thomas Wouters477c8d52006-05-27 19:21:47 +00007477 if (str->length == 1 && len > 0) {
7478 Py_UNICODE_FILL(p, str->str[0], len);
7479 } else {
7480 Py_ssize_t done = 0; /* number of characters copied this far */
7481 if (done < nchars) {
7482 Py_UNICODE_COPY(p, str->str, str->length);
7483 done = str->length;
7484 }
7485 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00007486 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007487 Py_UNICODE_COPY(p+done, p, n);
7488 done += n;
7489 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007490 }
7491
7492 return (PyObject*) u;
7493}
7494
7495PyObject *PyUnicode_Replace(PyObject *obj,
7496 PyObject *subobj,
7497 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007498 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007499{
7500 PyObject *self;
7501 PyObject *str1;
7502 PyObject *str2;
7503 PyObject *result;
7504
7505 self = PyUnicode_FromObject(obj);
7506 if (self == NULL)
7507 return NULL;
7508 str1 = PyUnicode_FromObject(subobj);
7509 if (str1 == NULL) {
7510 Py_DECREF(self);
7511 return NULL;
7512 }
7513 str2 = PyUnicode_FromObject(replobj);
7514 if (str2 == NULL) {
7515 Py_DECREF(self);
7516 Py_DECREF(str1);
7517 return NULL;
7518 }
Tim Petersced69f82003-09-16 20:30:58 +00007519 result = replace((PyUnicodeObject *)self,
7520 (PyUnicodeObject *)str1,
7521 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007522 maxcount);
7523 Py_DECREF(self);
7524 Py_DECREF(str1);
7525 Py_DECREF(str2);
7526 return result;
7527}
7528
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007529PyDoc_STRVAR(replace__doc__,
Georg Brandlf08a9dd2008-06-10 16:57:31 +00007530"S.replace (old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007531\n\
7532Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00007533old replaced by new. If the optional argument count is\n\
7534given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007535
7536static PyObject*
7537unicode_replace(PyUnicodeObject *self, PyObject *args)
7538{
7539 PyUnicodeObject *str1;
7540 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007541 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007542 PyObject *result;
7543
Martin v. Löwis18e16552006-02-15 17:27:45 +00007544 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007545 return NULL;
7546 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7547 if (str1 == NULL)
7548 return NULL;
7549 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007550 if (str2 == NULL) {
7551 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007552 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007553 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007554
7555 result = replace(self, str1, str2, maxcount);
7556
7557 Py_DECREF(str1);
7558 Py_DECREF(str2);
7559 return result;
7560}
7561
7562static
7563PyObject *unicode_repr(PyObject *unicode)
7564{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007565 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007566 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007567 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7568 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7569
7570 /* XXX(nnorwitz): rather than over-allocating, it would be
7571 better to choose a different scheme. Perhaps scan the
7572 first N-chars of the string and allocate based on that size.
7573 */
7574 /* Initial allocation is based on the longest-possible unichr
7575 escape.
7576
7577 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7578 unichr, so in this case it's the longest unichr escape. In
7579 narrow (UTF-16) builds this is five chars per source unichr
7580 since there are two unichrs in the surrogate pair, so in narrow
7581 (UTF-16) builds it's not the longest unichr escape.
7582
7583 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7584 so in the narrow (UTF-16) build case it's the longest unichr
7585 escape.
7586 */
7587
Walter Dörwald1ab83302007-05-18 17:15:44 +00007588 repr = PyUnicode_FromUnicode(NULL,
Walter Dörwald79e913e2007-05-12 11:08:06 +00007589 2 /* quotes */
7590#ifdef Py_UNICODE_WIDE
7591 + 10*size
7592#else
7593 + 6*size
7594#endif
7595 + 1);
7596 if (repr == NULL)
7597 return NULL;
7598
Walter Dörwald1ab83302007-05-18 17:15:44 +00007599 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007600
7601 /* Add quote */
7602 *p++ = (findchar(s, size, '\'') &&
7603 !findchar(s, size, '"')) ? '"' : '\'';
7604 while (size-- > 0) {
7605 Py_UNICODE ch = *s++;
7606
7607 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007608 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007609 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007610 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007611 continue;
7612 }
7613
Georg Brandl559e5d72008-06-11 18:37:52 +00007614 /* Map special whitespace to '\t', \n', '\r' */
7615 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007616 *p++ = '\\';
7617 *p++ = 't';
7618 }
7619 else if (ch == '\n') {
7620 *p++ = '\\';
7621 *p++ = 'n';
7622 }
7623 else if (ch == '\r') {
7624 *p++ = '\\';
7625 *p++ = 'r';
7626 }
7627
7628 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00007629 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007630 *p++ = '\\';
7631 *p++ = 'x';
7632 *p++ = hexdigits[(ch >> 4) & 0x000F];
7633 *p++ = hexdigits[ch & 0x000F];
7634 }
7635
Georg Brandl559e5d72008-06-11 18:37:52 +00007636 /* Copy ASCII characters as-is */
7637 else if (ch < 0x7F) {
7638 *p++ = ch;
7639 }
7640
7641 /* Non-ASCII characters */
7642 else {
7643 Py_UCS4 ucs = ch;
7644
7645#ifndef Py_UNICODE_WIDE
7646 Py_UNICODE ch2 = 0;
7647 /* Get code point from surrogate pair */
7648 if (size > 0) {
7649 ch2 = *s;
7650 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
7651 && ch2 <= 0xDFFF) {
7652 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
7653 + 0x00010000;
7654 s++;
7655 size--;
7656 }
7657 }
7658#endif
7659 /* Map Unicode whitespace and control characters
7660 (categories Z* and C* except ASCII space)
7661 */
7662 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
7663 /* Map 8-bit characters to '\xhh' */
7664 if (ucs <= 0xff) {
7665 *p++ = '\\';
7666 *p++ = 'x';
7667 *p++ = hexdigits[(ch >> 4) & 0x000F];
7668 *p++ = hexdigits[ch & 0x000F];
7669 }
7670 /* Map 21-bit characters to '\U00xxxxxx' */
7671 else if (ucs >= 0x10000) {
7672 *p++ = '\\';
7673 *p++ = 'U';
7674 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7675 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7676 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7677 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7678 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7679 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7680 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7681 *p++ = hexdigits[ucs & 0x0000000F];
7682 }
7683 /* Map 16-bit characters to '\uxxxx' */
7684 else {
7685 *p++ = '\\';
7686 *p++ = 'u';
7687 *p++ = hexdigits[(ucs >> 12) & 0x000F];
7688 *p++ = hexdigits[(ucs >> 8) & 0x000F];
7689 *p++ = hexdigits[(ucs >> 4) & 0x000F];
7690 *p++ = hexdigits[ucs & 0x000F];
7691 }
7692 }
7693 /* Copy characters as-is */
7694 else {
7695 *p++ = ch;
7696#ifndef Py_UNICODE_WIDE
7697 if (ucs >= 0x10000)
7698 *p++ = ch2;
7699#endif
7700 }
7701 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00007702 }
7703 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007704 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00007705
7706 *p = '\0';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007707 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00007708 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007709}
7710
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007711PyDoc_STRVAR(rfind__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007712"S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007713\n\
7714Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007715such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007716arguments start and end are interpreted as in slice notation.\n\
7717\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007718Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007719
7720static PyObject *
7721unicode_rfind(PyUnicodeObject *self, PyObject *args)
7722{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007723 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007724 Py_ssize_t start;
7725 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007726 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007727
Christian Heimes9cd17752007-11-18 19:35:23 +00007728 if (!_ParseTupleFinds(args, &substring, &start, &end))
7729 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007730
Thomas Wouters477c8d52006-05-27 19:21:47 +00007731 result = stringlib_rfind_slice(
7732 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7733 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7734 start, end
7735 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007736
7737 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007738
Christian Heimes217cfd12007-12-02 14:31:20 +00007739 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007740}
7741
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007742PyDoc_STRVAR(rindex__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007743"S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007744\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007745Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007746
7747static PyObject *
7748unicode_rindex(PyUnicodeObject *self, PyObject *args)
7749{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007750 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007751 Py_ssize_t start;
7752 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007753 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007754
Christian Heimes9cd17752007-11-18 19:35:23 +00007755 if (!_ParseTupleFinds(args, &substring, &start, &end))
7756 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007757
Thomas Wouters477c8d52006-05-27 19:21:47 +00007758 result = stringlib_rfind_slice(
7759 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7760 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7761 start, end
7762 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007763
7764 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007765
Guido van Rossumd57fd912000-03-10 22:53:23 +00007766 if (result < 0) {
7767 PyErr_SetString(PyExc_ValueError, "substring not found");
7768 return NULL;
7769 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007770 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007771}
7772
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007773PyDoc_STRVAR(rjust__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007774"S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007775\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007776Return S right justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007777done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007778
7779static PyObject *
7780unicode_rjust(PyUnicodeObject *self, PyObject *args)
7781{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007782 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007783 Py_UNICODE fillchar = ' ';
7784
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007785 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007786 return NULL;
7787
Tim Peters7a29bd52001-09-12 03:03:31 +00007788 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007789 Py_INCREF(self);
7790 return (PyObject*) self;
7791 }
7792
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007793 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007794}
7795
Guido van Rossumd57fd912000-03-10 22:53:23 +00007796PyObject *PyUnicode_Split(PyObject *s,
7797 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007798 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007799{
7800 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007801
Guido van Rossumd57fd912000-03-10 22:53:23 +00007802 s = PyUnicode_FromObject(s);
7803 if (s == NULL)
7804 return NULL;
7805 if (sep != NULL) {
7806 sep = PyUnicode_FromObject(sep);
7807 if (sep == NULL) {
7808 Py_DECREF(s);
7809 return NULL;
7810 }
7811 }
7812
7813 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7814
7815 Py_DECREF(s);
7816 Py_XDECREF(sep);
7817 return result;
7818}
7819
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007820PyDoc_STRVAR(split__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007821"S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007822\n\
7823Return a list of the words in S, using sep as the\n\
7824delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00007825splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00007826whitespace string is a separator and empty strings are\n\
7827removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007828
7829static PyObject*
7830unicode_split(PyUnicodeObject *self, PyObject *args)
7831{
7832 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007833 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007834
Martin v. Löwis18e16552006-02-15 17:27:45 +00007835 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007836 return NULL;
7837
7838 if (substring == Py_None)
7839 return split(self, NULL, maxcount);
7840 else if (PyUnicode_Check(substring))
7841 return split(self, (PyUnicodeObject *)substring, maxcount);
7842 else
7843 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7844}
7845
Thomas Wouters477c8d52006-05-27 19:21:47 +00007846PyObject *
7847PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7848{
7849 PyObject* str_obj;
7850 PyObject* sep_obj;
7851 PyObject* out;
7852
7853 str_obj = PyUnicode_FromObject(str_in);
7854 if (!str_obj)
7855 return NULL;
7856 sep_obj = PyUnicode_FromObject(sep_in);
7857 if (!sep_obj) {
7858 Py_DECREF(str_obj);
7859 return NULL;
7860 }
7861
7862 out = stringlib_partition(
7863 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7864 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7865 );
7866
7867 Py_DECREF(sep_obj);
7868 Py_DECREF(str_obj);
7869
7870 return out;
7871}
7872
7873
7874PyObject *
7875PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7876{
7877 PyObject* str_obj;
7878 PyObject* sep_obj;
7879 PyObject* out;
7880
7881 str_obj = PyUnicode_FromObject(str_in);
7882 if (!str_obj)
7883 return NULL;
7884 sep_obj = PyUnicode_FromObject(sep_in);
7885 if (!sep_obj) {
7886 Py_DECREF(str_obj);
7887 return NULL;
7888 }
7889
7890 out = stringlib_rpartition(
7891 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7892 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7893 );
7894
7895 Py_DECREF(sep_obj);
7896 Py_DECREF(str_obj);
7897
7898 return out;
7899}
7900
7901PyDoc_STRVAR(partition__doc__,
7902"S.partition(sep) -> (head, sep, tail)\n\
7903\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00007904Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007905the separator itself, and the part after it. If the separator is not\n\
7906found, returns S and two empty strings.");
7907
7908static PyObject*
7909unicode_partition(PyUnicodeObject *self, PyObject *separator)
7910{
7911 return PyUnicode_Partition((PyObject *)self, separator);
7912}
7913
7914PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007915"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007916\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00007917Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007918the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00007919separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007920
7921static PyObject*
7922unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7923{
7924 return PyUnicode_RPartition((PyObject *)self, separator);
7925}
7926
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007927PyObject *PyUnicode_RSplit(PyObject *s,
7928 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007929 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007930{
7931 PyObject *result;
7932
7933 s = PyUnicode_FromObject(s);
7934 if (s == NULL)
7935 return NULL;
7936 if (sep != NULL) {
7937 sep = PyUnicode_FromObject(sep);
7938 if (sep == NULL) {
7939 Py_DECREF(s);
7940 return NULL;
7941 }
7942 }
7943
7944 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7945
7946 Py_DECREF(s);
7947 Py_XDECREF(sep);
7948 return result;
7949}
7950
7951PyDoc_STRVAR(rsplit__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007952"S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007953\n\
7954Return a list of the words in S, using sep as the\n\
7955delimiter string, starting at the end of the string and\n\
7956working to the front. If maxsplit is given, at most maxsplit\n\
7957splits are done. If sep is not specified, any whitespace string\n\
7958is a separator.");
7959
7960static PyObject*
7961unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7962{
7963 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007964 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007965
Martin v. Löwis18e16552006-02-15 17:27:45 +00007966 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007967 return NULL;
7968
7969 if (substring == Py_None)
7970 return rsplit(self, NULL, maxcount);
7971 else if (PyUnicode_Check(substring))
7972 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7973 else
7974 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7975}
7976
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007977PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007978"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007979\n\
7980Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007981Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007982is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007983
7984static PyObject*
7985unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7986{
Guido van Rossum86662912000-04-11 15:38:46 +00007987 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007988
Guido van Rossum86662912000-04-11 15:38:46 +00007989 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007990 return NULL;
7991
Guido van Rossum86662912000-04-11 15:38:46 +00007992 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007993}
7994
7995static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007996PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007997{
Walter Dörwald346737f2007-05-31 10:44:43 +00007998 if (PyUnicode_CheckExact(self)) {
7999 Py_INCREF(self);
8000 return self;
8001 } else
8002 /* Subtype -- return genuine unicode string with the same value. */
8003 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8004 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008005}
8006
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008007PyDoc_STRVAR(swapcase__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008008"S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008009\n\
8010Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008011and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008012
8013static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008014unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008015{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008016 return fixup(self, fixswapcase);
8017}
8018
Georg Brandlceee0772007-11-27 23:48:05 +00008019PyDoc_STRVAR(maketrans__doc__,
8020"str.maketrans(x[, y[, z]]) -> dict (static method)\n\
8021\n\
8022Return a translation table usable for str.translate().\n\
8023If there is only one argument, it must be a dictionary mapping Unicode\n\
8024ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008025Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008026If there are two arguments, they must be strings of equal length, and\n\
8027in the resulting dictionary, each character in x will be mapped to the\n\
8028character at the same position in y. If there is a third argument, it\n\
8029must be a string, whose characters will be mapped to None in the result.");
8030
8031static PyObject*
8032unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8033{
8034 PyObject *x, *y = NULL, *z = NULL;
8035 PyObject *new = NULL, *key, *value;
8036 Py_ssize_t i = 0;
8037 int res;
8038
8039 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8040 return NULL;
8041 new = PyDict_New();
8042 if (!new)
8043 return NULL;
8044 if (y != NULL) {
8045 /* x must be a string too, of equal length */
8046 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8047 if (!PyUnicode_Check(x)) {
8048 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8049 "be a string if there is a second argument");
8050 goto err;
8051 }
8052 if (PyUnicode_GET_SIZE(x) != ylen) {
8053 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8054 "arguments must have equal length");
8055 goto err;
8056 }
8057 /* create entries for translating chars in x to those in y */
8058 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008059 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8060 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008061 if (!key || !value)
8062 goto err;
8063 res = PyDict_SetItem(new, key, value);
8064 Py_DECREF(key);
8065 Py_DECREF(value);
8066 if (res < 0)
8067 goto err;
8068 }
8069 /* create entries for deleting chars in z */
8070 if (z != NULL) {
8071 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008072 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008073 if (!key)
8074 goto err;
8075 res = PyDict_SetItem(new, key, Py_None);
8076 Py_DECREF(key);
8077 if (res < 0)
8078 goto err;
8079 }
8080 }
8081 } else {
8082 /* x must be a dict */
8083 if (!PyDict_Check(x)) {
8084 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8085 "to maketrans it must be a dict");
8086 goto err;
8087 }
8088 /* copy entries into the new dict, converting string keys to int keys */
8089 while (PyDict_Next(x, &i, &key, &value)) {
8090 if (PyUnicode_Check(key)) {
8091 /* convert string keys to integer keys */
8092 PyObject *newkey;
8093 if (PyUnicode_GET_SIZE(key) != 1) {
8094 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8095 "table must be of length 1");
8096 goto err;
8097 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008098 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008099 if (!newkey)
8100 goto err;
8101 res = PyDict_SetItem(new, newkey, value);
8102 Py_DECREF(newkey);
8103 if (res < 0)
8104 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008105 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008106 /* just keep integer keys */
8107 if (PyDict_SetItem(new, key, value) < 0)
8108 goto err;
8109 } else {
8110 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8111 "be strings or integers");
8112 goto err;
8113 }
8114 }
8115 }
8116 return new;
8117 err:
8118 Py_DECREF(new);
8119 return NULL;
8120}
8121
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008122PyDoc_STRVAR(translate__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008123"S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008124\n\
8125Return a copy of the string S, where all characters have been mapped\n\
8126through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008127Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008128Unmapped characters are left untouched. Characters mapped to None\n\
8129are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008130
8131static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008132unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008133{
Georg Brandlceee0772007-11-27 23:48:05 +00008134 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008135}
8136
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008137PyDoc_STRVAR(upper__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008138"S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008139\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008140Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008141
8142static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008143unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008144{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008145 return fixup(self, fixupper);
8146}
8147
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008148PyDoc_STRVAR(zfill__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008149"S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008150\n\
8151Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008152of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008153
8154static PyObject *
8155unicode_zfill(PyUnicodeObject *self, PyObject *args)
8156{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008157 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008158 PyUnicodeObject *u;
8159
Martin v. Löwis18e16552006-02-15 17:27:45 +00008160 Py_ssize_t width;
8161 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008162 return NULL;
8163
8164 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008165 if (PyUnicode_CheckExact(self)) {
8166 Py_INCREF(self);
8167 return (PyObject*) self;
8168 }
8169 else
8170 return PyUnicode_FromUnicode(
8171 PyUnicode_AS_UNICODE(self),
8172 PyUnicode_GET_SIZE(self)
8173 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008174 }
8175
8176 fill = width - self->length;
8177
8178 u = pad(self, fill, 0, '0');
8179
Walter Dörwald068325e2002-04-15 13:36:47 +00008180 if (u == NULL)
8181 return NULL;
8182
Guido van Rossumd57fd912000-03-10 22:53:23 +00008183 if (u->str[fill] == '+' || u->str[fill] == '-') {
8184 /* move sign to beginning of string */
8185 u->str[0] = u->str[fill];
8186 u->str[fill] = '0';
8187 }
8188
8189 return (PyObject*) u;
8190}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008191
#if 0
/* Compiled out: report the value of numfree as an int object. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    PyObject *count = PyLong_FromLong(numfree);

    return count;
}
#endif
8199
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008200PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00008201"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008202\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008203Return True if S starts with the specified prefix, False otherwise.\n\
8204With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008205With optional end, stop comparing S at that position.\n\
8206prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008207
8208static PyObject *
8209unicode_startswith(PyUnicodeObject *self,
8210 PyObject *args)
8211{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008212 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008213 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008214 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008215 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008216 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008217
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008218 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00008219 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008220 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008221 if (PyTuple_Check(subobj)) {
8222 Py_ssize_t i;
8223 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8224 substring = (PyUnicodeObject *)PyUnicode_FromObject(
8225 PyTuple_GET_ITEM(subobj, i));
8226 if (substring == NULL)
8227 return NULL;
8228 result = tailmatch(self, substring, start, end, -1);
8229 Py_DECREF(substring);
8230 if (result) {
8231 Py_RETURN_TRUE;
8232 }
8233 }
8234 /* nothing matched */
8235 Py_RETURN_FALSE;
8236 }
8237 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008238 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008239 return NULL;
8240 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008241 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008242 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008243}
8244
8245
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008246PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00008247"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008248\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008249Return True if S ends with the specified suffix, False otherwise.\n\
8250With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008251With optional end, stop comparing S at that position.\n\
8252suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008253
8254static PyObject *
8255unicode_endswith(PyUnicodeObject *self,
8256 PyObject *args)
8257{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008258 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008259 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008260 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008261 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008262 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008263
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008264 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
8265 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008266 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008267 if (PyTuple_Check(subobj)) {
8268 Py_ssize_t i;
8269 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8270 substring = (PyUnicodeObject *)PyUnicode_FromObject(
8271 PyTuple_GET_ITEM(subobj, i));
8272 if (substring == NULL)
8273 return NULL;
8274 result = tailmatch(self, substring, start, end, +1);
8275 Py_DECREF(substring);
8276 if (result) {
8277 Py_RETURN_TRUE;
8278 }
8279 }
8280 Py_RETURN_FALSE;
8281 }
8282 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008283 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008284 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008285
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008286 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008287 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008288 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008289}
8290
Eric Smith8c663262007-08-25 02:26:07 +00008291#include "stringlib/string_format.h"
8292
8293PyDoc_STRVAR(format__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008294"S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008295\n\
8296");
8297
Eric Smith4a7d76d2008-05-30 18:10:19 +00008298static PyObject *
8299unicode__format__(PyObject* self, PyObject* args)
8300{
8301 PyObject *format_spec;
8302
8303 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8304 return NULL;
8305
8306 return _PyUnicode_FormatAdvanced(self,
8307 PyUnicode_AS_UNICODE(format_spec),
8308 PyUnicode_GET_SIZE(format_spec));
8309}
8310
Eric Smith8c663262007-08-25 02:26:07 +00008311PyDoc_STRVAR(p_format__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008312"S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008313\n\
8314");
8315
8316static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008317unicode__sizeof__(PyUnicodeObject *v)
8318{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00008319 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8320 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008321}
8322
8323PyDoc_STRVAR(sizeof__doc__,
8324"S.__sizeof__() -> size of S in memory, in bytes");
8325
8326static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008327unicode_getnewargs(PyUnicodeObject *v)
8328{
8329 return Py_BuildValue("(u#)", v->str, v->length);
8330}
8331
8332
Guido van Rossumd57fd912000-03-10 22:53:23 +00008333static PyMethodDef unicode_methods[] = {
8334
8335 /* Order is according to common usage: often used methods should
8336 appear first, since lookup is done sequentially. */
8337
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008338 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
8339 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8340 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008341 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008342 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8343 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8344 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8345 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8346 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8347 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8348 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008349 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008350 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8351 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8352 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008353 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008354 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8355 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8356 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008357 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008358 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008359 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008360 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008361 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8362 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8363 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8364 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8365 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8366 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8367 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8368 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8369 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8370 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8371 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8372 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8373 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8374 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008375 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00008376 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008377 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008378 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00008379 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008380 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8381 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Georg Brandlceee0772007-11-27 23:48:05 +00008382 {"maketrans", (PyCFunction) unicode_maketrans,
8383 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008384 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008385#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008386 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008387#endif
8388
8389#if 0
8390 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008391 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008392#endif
8393
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008394 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008395 {NULL, NULL}
8396};
8397
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008398static PyObject *
8399unicode_mod(PyObject *v, PyObject *w)
8400{
8401 if (!PyUnicode_Check(v)) {
8402 Py_INCREF(Py_NotImplemented);
8403 return Py_NotImplemented;
8404 }
8405 return PyUnicode_Format(v, w);
8406}
8407
8408static PyNumberMethods unicode_as_number = {
8409 0, /*nb_add*/
8410 0, /*nb_subtract*/
8411 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008412 unicode_mod, /*nb_remainder*/
8413};
8414
Guido van Rossumd57fd912000-03-10 22:53:23 +00008415static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008416 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008417 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008418 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8419 (ssizeargfunc) unicode_getitem, /* sq_item */
Thomas Woutersd2cf20e2007-08-30 22:57:53 +00008420 0, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008421 0, /* sq_ass_item */
8422 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008423 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008424};
8425
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008426static PyObject*
8427unicode_subscript(PyUnicodeObject* self, PyObject* item)
8428{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008429 if (PyIndex_Check(item)) {
8430 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008431 if (i == -1 && PyErr_Occurred())
8432 return NULL;
8433 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008434 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008435 return unicode_getitem(self, i);
8436 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008437 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008438 Py_UNICODE* source_buf;
8439 Py_UNICODE* result_buf;
8440 PyObject* result;
8441
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008442 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008443 &start, &stop, &step, &slicelength) < 0) {
8444 return NULL;
8445 }
8446
8447 if (slicelength <= 0) {
8448 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008449 } else if (start == 0 && step == 1 && slicelength == self->length &&
8450 PyUnicode_CheckExact(self)) {
8451 Py_INCREF(self);
8452 return (PyObject *)self;
8453 } else if (step == 1) {
8454 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008455 } else {
8456 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00008457 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8458 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008459
8460 if (result_buf == NULL)
8461 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008462
8463 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8464 result_buf[i] = source_buf[cur];
8465 }
Tim Petersced69f82003-09-16 20:30:58 +00008466
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008467 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00008468 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008469 return result;
8470 }
8471 } else {
8472 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8473 return NULL;
8474 }
8475}
8476
8477static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008478 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008479 (binaryfunc)unicode_subscript, /* mp_subscript */
8480 (objobjargproc)0, /* mp_ass_subscript */
8481};
8482
Guido van Rossumd57fd912000-03-10 22:53:23 +00008483
/* Helpers for PyUnicode_Format() */
8485
8486static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008487getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008488{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008489 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008490 if (argidx < arglen) {
8491 (*p_argidx)++;
8492 if (arglen < 0)
8493 return args;
8494 else
8495 return PyTuple_GetItem(args, argidx);
8496 }
8497 PyErr_SetString(PyExc_TypeError,
8498 "not enough arguments for format string");
8499 return NULL;
8500}
8501
Martin v. Löwis18e16552006-02-15 17:27:45 +00008502static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008503strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008504{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008505 register Py_ssize_t i;
8506 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008507 for (i = len - 1; i >= 0; i--)
8508 buffer[i] = (Py_UNICODE) charbuffer[i];
8509
Guido van Rossumd57fd912000-03-10 22:53:23 +00008510 return len;
8511}
8512
Neal Norwitzfc76d632006-01-10 06:03:13 +00008513static int
8514doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8515{
Tim Peters15231542006-02-16 01:08:01 +00008516 Py_ssize_t result;
8517
Neal Norwitzfc76d632006-01-10 06:03:13 +00008518 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008519 result = strtounicode(buffer, (char *)buffer);
8520 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008521}
8522
#if 0
/* Currently compiled out.  Format the long x according to `format` into
   buffer: the text is produced as chars by PyOS_snprintf() in buffer's own
   storage (len is the size limit) and then widened in place by
   strtounicode().  Returns the number of characters produced, as an int. */
static int
longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
{
    char *ascii = (char *)buffer;
    Py_ssize_t nchars;

    PyOS_snprintf(ascii, len, format, x);
    nchars = strtounicode(buffer, ascii);
    return Py_SAFE_DOWNCAST(nchars, Py_ssize_t, int);
}
#endif
Neal Norwitzfc76d632006-01-10 06:03:13 +00008534
/* XXX To save some code duplication, formatfloat/long/int could have been
   shared with stringobject.c, converting from 8-bit to Unicode after the
   formatting is done. */
8538
Guido van Rossumd57fd912000-03-10 22:53:23 +00008539static int
8540formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008541 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008542 int flags,
8543 int prec,
8544 int type,
8545 PyObject *v)
8546{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008547 /* fmt = '%#.' + `prec` + `type`
8548 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00008549 char fmt[20];
8550 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008551
Guido van Rossumd57fd912000-03-10 22:53:23 +00008552 x = PyFloat_AsDouble(v);
8553 if (x == -1.0 && PyErr_Occurred())
8554 return -1;
8555 if (prec < 0)
8556 prec = 6;
Eric Smith22b85b32008-07-17 19:18:29 +00008557 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8558 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008559 /* Worst case length calc to ensure no buffer overrun:
8560
8561 'g' formats:
8562 fmt = %#.<prec>g
8563 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8564 for any double rep.)
8565 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8566
8567 'f' formats:
8568 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8569 len = 1 + 50 + 1 + prec = 52 + prec
8570
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008571 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008572 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008573
8574 */
Guido van Rossumb5a755e2007-07-18 18:15:48 +00008575 if (((type == 'g' || type == 'G') &&
8576 buflen <= (size_t)10 + (size_t)prec) ||
Eric Smith22b85b32008-07-17 19:18:29 +00008577 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008578 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008579 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008580 return -1;
8581 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008582 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8583 (flags&F_ALT) ? "#" : "",
8584 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008585 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008586}
8587
Tim Peters38fd5b62000-09-21 05:43:11 +00008588static PyObject*
8589formatlong(PyObject *val, int flags, int prec, int type)
8590{
8591 char *buf;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008592 int len;
Tim Peters38fd5b62000-09-21 05:43:11 +00008593 PyObject *str; /* temporary string object. */
Walter Dörwald63a28be2007-06-20 15:11:12 +00008594 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008595
Christian Heimes72b710a2008-05-26 13:28:38 +00008596 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
Tim Peters38fd5b62000-09-21 05:43:11 +00008597 if (!str)
8598 return NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008599 result = PyUnicode_FromStringAndSize(buf, len);
Tim Peters38fd5b62000-09-21 05:43:11 +00008600 Py_DECREF(str);
Walter Dörwald63a28be2007-06-20 15:11:12 +00008601 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008602}
8603
#if 0
/* Currently compiled out.  Format the integer object v (converted with
   PyLong_AsLong) into buf according to flags, prec and the conversion
   character `type`.  Returns the number of characters written, or -1 with
   an exception set. */
static int
formatint(Py_UNICODE *buf,
          size_t buflen,
          int flags,
          int prec,
          int type,
          PyObject *v)
{
    /* fmt = '%#.' + `prec` + 'l' + `type`
     * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
     * + 1 + 1
     * = 24
     */
    char fmt[64];       /* plenty big enough! */
    char *sign;
    long x;
    int radix_conv;     /* true for the 'x', 'X' and 'o' conversions */

    x = PyLong_AsLong(v);
    if (x == -1 && PyErr_Occurred())
        return -1;

    /* A negative value cannot be shown as unsigned; treat %u as %d. */
    if (x < 0 && type == 'u')
        type = 'd';

    radix_conv = (type == 'x' || type == 'X' || type == 'o');

    /* For hex/octal the sign is emitted by hand and the magnitude is
       formatted separately (see the negation at the end). */
    sign = (x < 0 && radix_conv) ? "-" : "";

    if (prec < 0)
        prec = 1;

    /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
     * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
     */
    if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
        PyErr_SetString(PyExc_OverflowError,
                        "formatted integer is too long (precision too large?)");
        return -1;
    }

    if ((flags & F_ALT) && radix_conv) {
        /* When converting under %#o, %#x or %#X, there are a number
         * of issues that cause pain:
         * - for %#o, we want a different base marker than C
         * - when 0 is being converted, the C standard leaves off
         *   the '0x' or '0X', which is inconsistent with other
         *   %#x/%#X conversions and inconsistent with Python's
         *   hex() function
         * - there are platforms that violate the standard and
         *   convert 0 with the '0x' or '0X'
         *   (Metrowerks, Compaq Tru64)
         * - there are platforms that give '0x' when converting
         *   under %#X, but convert 0 in accordance with the
         *   standard (OS/2 EMX)
         *
         * We can achieve the desired consistency by inserting our
         * own '0x' or '0X' prefix, and substituting %x/%X in place
         * of %#x/%#X.
         *
         * Note that this is the same approach as used in
         * formatint() in stringobject.c
         */
        PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
                      sign, type, prec, type);
    }
    else {
        PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
                      sign, (flags&F_ALT) ? "#" : "",
                      prec, type);
    }

    /* If a sign was placed in the format by hand, format the magnitude. */
    return longtounicode(buf, buflen, fmt, sign[0] ? -x : x);
}
#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008681
8682static int
8683formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008684 size_t buflen,
8685 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008686{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008687 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008688 if (PyUnicode_Check(v)) {
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008689 if (PyUnicode_GET_SIZE(v) == 1) {
8690 buf[0] = PyUnicode_AS_UNICODE(v)[0];
8691 buf[1] = '\0';
8692 return 1;
8693 }
8694#ifndef Py_UNICODE_WIDE
8695 if (PyUnicode_GET_SIZE(v) == 2) {
8696 /* Decode a valid surrogate pair */
8697 int c0 = PyUnicode_AS_UNICODE(v)[0];
8698 int c1 = PyUnicode_AS_UNICODE(v)[1];
8699 if (0xD800 <= c0 && c0 <= 0xDBFF &&
8700 0xDC00 <= c1 && c1 <= 0xDFFF) {
8701 buf[0] = c0;
8702 buf[1] = c1;
8703 buf[2] = '\0';
8704 return 2;
8705 }
8706 }
8707#endif
8708 goto onError;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008709 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008710 else {
8711 /* Integer input truncated to a character */
8712 long x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008713 x = PyLong_AsLong(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008714 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008715 goto onError;
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008716
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008717 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008718 PyErr_SetString(PyExc_OverflowError,
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008719 "%c arg not in range(0x110000)");
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008720 return -1;
8721 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008722
8723#ifndef Py_UNICODE_WIDE
8724 if (x > 0xffff) {
8725 x -= 0x10000;
8726 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
8727 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
8728 return 2;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008729 }
8730#endif
8731 buf[0] = (Py_UNICODE) x;
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008732 buf[1] = '\0';
8733 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008734 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008735
8736 onError:
8737 PyErr_SetString(PyExc_TypeError,
8738 "%c requires int or char");
8739 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008740}
8741
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so that the macro behaves as a
   single operand in any expression (e.g. `sizeof FORMATBUFLEN`).
*/
#define FORMATBUFLEN ((size_t)120)
8751
Guido van Rossumd57fd912000-03-10 22:53:23 +00008752PyObject *PyUnicode_Format(PyObject *format,
8753 PyObject *args)
8754{
8755 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008756 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008757 int args_owned = 0;
8758 PyUnicodeObject *result = NULL;
8759 PyObject *dict = NULL;
8760 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008761
Guido van Rossumd57fd912000-03-10 22:53:23 +00008762 if (format == NULL || args == NULL) {
8763 PyErr_BadInternalCall();
8764 return NULL;
8765 }
8766 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008767 if (uformat == NULL)
8768 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008769 fmt = PyUnicode_AS_UNICODE(uformat);
8770 fmtcnt = PyUnicode_GET_SIZE(uformat);
8771
8772 reslen = rescnt = fmtcnt + 100;
8773 result = _PyUnicode_New(reslen);
8774 if (result == NULL)
8775 goto onError;
8776 res = PyUnicode_AS_UNICODE(result);
8777
8778 if (PyTuple_Check(args)) {
8779 arglen = PyTuple_Size(args);
8780 argidx = 0;
8781 }
8782 else {
8783 arglen = -1;
8784 argidx = -2;
8785 }
Christian Heimes90aa7642007-12-19 02:45:37 +00008786 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00008787 !PyUnicode_Check(args))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008788 dict = args;
8789
8790 while (--fmtcnt >= 0) {
8791 if (*fmt != '%') {
8792 if (--rescnt < 0) {
8793 rescnt = fmtcnt + 100;
8794 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008795 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008796 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008797 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8798 --rescnt;
8799 }
8800 *res++ = *fmt++;
8801 }
8802 else {
8803 /* Got a format specifier */
8804 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008805 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008806 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008807 Py_UNICODE c = '\0';
8808 Py_UNICODE fill;
Christian Heimesa612dc02008-02-24 13:08:18 +00008809 int isnumok;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008810 PyObject *v = NULL;
8811 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008812 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008813 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008814 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008815 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008816
8817 fmt++;
8818 if (*fmt == '(') {
8819 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008820 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008821 PyObject *key;
8822 int pcount = 1;
8823
8824 if (dict == NULL) {
8825 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008826 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008827 goto onError;
8828 }
8829 ++fmt;
8830 --fmtcnt;
8831 keystart = fmt;
8832 /* Skip over balanced parentheses */
8833 while (pcount > 0 && --fmtcnt >= 0) {
8834 if (*fmt == ')')
8835 --pcount;
8836 else if (*fmt == '(')
8837 ++pcount;
8838 fmt++;
8839 }
8840 keylen = fmt - keystart - 1;
8841 if (fmtcnt < 0 || pcount > 0) {
8842 PyErr_SetString(PyExc_ValueError,
8843 "incomplete format key");
8844 goto onError;
8845 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008846#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008847 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008848 then looked up since Python uses strings to hold
8849 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008850 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008851 key = PyUnicode_EncodeUTF8(keystart,
8852 keylen,
8853 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008854#else
8855 key = PyUnicode_FromUnicode(keystart, keylen);
8856#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008857 if (key == NULL)
8858 goto onError;
8859 if (args_owned) {
8860 Py_DECREF(args);
8861 args_owned = 0;
8862 }
8863 args = PyObject_GetItem(dict, key);
8864 Py_DECREF(key);
8865 if (args == NULL) {
8866 goto onError;
8867 }
8868 args_owned = 1;
8869 arglen = -1;
8870 argidx = -2;
8871 }
8872 while (--fmtcnt >= 0) {
8873 switch (c = *fmt++) {
8874 case '-': flags |= F_LJUST; continue;
8875 case '+': flags |= F_SIGN; continue;
8876 case ' ': flags |= F_BLANK; continue;
8877 case '#': flags |= F_ALT; continue;
8878 case '0': flags |= F_ZERO; continue;
8879 }
8880 break;
8881 }
8882 if (c == '*') {
8883 v = getnextarg(args, arglen, &argidx);
8884 if (v == NULL)
8885 goto onError;
Christian Heimes217cfd12007-12-02 14:31:20 +00008886 if (!PyLong_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008887 PyErr_SetString(PyExc_TypeError,
8888 "* wants int");
8889 goto onError;
8890 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008891 width = PyLong_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008892 if (width == -1 && PyErr_Occurred())
8893 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008894 if (width < 0) {
8895 flags |= F_LJUST;
8896 width = -width;
8897 }
8898 if (--fmtcnt >= 0)
8899 c = *fmt++;
8900 }
8901 else if (c >= '0' && c <= '9') {
8902 width = c - '0';
8903 while (--fmtcnt >= 0) {
8904 c = *fmt++;
8905 if (c < '0' || c > '9')
8906 break;
8907 if ((width*10) / 10 != width) {
8908 PyErr_SetString(PyExc_ValueError,
8909 "width too big");
8910 goto onError;
8911 }
8912 width = width*10 + (c - '0');
8913 }
8914 }
8915 if (c == '.') {
8916 prec = 0;
8917 if (--fmtcnt >= 0)
8918 c = *fmt++;
8919 if (c == '*') {
8920 v = getnextarg(args, arglen, &argidx);
8921 if (v == NULL)
8922 goto onError;
Christian Heimes217cfd12007-12-02 14:31:20 +00008923 if (!PyLong_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008924 PyErr_SetString(PyExc_TypeError,
8925 "* wants int");
8926 goto onError;
8927 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008928 prec = PyLong_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008929 if (prec == -1 && PyErr_Occurred())
8930 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008931 if (prec < 0)
8932 prec = 0;
8933 if (--fmtcnt >= 0)
8934 c = *fmt++;
8935 }
8936 else if (c >= '0' && c <= '9') {
8937 prec = c - '0';
8938 while (--fmtcnt >= 0) {
8939 c = Py_CHARMASK(*fmt++);
8940 if (c < '0' || c > '9')
8941 break;
8942 if ((prec*10) / 10 != prec) {
8943 PyErr_SetString(PyExc_ValueError,
8944 "prec too big");
8945 goto onError;
8946 }
8947 prec = prec*10 + (c - '0');
8948 }
8949 }
8950 } /* prec */
8951 if (fmtcnt >= 0) {
8952 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008953 if (--fmtcnt >= 0)
8954 c = *fmt++;
8955 }
8956 }
8957 if (fmtcnt < 0) {
8958 PyErr_SetString(PyExc_ValueError,
8959 "incomplete format");
8960 goto onError;
8961 }
8962 if (c != '%') {
8963 v = getnextarg(args, arglen, &argidx);
8964 if (v == NULL)
8965 goto onError;
8966 }
8967 sign = 0;
8968 fill = ' ';
8969 switch (c) {
8970
8971 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008972 pbuf = formatbuf;
8973 /* presume that buffer length is at least 1 */
8974 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008975 len = 1;
8976 break;
8977
8978 case 's':
8979 case 'r':
Georg Brandl559e5d72008-06-11 18:37:52 +00008980 case 'a':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008981 if (PyUnicode_Check(v) && c == 's') {
8982 temp = v;
8983 Py_INCREF(temp);
8984 }
8985 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008986 if (c == 's')
Thomas Heller519a0422007-11-15 20:48:54 +00008987 temp = PyObject_Str(v);
Georg Brandl559e5d72008-06-11 18:37:52 +00008988 else if (c == 'r')
Guido van Rossumd57fd912000-03-10 22:53:23 +00008989 temp = PyObject_Repr(v);
Georg Brandl559e5d72008-06-11 18:37:52 +00008990 else
8991 temp = PyObject_ASCII(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008992 if (temp == NULL)
8993 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008994 if (PyUnicode_Check(temp))
8995 /* nothing to do */;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008996 else {
8997 Py_DECREF(temp);
8998 PyErr_SetString(PyExc_TypeError,
8999 "%s argument has non-string str()");
9000 goto onError;
9001 }
9002 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009003 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009004 len = PyUnicode_GET_SIZE(temp);
9005 if (prec >= 0 && len > prec)
9006 len = prec;
9007 break;
9008
9009 case 'i':
9010 case 'd':
9011 case 'u':
9012 case 'o':
9013 case 'x':
9014 case 'X':
9015 if (c == 'i')
9016 c = 'd';
Christian Heimesa612dc02008-02-24 13:08:18 +00009017 isnumok = 0;
9018 if (PyNumber_Check(v)) {
9019 PyObject *iobj=NULL;
9020
9021 if (PyLong_Check(v)) {
9022 iobj = v;
9023 Py_INCREF(iobj);
9024 }
9025 else {
9026 iobj = PyNumber_Long(v);
9027 }
9028 if (iobj!=NULL) {
9029 if (PyLong_Check(iobj)) {
9030 isnumok = 1;
9031 temp = formatlong(iobj, flags, prec, c);
9032 Py_DECREF(iobj);
9033 if (!temp)
9034 goto onError;
9035 pbuf = PyUnicode_AS_UNICODE(temp);
9036 len = PyUnicode_GET_SIZE(temp);
9037 sign = 1;
9038 }
9039 else {
9040 Py_DECREF(iobj);
9041 }
9042 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009043 }
Christian Heimesa612dc02008-02-24 13:08:18 +00009044 if (!isnumok) {
9045 PyErr_Format(PyExc_TypeError,
9046 "%%%c format: a number is required, "
Martin v. Löwis5a6f4582008-04-07 03:22:07 +00009047 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
Tim Peters38fd5b62000-09-21 05:43:11 +00009048 goto onError;
Tim Peters38fd5b62000-09-21 05:43:11 +00009049 }
9050 if (flags & F_ZERO)
9051 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00009052 break;
9053
9054 case 'e':
9055 case 'E':
9056 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00009057 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00009058 case 'g':
9059 case 'G':
Eric Smith22b85b32008-07-17 19:18:29 +00009060 if (c == 'F')
9061 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009062 pbuf = formatbuf;
9063 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
9064 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009065 if (len < 0)
9066 goto onError;
9067 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00009068 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009069 fill = '0';
9070 break;
9071
9072 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009073 pbuf = formatbuf;
9074 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009075 if (len < 0)
9076 goto onError;
9077 break;
9078
9079 default:
9080 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00009081 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00009082 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00009083 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00009084 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00009085 (Py_ssize_t)(fmt - 1 -
9086 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00009087 goto onError;
9088 }
9089 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009090 if (*pbuf == '-' || *pbuf == '+') {
9091 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009092 len--;
9093 }
9094 else if (flags & F_SIGN)
9095 sign = '+';
9096 else if (flags & F_BLANK)
9097 sign = ' ';
9098 else
9099 sign = 0;
9100 }
9101 if (width < len)
9102 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00009103 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009104 reslen -= rescnt;
9105 rescnt = width + fmtcnt + 100;
9106 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00009107 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00009108 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00009109 PyErr_NoMemory();
9110 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00009111 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00009112 if (_PyUnicode_Resize(&result, reslen) < 0) {
9113 Py_XDECREF(temp);
9114 goto onError;
9115 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009116 res = PyUnicode_AS_UNICODE(result)
9117 + reslen - rescnt;
9118 }
9119 if (sign) {
9120 if (fill != ' ')
9121 *res++ = sign;
9122 rescnt--;
9123 if (width > len)
9124 width--;
9125 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00009126 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00009127 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00009128 assert(pbuf[1] == c);
9129 if (fill != ' ') {
9130 *res++ = *pbuf++;
9131 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00009132 }
Tim Petersfff53252001-04-12 18:38:48 +00009133 rescnt -= 2;
9134 width -= 2;
9135 if (width < 0)
9136 width = 0;
9137 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00009138 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009139 if (width > len && !(flags & F_LJUST)) {
9140 do {
9141 --rescnt;
9142 *res++ = fill;
9143 } while (--width > len);
9144 }
Tim Peters38fd5b62000-09-21 05:43:11 +00009145 if (fill == ' ') {
9146 if (sign)
9147 *res++ = sign;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00009148 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00009149 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00009150 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00009151 *res++ = *pbuf++;
9152 *res++ = *pbuf++;
9153 }
9154 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009155 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009156 res += len;
9157 rescnt -= len;
9158 while (--width >= len) {
9159 --rescnt;
9160 *res++ = ' ';
9161 }
9162 if (dict && (argidx < arglen) && c != '%') {
9163 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00009164 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009165 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009166 goto onError;
9167 }
9168 Py_XDECREF(temp);
9169 } /* '%' */
9170 } /* until end */
9171 if (argidx < arglen && !dict) {
9172 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00009173 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009174 goto onError;
9175 }
9176
Thomas Woutersa96affe2006-03-12 00:29:36 +00009177 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
9178 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009179 if (args_owned) {
9180 Py_DECREF(args);
9181 }
9182 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009183 return (PyObject *)result;
9184
9185 onError:
9186 Py_XDECREF(result);
9187 Py_DECREF(uformat);
9188 if (args_owned) {
9189 Py_DECREF(args);
9190 }
9191 return NULL;
9192}
9193
Jeremy Hylton938ace62002-07-17 16:30:39 +00009194static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009195unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9196
Tim Peters6d6c1a32001-08-02 04:15:00 +00009197static PyObject *
9198unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9199{
9200 PyObject *x = NULL;
Guido van Rossum55b4a7b2007-07-11 09:28:11 +00009201 static char *kwlist[] = {"object", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00009202 char *encoding = NULL;
9203 char *errors = NULL;
9204
Guido van Rossume023fe02001-08-30 03:12:59 +00009205 if (type != &PyUnicode_Type)
9206 return unicode_subtype_new(type, args, kwds);
Alexandre Vassalotti999679a2008-05-03 04:42:16 +00009207 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Tim Peters6d6c1a32001-08-02 04:15:00 +00009208 kwlist, &x, &encoding, &errors))
9209 return NULL;
9210 if (x == NULL)
9211 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00009212 if (encoding == NULL && errors == NULL)
Thomas Heller519a0422007-11-15 20:48:54 +00009213 return PyObject_Str(x);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00009214 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00009215 return PyUnicode_FromEncodedObject(x, encoding, errors);
9216}
9217
Guido van Rossume023fe02001-08-30 03:12:59 +00009218static PyObject *
9219unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9220{
Tim Petersaf90b3e2001-09-12 05:18:58 +00009221 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009222 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009223
9224 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9225 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9226 if (tmp == NULL)
9227 return NULL;
9228 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00009229 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00009230 if (pnew == NULL) {
9231 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00009232 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00009233 }
Christian Heimesb186d002008-03-18 15:15:01 +00009234 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
Tim Petersaf90b3e2001-09-12 05:18:58 +00009235 if (pnew->str == NULL) {
9236 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00009237 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00009238 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00009239 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00009240 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00009241 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9242 pnew->length = n;
9243 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00009244 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00009245 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009246}
9247
/* Docstring installed as tp_doc of PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
"str(string[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009254
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009255static PyObject *unicode_iter(PyObject *seq);
9256
/* Type object for the built-in "str" type.  Instances have a fixed basic
   size of sizeof(PyUnicodeObject) and an item size of 0.  The type is
   subclassable (Py_TPFLAGS_BASETYPE) and new instances are created through
   unicode_new(). */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                              /* tp_name */
    sizeof(PyUnicodeObject),            /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,        /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_compare */
    unicode_repr,                       /* tp_repr */
    &unicode_as_number,                 /* tp_as_number */
    &unicode_as_sequence,               /* tp_as_sequence */
    &unicode_as_mapping,                /* tp_as_mapping */
    (hashfunc) unicode_hash,            /* tp_hash*/
    0,                                  /* tp_call*/
    (reprfunc) unicode_str,             /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    0,                                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,        /* tp_flags */
    unicode_doc,                        /* tp_doc */
    0,                                  /* tp_traverse */
    0,                                  /* tp_clear */
    PyUnicode_RichCompare,              /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    unicode_iter,                       /* tp_iter */
    0,                                  /* tp_iternext */
    unicode_methods,                    /* tp_methods */
    0,                                  /* tp_members */
    0,                                  /* tp_getset */
    &PyBaseObject_Type,                 /* tp_base */
    0,                                  /* tp_dict */
    0,                                  /* tp_descr_get */
    0,                                  /* tp_descr_set */
    0,                                  /* tp_dictoffset */
    0,                                  /* tp_init */
    0,                                  /* tp_alloc */
    unicode_new,                        /* tp_new */
    PyObject_Del,                       /* tp_free */
};
9300
9301/* Initialize the Unicode implementation */
9302
Thomas Wouters78890102000-07-22 19:25:51 +00009303void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009304{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009305 int i;
9306
Thomas Wouters477c8d52006-05-27 19:21:47 +00009307 /* XXX - move this array to unicodectype.c ? */
9308 Py_UNICODE linebreak[] = {
9309 0x000A, /* LINE FEED */
9310 0x000D, /* CARRIAGE RETURN */
9311 0x001C, /* FILE SEPARATOR */
9312 0x001D, /* GROUP SEPARATOR */
9313 0x001E, /* RECORD SEPARATOR */
9314 0x0085, /* NEXT LINE */
9315 0x2028, /* LINE SEPARATOR */
9316 0x2029, /* PARAGRAPH SEPARATOR */
9317 };
9318
Fred Drakee4315f52000-05-09 19:53:39 +00009319 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009320 free_list = NULL;
9321 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009322 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009323 if (!unicode_empty)
9324 return;
9325
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009326 for (i = 0; i < 256; i++)
9327 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009328 if (PyType_Ready(&PyUnicode_Type) < 0)
9329 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009330
9331 /* initialize the linebreak bloom filter */
9332 bloom_linebreak = make_bloom_mask(
9333 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9334 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009335
9336 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009337}
9338
9339/* Finalize the Unicode implementation */
9340
Christian Heimesa156e092008-02-16 07:38:31 +00009341int
9342PyUnicode_ClearFreeList(void)
9343{
9344 int freelist_size = numfree;
9345 PyUnicodeObject *u;
9346
9347 for (u = free_list; u != NULL;) {
9348 PyUnicodeObject *v = u;
9349 u = *(PyUnicodeObject **)u;
9350 if (v->str)
Christian Heimesb186d002008-03-18 15:15:01 +00009351 PyObject_DEL(v->str);
Christian Heimesa156e092008-02-16 07:38:31 +00009352 Py_XDECREF(v->defenc);
9353 PyObject_Del(v);
9354 numfree--;
9355 }
9356 free_list = NULL;
9357 assert(numfree == 0);
9358 return freelist_size;
9359}
9360
Guido van Rossumd57fd912000-03-10 22:53:23 +00009361void
Thomas Wouters78890102000-07-22 19:25:51 +00009362_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009363{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009364 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009365
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009366 Py_XDECREF(unicode_empty);
9367 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009368
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009369 for (i = 0; i < 256; i++) {
9370 if (unicode_latin1[i]) {
9371 Py_DECREF(unicode_latin1[i]);
9372 unicode_latin1[i] = NULL;
9373 }
9374 }
Christian Heimesa156e092008-02-16 07:38:31 +00009375 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009376}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009377
Walter Dörwald16807132007-05-25 13:52:07 +00009378void
9379PyUnicode_InternInPlace(PyObject **p)
9380{
9381 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9382 PyObject *t;
9383 if (s == NULL || !PyUnicode_Check(s))
9384 Py_FatalError(
9385 "PyUnicode_InternInPlace: unicode strings only please!");
9386 /* If it's a subclass, we don't really know what putting
9387 it in the interned dict might do. */
9388 if (!PyUnicode_CheckExact(s))
9389 return;
9390 if (PyUnicode_CHECK_INTERNED(s))
9391 return;
9392 if (interned == NULL) {
9393 interned = PyDict_New();
9394 if (interned == NULL) {
9395 PyErr_Clear(); /* Don't leave an exception */
9396 return;
9397 }
9398 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009399 /* It might be that the GetItem call fails even
9400 though the key is present in the dictionary,
9401 namely when this happens during a stack overflow. */
9402 Py_ALLOW_RECURSION
Walter Dörwald16807132007-05-25 13:52:07 +00009403 t = PyDict_GetItem(interned, (PyObject *)s);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009404 Py_END_ALLOW_RECURSION
9405
Walter Dörwald16807132007-05-25 13:52:07 +00009406 if (t) {
9407 Py_INCREF(t);
9408 Py_DECREF(*p);
9409 *p = t;
9410 return;
9411 }
9412
Martin v. Löwis5b222132007-06-10 09:51:05 +00009413 PyThreadState_GET()->recursion_critical = 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009414 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9415 PyErr_Clear();
Martin v. Löwis5b222132007-06-10 09:51:05 +00009416 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009417 return;
9418 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009419 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009420 /* The two references in interned are not counted by refcnt.
9421 The deallocator will take care of this */
Christian Heimes90aa7642007-12-19 02:45:37 +00009422 Py_REFCNT(s) -= 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009423 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
9424}
9425
9426void
9427PyUnicode_InternImmortal(PyObject **p)
9428{
9429 PyUnicode_InternInPlace(p);
9430 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9431 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9432 Py_INCREF(*p);
9433 }
9434}
9435
9436PyObject *
9437PyUnicode_InternFromString(const char *cp)
9438{
9439 PyObject *s = PyUnicode_FromString(cp);
9440 if (s == NULL)
9441 return NULL;
9442 PyUnicode_InternInPlace(&s);
9443 return s;
9444}
9445
9446void _Py_ReleaseInternedUnicodeStrings(void)
9447{
9448 PyObject *keys;
9449 PyUnicodeObject *s;
9450 Py_ssize_t i, n;
9451 Py_ssize_t immortal_size = 0, mortal_size = 0;
9452
9453 if (interned == NULL || !PyDict_Check(interned))
9454 return;
9455 keys = PyDict_Keys(interned);
9456 if (keys == NULL || !PyList_Check(keys)) {
9457 PyErr_Clear();
9458 return;
9459 }
9460
9461 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9462 detector, interned unicode strings are not forcibly deallocated;
9463 rather, we give them their stolen references back, and then clear
9464 and DECREF the interned dict. */
9465
9466 n = PyList_GET_SIZE(keys);
9467 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
9468 n);
9469 for (i = 0; i < n; i++) {
9470 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9471 switch (s->state) {
9472 case SSTATE_NOT_INTERNED:
9473 /* XXX Shouldn't happen */
9474 break;
9475 case SSTATE_INTERNED_IMMORTAL:
Christian Heimes90aa7642007-12-19 02:45:37 +00009476 Py_REFCNT(s) += 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009477 immortal_size += s->length;
9478 break;
9479 case SSTATE_INTERNED_MORTAL:
Christian Heimes90aa7642007-12-19 02:45:37 +00009480 Py_REFCNT(s) += 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009481 mortal_size += s->length;
9482 break;
9483 default:
9484 Py_FatalError("Inconsistent interned string state.");
9485 }
9486 s->state = SSTATE_NOT_INTERNED;
9487 }
9488 fprintf(stderr, "total size of all interned strings: "
9489 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9490 "mortal/immortal\n", mortal_size, immortal_size);
9491 Py_DECREF(keys);
9492 PyDict_Clear(interned);
9493 Py_DECREF(interned);
9494 interned = NULL;
9495}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009496
9497
9498/********************* Unicode Iterator **************************/
9499
/* State of a str iterator: the string being walked and the current
   position in it. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;     /* Index of the next item unicodeiter_next() returns */
    PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
9505
/* tp_dealloc for the str iterator: remove it from GC tracking, drop the
   reference to the underlying string (it_seq may already be NULL), and
   free the iterator object. */
static void
unicodeiter_dealloc(unicodeiterobject *it)
{
    _PyObject_GC_UNTRACK(it);
    Py_XDECREF(it->it_seq);
    PyObject_GC_Del(it);
}
9513
/* tp_traverse for the str iterator: the only object it refers to is the
   string in it_seq.  Py_VISIT relies on the parameter names `visit` and
   `arg`. */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    Py_VISIT(it->it_seq);
    return 0;
}
9520
9521static PyObject *
9522unicodeiter_next(unicodeiterobject *it)
9523{
9524 PyUnicodeObject *seq;
9525 PyObject *item;
9526
9527 assert(it != NULL);
9528 seq = it->it_seq;
9529 if (seq == NULL)
9530 return NULL;
9531 assert(PyUnicode_Check(seq));
9532
9533 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009534 item = PyUnicode_FromUnicode(
9535 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009536 if (item != NULL)
9537 ++it->it_index;
9538 return item;
9539 }
9540
9541 Py_DECREF(seq);
9542 it->it_seq = NULL;
9543 return NULL;
9544}
9545
9546static PyObject *
9547unicodeiter_len(unicodeiterobject *it)
9548{
9549 Py_ssize_t len = 0;
9550 if (it->it_seq)
9551 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
Christian Heimes217cfd12007-12-02 14:31:20 +00009552 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009553}
9554
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");

/* Method table of the str iterator; exposes only __length_hint__, which
   takes no arguments and is implemented by unicodeiter_len(). */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {NULL,      NULL}       /* sentinel */
};
9562
/* Type object for "str_iterator", the iterator created by unicode_iter().
   It participates in cyclic GC (Py_TPFLAGS_HAVE_GC with a traverse
   function) and is its own iterator (tp_iter is PyObject_SelfIter). */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",                     /* tp_name */
    sizeof(unicodeiterobject),          /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_compare */
    0,                                  /* tp_repr */
    0,                                  /* tp_as_number */
    0,                                  /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    0,                                  /* tp_hash */
    0,                                  /* tp_call */
    0,                                  /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    0,                                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    PyObject_SelfIter,                  /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,                /* tp_methods */
    0,                                  /* tp_members */
};
9595
9596static PyObject *
9597unicode_iter(PyObject *seq)
9598{
9599 unicodeiterobject *it;
9600
9601 if (!PyUnicode_Check(seq)) {
9602 PyErr_BadInternalCall();
9603 return NULL;
9604 }
9605 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9606 if (it == NULL)
9607 return NULL;
9608 it->it_index = 0;
9609 Py_INCREF(seq);
9610 it->it_seq = (PyUnicodeObject *)seq;
9611 _PyObject_GC_TRACK(it);
9612 return (PyObject *)it;
9613}
9614
Martin v. Löwis5b222132007-06-10 09:51:05 +00009615size_t
9616Py_UNICODE_strlen(const Py_UNICODE *u)
9617{
9618 int res = 0;
9619 while(*u++)
9620 res++;
9621 return res;
9622}
9623
9624Py_UNICODE*
9625Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9626{
9627 Py_UNICODE *u = s1;
9628 while ((*u++ = *s2++));
9629 return s1;
9630}
9631
9632Py_UNICODE*
9633Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9634{
9635 Py_UNICODE *u = s1;
9636 while ((*u++ = *s2++))
9637 if (n-- == 0)
9638 break;
9639 return s1;
9640}
9641
9642int
9643Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9644{
9645 while (*s1 && *s2 && *s1 == *s2)
9646 s1++, s2++;
9647 if (*s1 && *s2)
9648 return (*s1 < *s2) ? -1 : +1;
9649 if (*s1)
9650 return 1;
9651 if (*s2)
9652 return -1;
9653 return 0;
9654}
9655
9656Py_UNICODE*
9657Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9658{
9659 const Py_UNICODE *p;
9660 for (p = s; *p; p++)
9661 if (*p == c)
9662 return (Py_UNICODE*)p;
9663 return NULL;
9664}
9665
9666
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009667#ifdef __cplusplus
9668}
9669#endif
9670
9671
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009672/*
9673Local variables:
9674c-basic-offset: 4
9675indent-tabs-mode: nil
9676End:
9677*/