blob: 38c3385ae829abb7a3f189774b064bce535a78eb [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000049#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000050#include <windows.h>
51#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000052
Guido van Rossumd57fd912000-03-10 22:53:23 +000053/* Limit for the Unicode object free list */
54
Christian Heimes2202f872008-02-06 14:31:34 +000055#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000056
57/* Limit for the Unicode object free list stay alive optimization.
58
59 The implementation will keep allocated Unicode memory intact for
60 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000061 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Christian Heimes2202f872008-02-06 14:31:34 +000063 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000065 malloc()-overhead) bytes of unused garbage.
66
67 Setting the limit to 0 effectively turns the feature off.
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069 Note: This is an experimental feature ! If you get core dumps when
70 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
72*/
73
Guido van Rossumfd4b9572000-04-10 13:51:10 +000074#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000075
76/* Endianness switches; defaults to little endian */
77
78#ifdef WORDS_BIGENDIAN
79# define BYTEORDER_IS_BIG_ENDIAN
80#else
81# define BYTEORDER_IS_LITTLE_ENDIAN
82#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
104static PyObject *interned;
105
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000107static PyUnicodeObject *free_list;
108static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000109
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000110/* The empty Unicode object is shared to improve performance. */
111static PyUnicodeObject *unicode_empty;
112
113/* Single character Unicode strings in the Latin-1 range are being
114 shared as well. */
115static PyUnicodeObject *unicode_latin1[256];
116
Fred Drakee4315f52000-05-09 19:53:39 +0000117/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000118 parameter; it is fixed to "utf-8". Always use the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000119 PyUnicode_GetDefaultEncoding() API to access this global.
120
Alexandre Vassalotti3d2fd7f2007-10-16 00:26:33 +0000121 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000122 hard coded default!
123*/
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000124static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000125
/* Fast detection of the most frequent whitespace characters.

   Lookup table indexed by an ASCII code point (0..127); an entry is 1
   when the code point is treated as whitespace and 0 otherwise.  The
   entries that are set are:

     0x09 HORIZONTAL TABULATION    0x1C FILE SEPARATOR
     0x0A LINE FEED                0x1D GROUP SEPARATOR
     0x0B VERTICAL TABULATION      0x1E RECORD SEPARATOR
     0x0C FORM FEED                0x1F UNIT SEPARATOR
     0x0D CARRIAGE RETURN          0x20 SPACE
*/
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
156
/* Same for linebreaks.

   Lookup table indexed by an ASCII code point (0..127); an entry is 1
   when the code point is treated as a line break and 0 otherwise.  The
   entries that are set are:

     0x0A LINE FEED                0x1C FILE SEPARATOR
     0x0D CARRIAGE RETURN          0x1D GROUP SEPARATOR
                                   0x1E RECORD SEPARATOR
*/
static unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
    /* 0x10 - 0x1F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
182
183
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000184Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000185PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000186{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000187#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000188 return 0x10FFFF;
189#else
190 /* This is actually an illegal character, so it should
191 not be passed to unichr. */
192 return 0xFFFF;
193#endif
194}
195
Thomas Wouters477c8d52006-05-27 19:21:47 +0000196/* --- Bloom Filters ----------------------------------------------------- */
197
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the 5 least
   significant bits of each Unicode character as the bit index. */
201
202/* the linebreak mask is set up by Unicode_Init below */
203
/* Integer type holding a bloom mask; only the low 32 bits are used. */
#define BLOOM_MASK unsigned long

/* Bloom mask consulted by BLOOM_LINEBREAK() for non-ASCII characters. */
static BLOOM_MASK bloom_linebreak;

/* Non-zero when the bit selected by the low 5 bits of `ch` is set in
   `mask`.  The shifted constant is `1UL`, not `1`: bit index 31 would
   otherwise shift into the sign bit of an `int`, which is undefined
   behaviour.  `mask` is parenthesized so that any expression can be
   passed safely. */
#define BLOOM(mask, ch) (((mask) & (1UL << ((ch) & 0x1F))))

/* Line-break test: a table lookup for ch < 128, otherwise the bloom
   mask is used as a cheap pre-filter in front of
   Py_UNICODE_ISLINEBREAK(). */
#define BLOOM_LINEBREAK(ch) \
    ((ch) < 128U ? ascii_linebreak[(ch)] : \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000213
214Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
215{
216 /* calculate simple bloom-style bitmask for a given unicode string */
217
218 long mask;
219 Py_ssize_t i;
220
221 mask = 0;
222 for (i = 0; i < len; i++)
223 mask |= (1 << (ptr[i] & 0x1F));
224
225 return mask;
226}
227
228Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
229{
230 Py_ssize_t i;
231
232 for (i = 0; i < setlen; i++)
233 if (set[i] == chr)
234 return 1;
235
236 return 0;
237}
238
/* Membership test: the bloom mask is a cheap pre-filter in front of
   the exact unicode_member() scan.  The whole expansion is
   parenthesized so the macro behaves as a single operand when it is
   negated or combined with other operators. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
241
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242/* --- Unicode Object ----------------------------------------------------- */
243
244static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000245int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000246 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247{
248 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000249
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000250 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000251 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000252 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000254 /* Resizing shared object (unicode_empty or single character
255 objects) in-place is not allowed. Use PyUnicode_Resize()
256 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000257
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000258 if (unicode == unicode_empty ||
259 (unicode->length == 1 &&
260 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000261 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000263 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264 return -1;
265 }
266
Thomas Wouters477c8d52006-05-27 19:21:47 +0000267 /* We allocate one more byte to make sure the string is Ux0000 terminated.
268 The overallocation is also used by fastsearch, which assumes that it's
269 safe to look at str[length] (without making any assumptions about what
270 it contains). */
271
Guido van Rossumd57fd912000-03-10 22:53:23 +0000272 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000273 unicode->str = PyObject_REALLOC(unicode->str,
274 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000275 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000276 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 PyErr_NoMemory();
278 return -1;
279 }
280 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000281 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000282
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000283 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000285 if (unicode->defenc) {
286 Py_DECREF(unicode->defenc);
287 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 }
289 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000290
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291 return 0;
292}
293
/* We allocate one more Py_UNICODE element than requested to make sure
   the string is Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
302
303static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000304PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305{
306 register PyUnicodeObject *unicode;
307
Thomas Wouters477c8d52006-05-27 19:21:47 +0000308 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000309 if (length == 0 && unicode_empty != NULL) {
310 Py_INCREF(unicode_empty);
311 return unicode_empty;
312 }
313
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000314 /* Ensure we won't overflow the size. */
315 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
316 return (PyUnicodeObject *)PyErr_NoMemory();
317 }
318
Guido van Rossumd57fd912000-03-10 22:53:23 +0000319 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000320 if (free_list) {
321 unicode = free_list;
322 free_list = *(PyUnicodeObject **)unicode;
323 numfree--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000324 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000325 /* Keep-Alive optimization: we only upsize the buffer,
326 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000327 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000328 unicode_resize(unicode, length) < 0) {
Christian Heimesb186d002008-03-18 15:15:01 +0000329 PyObject_DEL(unicode->str);
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000330 unicode->str = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000331 }
332 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000333 else {
Christian Heimesb186d002008-03-18 15:15:01 +0000334 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
335 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000336 }
337 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000338 }
339 else {
Christian Heimesb186d002008-03-18 15:15:01 +0000340 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000341 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000342 if (unicode == NULL)
343 return NULL;
Christian Heimesb186d002008-03-18 15:15:01 +0000344 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
345 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000346 }
347
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000348 if (!unicode->str) {
349 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000350 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000351 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000352 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000353 * the caller fails before initializing str -- unicode_resize()
354 * reads str[0], and the Keep-Alive optimization can keep memory
355 * allocated for str alive across a call to unicode_dealloc(unicode).
356 * We don't want unicode_resize to read uninitialized memory in
357 * that case.
358 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000359 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000361 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000362 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000363 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000364 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000365 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000366
367 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000368 /* XXX UNREF/NEWREF interface should be more symmetrical */
369 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000370 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000371 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000372 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000373}
374
375static
Guido van Rossum9475a232001-10-05 20:51:39 +0000376void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000377{
Walter Dörwald16807132007-05-25 13:52:07 +0000378 switch (PyUnicode_CHECK_INTERNED(unicode)) {
379 case SSTATE_NOT_INTERNED:
380 break;
381
382 case SSTATE_INTERNED_MORTAL:
383 /* revive dead object temporarily for DelItem */
Christian Heimes90aa7642007-12-19 02:45:37 +0000384 Py_REFCNT(unicode) = 3;
Walter Dörwald16807132007-05-25 13:52:07 +0000385 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
386 Py_FatalError(
Benjamin Peterson142957c2008-07-04 19:55:29 +0000387 "deletion of interned string failed");
Walter Dörwald16807132007-05-25 13:52:07 +0000388 break;
389
390 case SSTATE_INTERNED_IMMORTAL:
Benjamin Peterson142957c2008-07-04 19:55:29 +0000391 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000392
393 default:
Benjamin Peterson142957c2008-07-04 19:55:29 +0000394 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000395 }
396
Guido van Rossum604ddf82001-12-06 20:03:56 +0000397 if (PyUnicode_CheckExact(unicode) &&
Christian Heimes2202f872008-02-06 14:31:34 +0000398 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000399 /* Keep-Alive optimization */
400 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Christian Heimesb186d002008-03-18 15:15:01 +0000401 PyObject_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000402 unicode->str = NULL;
403 unicode->length = 0;
404 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000405 if (unicode->defenc) {
406 Py_DECREF(unicode->defenc);
407 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000408 }
409 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000410 *(PyUnicodeObject **)unicode = free_list;
411 free_list = unicode;
412 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000413 }
414 else {
Christian Heimesb186d002008-03-18 15:15:01 +0000415 PyObject_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000416 Py_XDECREF(unicode->defenc);
Christian Heimes90aa7642007-12-19 02:45:37 +0000417 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000418 }
419}
420
Martin v. Löwis18e16552006-02-15 17:27:45 +0000421int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000422{
423 register PyUnicodeObject *v;
424
425 /* Argument checks */
426 if (unicode == NULL) {
427 PyErr_BadInternalCall();
428 return -1;
429 }
430 v = (PyUnicodeObject *)*unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000431 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000432 PyErr_BadInternalCall();
433 return -1;
434 }
435
436 /* Resizing unicode_empty and single character objects is not
437 possible since these are being shared. We simply return a fresh
438 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000439 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000440 (v == unicode_empty || v->length == 1)) {
441 PyUnicodeObject *w = _PyUnicode_New(length);
442 if (w == NULL)
443 return -1;
444 Py_UNICODE_COPY(w->str, v->str,
445 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000446 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000447 *unicode = (PyObject *)w;
448 return 0;
449 }
450
451 /* Note that we don't have to modify *unicode for unshared Unicode
452 objects, since we can modify them in-place. */
453 return unicode_resize(v, length);
454}
455
456/* Internal API for use in unicodeobject.c only ! */
457#define _PyUnicode_Resize(unicodevar, length) \
458 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
459
Guido van Rossumd57fd912000-03-10 22:53:23 +0000460PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000461 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000462{
463 PyUnicodeObject *unicode;
464
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000465 /* If the Unicode data is known at construction time, we can apply
466 some optimizations which share commonly used objects. */
467 if (u != NULL) {
468
469 /* Optimization for empty strings */
470 if (size == 0 && unicode_empty != NULL) {
471 Py_INCREF(unicode_empty);
472 return (PyObject *)unicode_empty;
473 }
474
475 /* Single character Unicode objects in the Latin-1 range are
476 shared when using this constructor */
477 if (size == 1 && *u < 256) {
478 unicode = unicode_latin1[*u];
479 if (!unicode) {
480 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000481 if (!unicode)
482 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000483 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000484 unicode_latin1[*u] = unicode;
485 }
486 Py_INCREF(unicode);
487 return (PyObject *)unicode;
488 }
489 }
Tim Petersced69f82003-09-16 20:30:58 +0000490
Guido van Rossumd57fd912000-03-10 22:53:23 +0000491 unicode = _PyUnicode_New(size);
492 if (!unicode)
493 return NULL;
494
495 /* Copy the Unicode data into the new object */
496 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000497 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000498
499 return (PyObject *)unicode;
500}
501
Walter Dörwaldd2034312007-05-18 16:29:38 +0000502PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000503{
504 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000505
506 if (size < 0) {
507 PyErr_SetString(PyExc_SystemError,
508 "Negative size passed to PyUnicode_FromStringAndSize");
509 return NULL;
510 }
511
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000512 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000513 some optimizations which share commonly used objects.
514 Also, this means the input must be UTF-8, so fall back to the
515 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000516 if (u != NULL) {
517
518 /* Optimization for empty strings */
519 if (size == 0 && unicode_empty != NULL) {
520 Py_INCREF(unicode_empty);
521 return (PyObject *)unicode_empty;
522 }
523
Martin v. Löwis9c121062007-08-05 20:26:11 +0000524 /* Single characters are shared when using this constructor.
525 Restrict to ASCII, since the input must be UTF-8. */
526 if (size == 1 && Py_CHARMASK(*u) < 128) {
Christian Heimesbbe741d2008-03-28 10:53:29 +0000527 unicode = unicode_latin1[Py_CHARMASK(*u)];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000528 if (!unicode) {
529 unicode = _PyUnicode_New(1);
530 if (!unicode)
531 return NULL;
Guido van Rossum00058aa2007-07-19 18:21:28 +0000532 unicode->str[0] = Py_CHARMASK(*u);
Christian Heimesbbe741d2008-03-28 10:53:29 +0000533 unicode_latin1[Py_CHARMASK(*u)] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000534 }
535 Py_INCREF(unicode);
536 return (PyObject *)unicode;
537 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000538
539 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000540 }
541
Walter Dörwald55507312007-05-18 13:12:10 +0000542 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000543 if (!unicode)
544 return NULL;
545
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000546 return (PyObject *)unicode;
547}
548
Walter Dörwaldd2034312007-05-18 16:29:38 +0000549PyObject *PyUnicode_FromString(const char *u)
550{
551 size_t size = strlen(u);
552 if (size > PY_SSIZE_T_MAX) {
553 PyErr_SetString(PyExc_OverflowError, "input too long");
554 return NULL;
555 }
556
557 return PyUnicode_FromStringAndSize(u, size);
558}
559
Guido van Rossumd57fd912000-03-10 22:53:23 +0000560#ifdef HAVE_WCHAR_H
561
562PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000563 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000564{
565 PyUnicodeObject *unicode;
566
567 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000568 if (size == 0)
569 return PyUnicode_FromStringAndSize(NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000570 PyErr_BadInternalCall();
571 return NULL;
572 }
573
Martin v. Löwis790465f2008-04-05 20:41:37 +0000574 if (size == -1) {
575 size = wcslen(w);
576 }
577
Guido van Rossumd57fd912000-03-10 22:53:23 +0000578 unicode = _PyUnicode_New(size);
579 if (!unicode)
580 return NULL;
581
582 /* Copy the wchar_t data into the new object */
583#ifdef HAVE_USABLE_WCHAR_T
584 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000585#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000586 {
587 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000588 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000589 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000590 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000591 *u++ = *w++;
592 }
593#endif
594
595 return (PyObject *)unicode;
596}
597
Walter Dörwald346737f2007-05-31 10:44:43 +0000598static void
599makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
600{
601 *fmt++ = '%';
602 if (width) {
603 if (zeropad)
604 *fmt++ = '0';
605 fmt += sprintf(fmt, "%d", width);
606 }
607 if (precision)
608 fmt += sprintf(fmt, ".%d", precision);
609 if (longflag)
610 *fmt++ = 'l';
611 else if (size_tflag) {
612 char *f = PY_FORMAT_SIZE_T;
613 while (*f)
614 *fmt++ = *f++;
615 }
616 *fmt++ = c;
617 *fmt = '\0';
618}
619
/* Append the NUL-terminated char string `string` to the output, one
   output element per input char.  Relies on two variables of the
   expanding function: `copy` (scratch read cursor) and `s` (output
   write cursor, advanced past the appended data). */
#define appendstring(string) \
    {for (copy = (string); *copy != '\0';) *s++ = *copy++;}
621
622PyObject *
623PyUnicode_FromFormatV(const char *format, va_list vargs)
624{
625 va_list count;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000626 Py_ssize_t callcount = 0;
627 PyObject **callresults = NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +0000628 PyObject **callresult = NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000629 Py_ssize_t n = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000630 int width = 0;
631 int precision = 0;
632 int zeropad;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000633 const char* f;
634 Py_UNICODE *s;
635 PyObject *string;
636 /* used by sprintf */
637 char buffer[21];
Walter Dörwald346737f2007-05-31 10:44:43 +0000638 /* use abuffer instead of buffer, if we need more space
639 * (which can happen if there's a format specifier with width). */
640 char *abuffer = NULL;
641 char *realbuffer;
642 Py_ssize_t abuffersize = 0;
643 char fmt[60]; /* should be enough for %0width.precisionld */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000644 const char *copy;
645
646#ifdef VA_LIST_IS_ARRAY
647 Py_MEMCPY(count, vargs, sizeof(va_list));
648#else
649#ifdef __va_copy
650 __va_copy(count, vargs);
651#else
652 count = vargs;
653#endif
654#endif
Georg Brandl559e5d72008-06-11 18:37:52 +0000655 /* step 1: count the number of %S/%R/%A format specifications
656 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII() for
657 * these objects once during step 3 and put the result in
658 an array) */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000659 for (f = format; *f; f++) {
Georg Brandl559e5d72008-06-11 18:37:52 +0000660 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A'))
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000661 ++callcount;
662 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000663 /* step 2: allocate memory for the results of
Thomas Heller519a0422007-11-15 20:48:54 +0000664 * PyObject_Str()/PyObject_Repr() calls */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000665 if (callcount) {
Christian Heimesb186d002008-03-18 15:15:01 +0000666 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000667 if (!callresults) {
668 PyErr_NoMemory();
669 return NULL;
670 }
671 callresult = callresults;
672 }
673 /* step 3: figure out how large a buffer we need */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000674 for (f = format; *f; f++) {
675 if (*f == '%') {
676 const char* p = f;
Walter Dörwald346737f2007-05-31 10:44:43 +0000677 width = 0;
Christian Heimesfe337bf2008-03-23 21:54:12 +0000678 while (ISDIGIT((unsigned)*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000679 width = (width*10) + *f++ - '0';
Christian Heimesfe337bf2008-03-23 21:54:12 +0000680 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
Walter Dörwaldd2034312007-05-18 16:29:38 +0000681 ;
682
683 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
684 * they don't affect the amount of space we reserve.
685 */
686 if ((*f == 'l' || *f == 'z') &&
687 (f[1] == 'd' || f[1] == 'u'))
Eric Smithddd25822007-08-27 11:33:42 +0000688 ++f;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000689
690 switch (*f) {
691 case 'c':
692 (void)va_arg(count, int);
693 /* fall through... */
694 case '%':
695 n++;
696 break;
697 case 'd': case 'u': case 'i': case 'x':
698 (void) va_arg(count, int);
699 /* 20 bytes is enough to hold a 64-bit
700 integer. Decimal takes the most space.
Walter Dörwald346737f2007-05-31 10:44:43 +0000701 This isn't enough for octal.
702 If a width is specified we need more
703 (which we allocate later). */
704 if (width < 20)
705 width = 20;
706 n += width;
707 if (abuffersize < width)
708 abuffersize = width;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000709 break;
710 case 's':
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000711 {
712 /* UTF-8 */
713 unsigned char*s;
714 s = va_arg(count, unsigned char*);
715 while (*s) {
716 if (*s < 128) {
717 n++; s++;
718 } else if (*s < 0xc0) {
719 /* invalid UTF-8 */
720 n++; s++;
721 } else if (*s < 0xc0) {
722 n++;
723 s++; if(!*s)break;
724 s++;
725 } else if (*s < 0xe0) {
726 n++;
727 s++; if(!*s)break;
728 s++; if(!*s)break;
729 s++;
730 } else {
731 #ifdef Py_UNICODE_WIDE
732 n++;
733 #else
734 n+=2;
735 #endif
736 s++; if(!*s)break;
737 s++; if(!*s)break;
738 s++; if(!*s)break;
739 s++;
740 }
741 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000742 break;
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000743 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000744 case 'U':
745 {
746 PyObject *obj = va_arg(count, PyObject *);
747 assert(obj && PyUnicode_Check(obj));
748 n += PyUnicode_GET_SIZE(obj);
749 break;
750 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000751 case 'V':
752 {
753 PyObject *obj = va_arg(count, PyObject *);
754 const char *str = va_arg(count, const char *);
755 assert(obj || str);
756 assert(!obj || PyUnicode_Check(obj));
757 if (obj)
758 n += PyUnicode_GET_SIZE(obj);
759 else
760 n += strlen(str);
761 break;
762 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000763 case 'S':
764 {
765 PyObject *obj = va_arg(count, PyObject *);
766 PyObject *str;
767 assert(obj);
Thomas Heller519a0422007-11-15 20:48:54 +0000768 str = PyObject_Str(obj);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000769 if (!str)
770 goto fail;
771 n += PyUnicode_GET_SIZE(str);
772 /* Remember the str and switch to the next slot */
773 *callresult++ = str;
774 break;
775 }
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000776 case 'R':
777 {
778 PyObject *obj = va_arg(count, PyObject *);
779 PyObject *repr;
780 assert(obj);
781 repr = PyObject_Repr(obj);
782 if (!repr)
783 goto fail;
784 n += PyUnicode_GET_SIZE(repr);
785 /* Remember the repr and switch to the next slot */
786 *callresult++ = repr;
787 break;
788 }
Georg Brandl559e5d72008-06-11 18:37:52 +0000789 case 'A':
790 {
791 PyObject *obj = va_arg(count, PyObject *);
792 PyObject *ascii;
793 assert(obj);
794 ascii = PyObject_ASCII(obj);
795 if (!ascii)
796 goto fail;
797 n += PyUnicode_GET_SIZE(ascii);
798 /* Remember the repr and switch to the next slot */
799 *callresult++ = ascii;
800 break;
801 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000802 case 'p':
803 (void) va_arg(count, int);
804 /* maximum 64-bit pointer representation:
805 * 0xffffffffffffffff
806 * so 19 characters is enough.
807 * XXX I count 18 -- what's the extra for?
808 */
809 n += 19;
810 break;
811 default:
812 /* if we stumble upon an unknown
813 formatting code, copy the rest of
814 the format string to the output
815 string. (we cannot just skip the
816 code, since there's no way to know
817 what's in the argument list) */
818 n += strlen(p);
819 goto expand;
820 }
821 } else
822 n++;
823 }
824 expand:
Walter Dörwald346737f2007-05-31 10:44:43 +0000825 if (abuffersize > 20) {
Christian Heimesb186d002008-03-18 15:15:01 +0000826 abuffer = PyObject_Malloc(abuffersize);
Walter Dörwald346737f2007-05-31 10:44:43 +0000827 if (!abuffer) {
828 PyErr_NoMemory();
829 goto fail;
830 }
831 realbuffer = abuffer;
832 }
833 else
834 realbuffer = buffer;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000835 /* step 4: fill the buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000836 /* Since we've analyzed how much space we need for the worst case,
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000837 we don't have to resize the string.
838 There can be no errors beyond this point. */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000839 string = PyUnicode_FromUnicode(NULL, n);
840 if (!string)
Walter Dörwald346737f2007-05-31 10:44:43 +0000841 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000842
843 s = PyUnicode_AS_UNICODE(string);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000844 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000845
846 for (f = format; *f; f++) {
847 if (*f == '%') {
848 const char* p = f++;
849 int longflag = 0;
850 int size_tflag = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000851 zeropad = (*f == '0');
852 /* parse the width.precision part */
853 width = 0;
Christian Heimesfe337bf2008-03-23 21:54:12 +0000854 while (ISDIGIT((unsigned)*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000855 width = (width*10) + *f++ - '0';
856 precision = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000857 if (*f == '.') {
858 f++;
Christian Heimesfe337bf2008-03-23 21:54:12 +0000859 while (ISDIGIT((unsigned)*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000860 precision = (precision*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000861 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000862 /* handle the long flag, but only for %ld and %lu.
863 others can be added when necessary. */
864 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
865 longflag = 1;
866 ++f;
867 }
868 /* handle the size_t flag. */
869 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
870 size_tflag = 1;
871 ++f;
872 }
873
874 switch (*f) {
875 case 'c':
876 *s++ = va_arg(vargs, int);
877 break;
878 case 'd':
Walter Dörwald346737f2007-05-31 10:44:43 +0000879 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000880 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000881 sprintf(realbuffer, fmt, va_arg(vargs, long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000882 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000883 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000884 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000885 sprintf(realbuffer, fmt, va_arg(vargs, int));
886 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000887 break;
888 case 'u':
Walter Dörwald346737f2007-05-31 10:44:43 +0000889 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000890 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000891 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000892 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000893 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000894 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000895 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
896 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000897 break;
898 case 'i':
Walter Dörwald346737f2007-05-31 10:44:43 +0000899 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
900 sprintf(realbuffer, fmt, va_arg(vargs, int));
901 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000902 break;
903 case 'x':
Walter Dörwald346737f2007-05-31 10:44:43 +0000904 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
905 sprintf(realbuffer, fmt, va_arg(vargs, int));
906 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000907 break;
908 case 's':
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000909 {
910 /* Parameter must be UTF-8 encoded.
911 In case of encoding errors, use
912 the replacement character. */
913 PyObject *u;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000914 p = va_arg(vargs, char*);
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000915 u = PyUnicode_DecodeUTF8(p, strlen(p),
916 "replace");
917 if (!u)
918 goto fail;
919 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
920 PyUnicode_GET_SIZE(u));
921 s += PyUnicode_GET_SIZE(u);
922 Py_DECREF(u);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000923 break;
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000924 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000925 case 'U':
926 {
927 PyObject *obj = va_arg(vargs, PyObject *);
Walter Dörwald5c2fab62007-05-24 19:51:02 +0000928 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
929 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
930 s += size;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000931 break;
932 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000933 case 'V':
934 {
935 PyObject *obj = va_arg(vargs, PyObject *);
936 const char *str = va_arg(vargs, const char *);
937 if (obj) {
938 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
939 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
940 s += size;
941 } else {
942 appendstring(str);
943 }
944 break;
945 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000946 case 'S':
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000947 case 'R':
948 {
Guido van Rossum755114a2007-06-13 01:04:27 +0000949 Py_UNICODE *ucopy;
950 Py_ssize_t usize;
951 Py_ssize_t upos;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000952 /* unused, since we already have the result */
953 (void) va_arg(vargs, PyObject *);
Guido van Rossum755114a2007-06-13 01:04:27 +0000954 ucopy = PyUnicode_AS_UNICODE(*callresult);
955 usize = PyUnicode_GET_SIZE(*callresult);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000956 for (upos = 0; upos<usize;)
957 *s++ = ucopy[upos++];
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000958 /* We're done with the unicode()/repr() => forget it */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000959 Py_DECREF(*callresult);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000960 /* switch to next unicode()/repr() result */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000961 ++callresult;
962 break;
963 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000964 case 'p':
965 sprintf(buffer, "%p", va_arg(vargs, void*));
966 /* %p is ill-defined: ensure leading 0x. */
967 if (buffer[1] == 'X')
968 buffer[1] = 'x';
969 else if (buffer[1] != 'x') {
970 memmove(buffer+2, buffer, strlen(buffer)+1);
971 buffer[0] = '0';
972 buffer[1] = 'x';
973 }
974 appendstring(buffer);
975 break;
976 case '%':
977 *s++ = '%';
978 break;
979 default:
980 appendstring(p);
981 goto end;
982 }
983 } else
984 *s++ = *f;
985 }
986
987 end:
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000988 if (callresults)
Christian Heimesb186d002008-03-18 15:15:01 +0000989 PyObject_Free(callresults);
Walter Dörwald346737f2007-05-31 10:44:43 +0000990 if (abuffer)
Christian Heimesb186d002008-03-18 15:15:01 +0000991 PyObject_Free(abuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000992 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
993 return string;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000994 fail:
995 if (callresults) {
996 PyObject **callresult2 = callresults;
Guido van Rossum307fa8c2007-07-16 20:46:27 +0000997 while (callresult2 < callresult) {
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000998 Py_DECREF(*callresult2);
999 ++callresult2;
1000 }
Christian Heimesb186d002008-03-18 15:15:01 +00001001 PyObject_Free(callresults);
Walter Dörwald7569dfe2007-05-19 21:49:49 +00001002 }
Walter Dörwald346737f2007-05-31 10:44:43 +00001003 if (abuffer)
Christian Heimesb186d002008-03-18 15:15:01 +00001004 PyObject_Free(abuffer);
Walter Dörwald7569dfe2007-05-19 21:49:49 +00001005 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001006}
1007
1008#undef appendstring
1009
1010PyObject *
1011PyUnicode_FromFormat(const char *format, ...)
1012{
1013 PyObject* ret;
1014 va_list vargs;
1015
1016#ifdef HAVE_STDARG_PROTOTYPES
1017 va_start(vargs, format);
1018#else
1019 va_start(vargs);
1020#endif
1021 ret = PyUnicode_FromFormatV(format, vargs);
1022 va_end(vargs);
1023 return ret;
1024}
1025
Martin v. Löwis18e16552006-02-15 17:27:45 +00001026Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
1027 wchar_t *w,
1028 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001029{
1030 if (unicode == NULL) {
1031 PyErr_BadInternalCall();
1032 return -1;
1033 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001034
1035 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001036 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001037 size = PyUnicode_GET_SIZE(unicode) + 1;
1038
Guido van Rossumd57fd912000-03-10 22:53:23 +00001039#ifdef HAVE_USABLE_WCHAR_T
1040 memcpy(w, unicode->str, size * sizeof(wchar_t));
1041#else
1042 {
1043 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001044 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001045 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +00001046 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001047 *w++ = *u++;
1048 }
1049#endif
1050
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001051 if (size > PyUnicode_GET_SIZE(unicode))
1052 return PyUnicode_GET_SIZE(unicode);
1053 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00001054 return size;
1055}
1056
1057#endif
1058
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001059PyObject *PyUnicode_FromOrdinal(int ordinal)
1060{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001061 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001062
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001063 if (ordinal < 0 || ordinal > 0x10ffff) {
1064 PyErr_SetString(PyExc_ValueError,
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001065 "chr() arg not in range(0x110000)");
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001066 return NULL;
1067 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001068
1069#ifndef Py_UNICODE_WIDE
1070 if (ordinal > 0xffff) {
1071 ordinal -= 0x10000;
1072 s[0] = 0xD800 | (ordinal >> 10);
1073 s[1] = 0xDC00 | (ordinal & 0x3FF);
1074 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001075 }
1076#endif
1077
Hye-Shik Chang40574832004-04-06 07:24:51 +00001078 s[0] = (Py_UNICODE)ordinal;
1079 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001080}
1081
Guido van Rossumd57fd912000-03-10 22:53:23 +00001082PyObject *PyUnicode_FromObject(register PyObject *obj)
1083{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001084 /* XXX Perhaps we should make this API an alias of
Thomas Heller519a0422007-11-15 20:48:54 +00001085 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001086 if (PyUnicode_CheckExact(obj)) {
1087 Py_INCREF(obj);
1088 return obj;
1089 }
1090 if (PyUnicode_Check(obj)) {
1091 /* For a Unicode subtype that's not a Unicode object,
1092 return a true Unicode object with the same data. */
1093 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1094 PyUnicode_GET_SIZE(obj));
1095 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001096 PyErr_Format(PyExc_TypeError,
1097 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001098 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001099 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001100}
1101
1102PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1103 const char *encoding,
1104 const char *errors)
1105{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001106 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001107 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001108 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001109
Guido van Rossumd57fd912000-03-10 22:53:23 +00001110 if (obj == NULL) {
1111 PyErr_BadInternalCall();
1112 return NULL;
1113 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001114
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001115 if (PyUnicode_Check(obj)) {
1116 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001117 "decoding str is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001118 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001119 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001120
1121 /* Coerce object */
Christian Heimes72b710a2008-05-26 13:28:38 +00001122 if (PyBytes_Check(obj)) {
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001123 s = PyBytes_AS_STRING(obj);
1124 len = PyBytes_GET_SIZE(obj);
1125 }
1126 else if (PyByteArray_Check(obj)) {
1127 s = PyByteArray_AS_STRING(obj);
1128 len = PyByteArray_GET_SIZE(obj);
1129 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001130 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1131 /* Overwrite the error message with something more useful in
1132 case of a TypeError. */
1133 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001134 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001135 "coercing to str: need string or buffer, "
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001136 "%.80s found",
Christian Heimes90aa7642007-12-19 02:45:37 +00001137 Py_TYPE(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001138 goto onError;
1139 }
Tim Petersced69f82003-09-16 20:30:58 +00001140
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001141 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001142 if (len == 0) {
1143 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001144 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001145 }
Tim Petersced69f82003-09-16 20:30:58 +00001146 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001147 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001148
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001149 return v;
1150
1151 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001152 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001153}
1154
1155PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001156 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001157 const char *encoding,
1158 const char *errors)
1159{
1160 PyObject *buffer = NULL, *unicode;
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001161 Py_buffer info;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001162 char lower[20]; /* Enough for any encoding name we recognize */
1163 char *l;
1164 const char *e;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001165
1166 if (encoding == NULL)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001167 encoding = PyUnicode_GetDefaultEncoding();
1168
1169 /* Convert encoding to lower case and replace '_' with '-' in order to
1170 catch e.g. UTF_8 */
1171 e = encoding;
1172 l = lower;
1173 while (*e && l < &lower[(sizeof lower) - 2]) {
1174 if (ISUPPER(*e)) {
1175 *l++ = TOLOWER(*e++);
1176 }
1177 else if (*e == '_') {
1178 *l++ = '-';
1179 e++;
1180 }
1181 else {
1182 *l++ = *e++;
1183 }
1184 }
1185 *l = '\0';
Fred Drakee4315f52000-05-09 19:53:39 +00001186
1187 /* Shortcuts for common default encodings */
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001188 if (strcmp(lower, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001189 return PyUnicode_DecodeUTF8(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001190 else if ((strcmp(lower, "latin-1") == 0) ||
1191 (strcmp(lower, "iso-8859-1") == 0))
Fred Drakee4315f52000-05-09 19:53:39 +00001192 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001193#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001194 else if (strcmp(lower, "mbcs") == 0)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001195 return PyUnicode_DecodeMBCS(s, size, errors);
1196#endif
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001197 else if (strcmp(lower, "ascii") == 0)
Fred Drakee4315f52000-05-09 19:53:39 +00001198 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001199 else if (strcmp(lower, "utf-16") == 0)
1200 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1201 else if (strcmp(lower, "utf-32") == 0)
1202 return PyUnicode_DecodeUTF32(s, size, errors, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001203
1204 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001205 buffer = NULL;
Martin v. Löwis423be952008-08-13 15:53:07 +00001206 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_SIMPLE) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001207 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001208 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001209 if (buffer == NULL)
1210 goto onError;
1211 unicode = PyCodec_Decode(buffer, encoding, errors);
1212 if (unicode == NULL)
1213 goto onError;
1214 if (!PyUnicode_Check(unicode)) {
1215 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001216 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001217 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001218 Py_DECREF(unicode);
1219 goto onError;
1220 }
1221 Py_DECREF(buffer);
1222 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001223
Guido van Rossumd57fd912000-03-10 22:53:23 +00001224 onError:
1225 Py_XDECREF(buffer);
1226 return NULL;
1227}
1228
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001229PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1230 const char *encoding,
1231 const char *errors)
1232{
1233 PyObject *v;
1234
1235 if (!PyUnicode_Check(unicode)) {
1236 PyErr_BadArgument();
1237 goto onError;
1238 }
1239
1240 if (encoding == NULL)
1241 encoding = PyUnicode_GetDefaultEncoding();
1242
1243 /* Decode via the codec registry */
1244 v = PyCodec_Decode(unicode, encoding, errors);
1245 if (v == NULL)
1246 goto onError;
1247 return v;
1248
1249 onError:
1250 return NULL;
1251}
1252
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001253PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1254 const char *encoding,
1255 const char *errors)
1256{
1257 PyObject *v;
1258
1259 if (!PyUnicode_Check(unicode)) {
1260 PyErr_BadArgument();
1261 goto onError;
1262 }
1263
1264 if (encoding == NULL)
1265 encoding = PyUnicode_GetDefaultEncoding();
1266
1267 /* Decode via the codec registry */
1268 v = PyCodec_Decode(unicode, encoding, errors);
1269 if (v == NULL)
1270 goto onError;
1271 if (!PyUnicode_Check(v)) {
1272 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001273 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001274 Py_TYPE(v)->tp_name);
1275 Py_DECREF(v);
1276 goto onError;
1277 }
1278 return v;
1279
1280 onError:
1281 return NULL;
1282}
1283
Guido van Rossumd57fd912000-03-10 22:53:23 +00001284PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001285 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001286 const char *encoding,
1287 const char *errors)
1288{
1289 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001290
Guido van Rossumd57fd912000-03-10 22:53:23 +00001291 unicode = PyUnicode_FromUnicode(s, size);
1292 if (unicode == NULL)
1293 return NULL;
1294 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1295 Py_DECREF(unicode);
1296 return v;
1297}
1298
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001299PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1300 const char *encoding,
1301 const char *errors)
1302{
1303 PyObject *v;
1304
1305 if (!PyUnicode_Check(unicode)) {
1306 PyErr_BadArgument();
1307 goto onError;
1308 }
1309
1310 if (encoding == NULL)
1311 encoding = PyUnicode_GetDefaultEncoding();
1312
1313 /* Encode via the codec registry */
1314 v = PyCodec_Encode(unicode, encoding, errors);
1315 if (v == NULL)
1316 goto onError;
1317 return v;
1318
1319 onError:
1320 return NULL;
1321}
1322
Guido van Rossumd57fd912000-03-10 22:53:23 +00001323PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1324 const char *encoding,
1325 const char *errors)
1326{
1327 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001328
Guido van Rossumd57fd912000-03-10 22:53:23 +00001329 if (!PyUnicode_Check(unicode)) {
1330 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001331 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001332 }
Fred Drakee4315f52000-05-09 19:53:39 +00001333
Tim Petersced69f82003-09-16 20:30:58 +00001334 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001335 encoding = PyUnicode_GetDefaultEncoding();
1336
1337 /* Shortcuts for common default encodings */
1338 if (errors == NULL) {
1339 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001340 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001341 else if (strcmp(encoding, "latin-1") == 0)
1342 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001343#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1344 else if (strcmp(encoding, "mbcs") == 0)
1345 return PyUnicode_AsMBCSString(unicode);
1346#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001347 else if (strcmp(encoding, "ascii") == 0)
1348 return PyUnicode_AsASCIIString(unicode);
Christian Heimes6a27efa2008-10-30 21:48:26 +00001349 /* During bootstrap, we may need to find the encodings
1350 package, to load the file system encoding, and require the
1351 file system encoding in order to load the encodings
1352 package.
1353
1354 Break out of this dependency by assuming that the path to
1355 the encodings module is ASCII-only. XXX could try wcstombs
1356 instead, if the file system encoding is the locale's
1357 encoding. */
1358 else if (Py_FileSystemDefaultEncoding &&
1359 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1360 !PyThreadState_GET()->interp->codecs_initialized)
1361 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001362 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001363
1364 /* Encode via the codec registry */
1365 v = PyCodec_Encode(unicode, encoding, errors);
1366 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001367 return NULL;
1368
1369 /* The normal path */
1370 if (PyBytes_Check(v))
1371 return v;
1372
1373 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001374 if (PyByteArray_Check(v)) {
1375 char msg[100];
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001376 PyObject *b;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001377 PyOS_snprintf(msg, sizeof(msg),
1378 "encoder %s returned buffer instead of bytes",
1379 encoding);
1380 if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001381 Py_DECREF(v);
1382 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001383 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001384
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001385 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1386 Py_DECREF(v);
1387 return b;
1388 }
1389
1390 PyErr_Format(PyExc_TypeError,
1391 "encoder did not return a bytes object (type=%.400s)",
1392 Py_TYPE(v)->tp_name);
1393 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001394 return NULL;
1395}
1396
1397PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1398 const char *encoding,
1399 const char *errors)
1400{
1401 PyObject *v;
1402
1403 if (!PyUnicode_Check(unicode)) {
1404 PyErr_BadArgument();
1405 goto onError;
1406 }
1407
1408 if (encoding == NULL)
1409 encoding = PyUnicode_GetDefaultEncoding();
1410
1411 /* Encode via the codec registry */
1412 v = PyCodec_Encode(unicode, encoding, errors);
1413 if (v == NULL)
1414 goto onError;
1415 if (!PyUnicode_Check(v)) {
1416 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001417 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001418 Py_TYPE(v)->tp_name);
1419 Py_DECREF(v);
1420 goto onError;
1421 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001422 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001423
Guido van Rossumd57fd912000-03-10 22:53:23 +00001424 onError:
1425 return NULL;
1426}
1427
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001428PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1429 const char *errors)
1430{
1431 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001432 if (v)
1433 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001434 if (errors != NULL)
1435 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001436 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001437 PyUnicode_GET_SIZE(unicode),
1438 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001439 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001440 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001441 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001442 return v;
1443}
1444
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001445PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001446PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001447 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001448 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1449}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001450
Christian Heimes5894ba72007-11-04 11:43:14 +00001451PyObject*
1452PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1453{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001454 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1455 can be undefined. If it is case, decode using UTF-8. The following assumes
1456 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1457 bootstrapping process where the codecs aren't ready yet.
1458 */
1459 if (Py_FileSystemDefaultEncoding) {
1460#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001461 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001462 return PyUnicode_DecodeMBCS(s, size, "replace");
1463 }
1464#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001465 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001466 return PyUnicode_DecodeUTF8(s, size, "replace");
1467 }
1468#endif
1469 return PyUnicode_Decode(s, size,
1470 Py_FileSystemDefaultEncoding,
1471 "replace");
1472 }
1473 else {
1474 return PyUnicode_DecodeUTF8(s, size, "replace");
1475 }
1476}
1477
Martin v. Löwis5b222132007-06-10 09:51:05 +00001478char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001479_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001480{
Christian Heimesf3863112007-11-22 07:46:41 +00001481 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001482 if (!PyUnicode_Check(unicode)) {
1483 PyErr_BadArgument();
1484 return NULL;
1485 }
Christian Heimesf3863112007-11-22 07:46:41 +00001486 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1487 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001488 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001489 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001490 *psize = PyBytes_GET_SIZE(bytes);
1491 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001492}
1493
1494char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001495_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001496{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001497 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001498}
1499
Guido van Rossumd57fd912000-03-10 22:53:23 +00001500Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1501{
1502 if (!PyUnicode_Check(unicode)) {
1503 PyErr_BadArgument();
1504 goto onError;
1505 }
1506 return PyUnicode_AS_UNICODE(unicode);
1507
1508 onError:
1509 return NULL;
1510}
1511
Martin v. Löwis18e16552006-02-15 17:27:45 +00001512Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001513{
1514 if (!PyUnicode_Check(unicode)) {
1515 PyErr_BadArgument();
1516 goto onError;
1517 }
1518 return PyUnicode_GET_SIZE(unicode);
1519
1520 onError:
1521 return -1;
1522}
1523
Thomas Wouters78890102000-07-22 19:25:51 +00001524const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001525{
1526 return unicode_default_encoding;
1527}
1528
1529int PyUnicode_SetDefaultEncoding(const char *encoding)
1530{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001531 if (strcmp(encoding, unicode_default_encoding) != 0) {
1532 PyErr_Format(PyExc_ValueError,
1533 "Can only set default encoding to %s",
1534 unicode_default_encoding);
1535 return -1;
1536 }
Fred Drakee4315f52000-05-09 19:53:39 +00001537 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001538}
1539
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001540/* error handling callback helper:
1541 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001542 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001543 and adjust various state variables.
1544 return 0 on success, -1 on error
1545*/
1546
1547static
1548int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1549 const char *encoding, const char *reason,
Walter Dörwalda651d3d2007-08-30 15:29:21 +00001550 const char **input, const char **inend, Py_ssize_t *startinpos,
1551 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001552 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001553{
Benjamin Peterson142957c2008-07-04 19:55:29 +00001554 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001555
1556 PyObject *restuple = NULL;
1557 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001558 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001559 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001560 Py_ssize_t requiredsize;
1561 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001562 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001563 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001564 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001565 int res = -1;
1566
1567 if (*errorHandler == NULL) {
1568 *errorHandler = PyCodec_LookupError(errors);
1569 if (*errorHandler == NULL)
1570 goto onError;
1571 }
1572
1573 if (*exceptionObject == NULL) {
1574 *exceptionObject = PyUnicodeDecodeError_Create(
Walter Dörwalde78178e2007-07-30 13:31:40 +00001575 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001576 if (*exceptionObject == NULL)
1577 goto onError;
1578 }
1579 else {
1580 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1581 goto onError;
1582 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1583 goto onError;
1584 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1585 goto onError;
1586 }
1587
1588 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1589 if (restuple == NULL)
1590 goto onError;
1591 if (!PyTuple_Check(restuple)) {
1592 PyErr_Format(PyExc_TypeError, &argparse[4]);
1593 goto onError;
1594 }
1595 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1596 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001597
1598 /* Copy back the bytes variables, which might have been modified by the
1599 callback */
1600 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1601 if (!inputobj)
1602 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00001603 if (!PyBytes_Check(inputobj)) {
Walter Dörwalde78178e2007-07-30 13:31:40 +00001604 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
1605 }
Christian Heimes72b710a2008-05-26 13:28:38 +00001606 *input = PyBytes_AS_STRING(inputobj);
1607 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001608 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001609 /* we can DECREF safely, as the exception has another reference,
1610 so the object won't go away. */
1611 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001612
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001613 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001614 newpos = insize+newpos;
1615 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001616 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001617 goto onError;
1618 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001619
1620 /* need more space? (at least enough for what we
1621 have+the replacement+the rest of the string (starting
1622 at the new input position), so we won't have to check space
1623 when there are no errors in the rest of the string) */
1624 repptr = PyUnicode_AS_UNICODE(repunicode);
1625 repsize = PyUnicode_GET_SIZE(repunicode);
1626 requiredsize = *outpos + repsize + insize-newpos;
1627 if (requiredsize > outsize) {
1628 if (requiredsize<2*outsize)
1629 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001630 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001631 goto onError;
1632 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1633 }
1634 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001635 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001636 Py_UNICODE_COPY(*outptr, repptr, repsize);
1637 *outptr += repsize;
1638 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001639
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001640 /* we made it! */
1641 res = 0;
1642
1643 onError:
1644 Py_XDECREF(restuple);
1645 return res;
1646}
1647
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001648/* --- UTF-7 Codec -------------------------------------------------------- */
1649
1650/* see RFC2152 for details */
1651
/* Classification of the 128 ASCII characters for the UTF-7 codec.  The
   table is only ever read, so it is declared const. */
static
const char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
           0 - not special
           1 - special
           2 - whitespace (optional)
           3 - RFC2152 Set O (optional) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};
1670
/* Note: The comparison (c) <= 0 is a trick to work around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

/* True when character c must be base64-encoded.  encodeO / encodeWS select
   whether the optional classes 3 (Set O) and 2 (whitespace) of
   utf7_special count as special too.  c is evaluated several times. */
#define SPECIAL(c, encodeO, encodeWS)                                   \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 ||                 \
     ((encodeWS) && (utf7_special[(c)] == 2)) ||                        \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* Map the low 6 bits of n to its base64 digit. */
#define B64(n)                                                          \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
/* True when c is a base64 digit. */
#define B64CHAR(c)                                                      \
    (ISALNUM(c) || (c) == '+' || (c) == '/')
/* Map a base64 digit back to its 6-bit value; c must satisfy B64CHAR. */
#define UB64(c)                                                         \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* Emit base64 digits through `out` while at least 6 of the low `bits` bits
   of `ch` are pending; `out` and `bits` are updated in place.  Wrapped in
   do/while(0) so the macro behaves as a single statement. */
#define ENCODE(out, ch, bits)                                           \
    do {                                                                \
        while ((bits) >= 6) {                                           \
            *(out)++ = B64((ch) >> ((bits)-6));                         \
            (bits) -= 6;                                                \
        }                                                               \
    } while (0)

/* Emit 16-bit units through `out` while at least 16 of the low `bits` bits
   of `ch` are pending.  Relies on the enclosing function for a variable
   named `errmsg` and a label named `utf7Error`: a unit in 0xDC00..0xDFFF
   sets `surrogate`, stores a message and jumps there. */
#define DECODE(out, ch, bits, surrogate)                                \
    do {                                                                \
        while ((bits) >= 16) {                                          \
            Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits)-16)) & 0xffff); \
            (bits) -= 16;                                               \
            if (surrogate) {                                            \
                /* An error was already reported for the previous unit, \
                   so this one is dropped without being checked */      \
                (surrogate) = 0;                                        \
            } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {            \
                /* outCh lies in the low-surrogate range; surrogate     \
                   pairs are not supported by this decoder */           \
                (surrogate) = 1;                                        \
                errmsg = "code pairs are not supported";                \
                goto utf7Error;                                         \
            } else {                                                    \
                *(out)++ = outCh;                                       \
            }                                                           \
        }                                                               \
    } while (0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001713
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001714PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001715 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001716 const char *errors)
1717{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001718 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1719}
1720
1721PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
1722 Py_ssize_t size,
1723 const char *errors,
1724 Py_ssize_t *consumed)
1725{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001726 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001727 Py_ssize_t startinpos;
1728 Py_ssize_t endinpos;
1729 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001730 const char *e;
1731 PyUnicodeObject *unicode;
1732 Py_UNICODE *p;
1733 const char *errmsg = "";
1734 int inShift = 0;
1735 unsigned int bitsleft = 0;
1736 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001737 int surrogate = 0;
1738 PyObject *errorHandler = NULL;
1739 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001740
1741 unicode = _PyUnicode_New(size);
1742 if (!unicode)
1743 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001744 if (size == 0) {
1745 if (consumed)
1746 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001747 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001748 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001749
1750 p = unicode->str;
1751 e = s + size;
1752
1753 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001754 Py_UNICODE ch;
1755 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00001756 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001757
1758 if (inShift) {
1759 if ((ch == '-') || !B64CHAR(ch)) {
1760 inShift = 0;
1761 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001762
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001763 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1764 if (bitsleft >= 6) {
1765 /* The shift sequence has a partial character in it. If
1766 bitsleft < 6 then we could just classify it as padding
1767 but that is not the case here */
1768
1769 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001770 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001771 }
1772 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001773 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001774 here so indicate the potential of a misencoded character. */
1775
1776 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1777 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1778 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001779 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001780 }
1781
1782 if (ch == '-') {
1783 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001784 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001785 inShift = 1;
1786 }
1787 } else if (SPECIAL(ch,0,0)) {
1788 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001789 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001790 } else {
1791 *p++ = ch;
1792 }
1793 } else {
1794 charsleft = (charsleft << 6) | UB64(ch);
1795 bitsleft += 6;
1796 s++;
1797 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1798 }
1799 }
1800 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001801 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001802 s++;
1803 if (s < e && *s == '-') {
1804 s++;
1805 *p++ = '+';
1806 } else
1807 {
1808 inShift = 1;
1809 bitsleft = 0;
1810 }
1811 }
1812 else if (SPECIAL(ch,0,0)) {
Walter Dörwald2b65c752007-08-30 15:35:26 +00001813 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001814 errmsg = "unexpected special character";
1815 s++;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001816 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001817 }
1818 else {
1819 *p++ = ch;
1820 s++;
1821 }
1822 continue;
1823 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001824 outpos = p-PyUnicode_AS_UNICODE(unicode);
1825 endinpos = s-starts;
1826 if (unicode_decode_call_errorhandler(
1827 errors, &errorHandler,
1828 "utf7", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001829 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001830 (PyObject **)&unicode, &outpos, &p))
1831 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001832 }
1833
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001834 if (inShift && !consumed) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001835 outpos = p-PyUnicode_AS_UNICODE(unicode);
1836 endinpos = size;
1837 if (unicode_decode_call_errorhandler(
1838 errors, &errorHandler,
1839 "utf7", "unterminated shift sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00001840 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001841 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001842 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001843 if (s < e)
1844 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001845 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001846 if (consumed) {
1847 if(inShift)
1848 *consumed = startinpos;
1849 else
1850 *consumed = s-starts;
1851 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001852
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001853 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001854 goto onError;
1855
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001856 Py_XDECREF(errorHandler);
1857 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001858 return (PyObject *)unicode;
1859
1860onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001861 Py_XDECREF(errorHandler);
1862 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001863 Py_DECREF(unicode);
1864 return NULL;
1865}
1866
1867
1868PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001869 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001870 int encodeSetO,
1871 int encodeWhiteSpace,
1872 const char *errors)
1873{
Guido van Rossum98297ee2007-11-06 21:34:58 +00001874 PyObject *v, *result;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001875 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001876 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001877 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001878 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001879 unsigned int bitsleft = 0;
1880 unsigned long charsleft = 0;
1881 char * out;
1882 char * start;
1883
1884 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00001885 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001886
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001887 if (cbAllocated / 5 != size)
1888 return PyErr_NoMemory();
1889
Christian Heimes9c4756e2008-05-26 13:22:05 +00001890 v = PyByteArray_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001891 if (v == NULL)
1892 return NULL;
1893
Christian Heimes9c4756e2008-05-26 13:22:05 +00001894 start = out = PyByteArray_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001895 for (;i < size; ++i) {
1896 Py_UNICODE ch = s[i];
1897
1898 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001899 if (ch == '+') {
1900 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001901 *out++ = '-';
1902 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1903 charsleft = ch;
1904 bitsleft = 16;
1905 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001906 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001907 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001908 } else {
1909 *out++ = (char) ch;
1910 }
1911 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001912 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1913 *out++ = B64(charsleft << (6-bitsleft));
1914 charsleft = 0;
1915 bitsleft = 0;
1916 /* Characters not in the BASE64 set implicitly unshift the sequence
1917 so no '-' is required, except if the character is itself a '-' */
1918 if (B64CHAR(ch) || ch == '-') {
1919 *out++ = '-';
1920 }
1921 inShift = 0;
1922 *out++ = (char) ch;
1923 } else {
1924 bitsleft += 16;
1925 charsleft = (charsleft << 16) | ch;
1926 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1927
1928 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001929 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001930 or '-' then the shift sequence will be terminated implicitly and we
1931 don't have to insert a '-'. */
1932
1933 if (bitsleft == 0) {
1934 if (i + 1 < size) {
1935 Py_UNICODE ch2 = s[i+1];
1936
1937 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001938
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001939 } else if (B64CHAR(ch2) || ch2 == '-') {
1940 *out++ = '-';
1941 inShift = 0;
1942 } else {
1943 inShift = 0;
1944 }
1945
1946 }
1947 else {
1948 *out++ = '-';
1949 inShift = 0;
1950 }
1951 }
Tim Petersced69f82003-09-16 20:30:58 +00001952 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001953 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001954 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001955 if (bitsleft) {
1956 *out++= B64(charsleft << (6-bitsleft) );
1957 *out++ = '-';
1958 }
1959
Christian Heimes72b710a2008-05-26 13:28:38 +00001960 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), out - start);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001961 Py_DECREF(v);
1962 return result;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001963}
1964
1965#undef SPECIAL
1966#undef B64
1967#undef B64CHAR
1968#undef UB64
1969#undef ENCODE
1970#undef DECODE
1971
Guido van Rossumd57fd912000-03-10 22:53:23 +00001972/* --- UTF-8 Codec -------------------------------------------------------- */
1973
/* Lookup table for the UTF-8 decoder, indexed by the first byte of a
   sequence.  The table is only ever read, so it is declared const. */
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
       illegal prefix.  see RFC 2279 for details (since obsoleted by
       RFC 3629, which limits UTF-8 to 4-byte sequences; the 5 and 6
       entries below follow the older definition) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1995
Guido van Rossumd57fd912000-03-10 22:53:23 +00001996PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001997 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001998 const char *errors)
1999{
Walter Dörwald69652032004-09-07 20:24:22 +00002000 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2001}
2002
2003PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002004 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002005 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002006 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002007{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002008 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002009 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002010 Py_ssize_t startinpos;
2011 Py_ssize_t endinpos;
2012 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002013 const char *e;
2014 PyUnicodeObject *unicode;
2015 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002016 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002017 PyObject *errorHandler = NULL;
2018 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002019
2020 /* Note: size will always be longer than the resulting Unicode
2021 character count */
2022 unicode = _PyUnicode_New(size);
2023 if (!unicode)
2024 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002025 if (size == 0) {
2026 if (consumed)
2027 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002028 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002029 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002030
2031 /* Unpack UTF-8 encoded data */
2032 p = unicode->str;
2033 e = s + size;
2034
2035 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002036 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002037
2038 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002039 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002040 s++;
2041 continue;
2042 }
2043
2044 n = utf8_code_length[ch];
2045
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002046 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00002047 if (consumed)
2048 break;
2049 else {
2050 errmsg = "unexpected end of data";
2051 startinpos = s-starts;
2052 endinpos = size;
2053 goto utf8Error;
2054 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002055 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002056
2057 switch (n) {
2058
2059 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002060 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002061 startinpos = s-starts;
2062 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002063 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002064
2065 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002066 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002067 startinpos = s-starts;
2068 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002069 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002070
2071 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002072 if ((s[1] & 0xc0) != 0x80) {
2073 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002074 startinpos = s-starts;
2075 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002076 goto utf8Error;
2077 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002078 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002079 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002080 startinpos = s-starts;
2081 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002082 errmsg = "illegal encoding";
2083 goto utf8Error;
2084 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002085 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002086 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002087 break;
2088
2089 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00002090 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002091 (s[2] & 0xc0) != 0x80) {
2092 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002093 startinpos = s-starts;
2094 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002095 goto utf8Error;
2096 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002097 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002098 if (ch < 0x0800) {
2099 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00002100 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002101
2102 XXX For wide builds (UCS-4) we should probably try
2103 to recombine the surrogates into a single code
2104 unit.
2105 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002106 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002107 startinpos = s-starts;
2108 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002109 goto utf8Error;
2110 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002111 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002112 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002113 break;
2114
2115 case 4:
2116 if ((s[1] & 0xc0) != 0x80 ||
2117 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002118 (s[3] & 0xc0) != 0x80) {
2119 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002120 startinpos = s-starts;
2121 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002122 goto utf8Error;
2123 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002124 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2125 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2126 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002127 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002128 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002129 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002130 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002131 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002132 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002133 startinpos = s-starts;
2134 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002135 goto utf8Error;
2136 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00002137#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002138 *p++ = (Py_UNICODE)ch;
2139#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002140 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002141
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002142 /* translate from 10000..10FFFF to 0..FFFF */
2143 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002144
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002145 /* high surrogate = top 10 bits added to D800 */
2146 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002147
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002148 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002149 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002150#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002151 break;
2152
2153 default:
2154 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002155 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002156 startinpos = s-starts;
2157 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002158 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002159 }
2160 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002161 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002162
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002163 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002164 outpos = p-PyUnicode_AS_UNICODE(unicode);
2165 if (unicode_decode_call_errorhandler(
2166 errors, &errorHandler,
2167 "utf8", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002168 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002169 (PyObject **)&unicode, &outpos, &p))
2170 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002171 }
Walter Dörwald69652032004-09-07 20:24:22 +00002172 if (consumed)
2173 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002174
2175 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002176 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002177 goto onError;
2178
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002179 Py_XDECREF(errorHandler);
2180 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002181 return (PyObject *)unicode;
2182
2183onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002184 Py_XDECREF(errorHandler);
2185 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002186 Py_DECREF(unicode);
2187 return NULL;
2188}
2189
Tim Peters602f7402002-04-27 18:03:26 +00002190/* Allocation strategy: if the string is short, convert into a stack buffer
2191 and allocate exactly as much space needed at the end. Else allocate the
2192 maximum possible needed (4 result bytes per Unicode character), and return
2193 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002194*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002195PyObject *
2196PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002197 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00002198 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002199{
Tim Peters602f7402002-04-27 18:03:26 +00002200#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002201
Guido van Rossum98297ee2007-11-06 21:34:58 +00002202 Py_ssize_t i; /* index into s of next input byte */
2203 PyObject *result; /* result string object */
2204 char *p; /* next free byte in output buffer */
2205 Py_ssize_t nallocated; /* number of result bytes allocated */
2206 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002207 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002208
Tim Peters602f7402002-04-27 18:03:26 +00002209 assert(s != NULL);
2210 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002211
Tim Peters602f7402002-04-27 18:03:26 +00002212 if (size <= MAX_SHORT_UNICHARS) {
2213 /* Write into the stack buffer; nallocated can't overflow.
2214 * At the end, we'll allocate exactly as much heap space as it
2215 * turns out we need.
2216 */
2217 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002218 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002219 p = stackbuf;
2220 }
2221 else {
2222 /* Overallocate on the heap, and give the excess back at the end. */
2223 nallocated = size * 4;
2224 if (nallocated / 4 != size) /* overflow! */
2225 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002226 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002227 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002228 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002229 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002230 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002231
Tim Peters602f7402002-04-27 18:03:26 +00002232 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002233 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002234
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002235 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002236 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002237 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002238
Guido van Rossumd57fd912000-03-10 22:53:23 +00002239 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002240 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002241 *p++ = (char)(0xc0 | (ch >> 6));
2242 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002243 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002244 else {
Tim Peters602f7402002-04-27 18:03:26 +00002245 /* Encode UCS2 Unicode ordinals */
2246 if (ch < 0x10000) {
2247 /* Special case: check for high surrogate */
2248 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2249 Py_UCS4 ch2 = s[i];
2250 /* Check for low surrogate and combine the two to
2251 form a UCS4 value */
2252 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002253 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002254 i++;
2255 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002256 }
Tim Peters602f7402002-04-27 18:03:26 +00002257 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002258 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002259 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002260 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2261 *p++ = (char)(0x80 | (ch & 0x3f));
2262 continue;
2263 }
2264encodeUCS4:
2265 /* Encode UCS4 Unicode ordinals */
2266 *p++ = (char)(0xf0 | (ch >> 18));
2267 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2268 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2269 *p++ = (char)(0x80 | (ch & 0x3f));
2270 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002271 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002272
Guido van Rossum98297ee2007-11-06 21:34:58 +00002273 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002274 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002275 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002276 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002277 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002278 }
2279 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002280 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00002281 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002282 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002283 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002284 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002285 return result;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002286
Tim Peters602f7402002-04-27 18:03:26 +00002287#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002288}
2289
Guido van Rossumd57fd912000-03-10 22:53:23 +00002290PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2291{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002292 if (!PyUnicode_Check(unicode)) {
2293 PyErr_BadArgument();
2294 return NULL;
2295 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002296 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2297 PyUnicode_GET_SIZE(unicode),
2298 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002299}
2300
Walter Dörwald41980ca2007-08-16 21:55:45 +00002301/* --- UTF-32 Codec ------------------------------------------------------- */
2302
2303PyObject *
2304PyUnicode_DecodeUTF32(const char *s,
2305 Py_ssize_t size,
2306 const char *errors,
2307 int *byteorder)
2308{
2309 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2310}
2311
2312PyObject *
2313PyUnicode_DecodeUTF32Stateful(const char *s,
2314 Py_ssize_t size,
2315 const char *errors,
2316 int *byteorder,
2317 Py_ssize_t *consumed)
2318{
2319 const char *starts = s;
2320 Py_ssize_t startinpos;
2321 Py_ssize_t endinpos;
2322 Py_ssize_t outpos;
2323 PyUnicodeObject *unicode;
2324 Py_UNICODE *p;
2325#ifndef Py_UNICODE_WIDE
2326 int i, pairs;
2327#else
2328 const int pairs = 0;
2329#endif
2330 const unsigned char *q, *e;
2331 int bo = 0; /* assume native ordering by default */
2332 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002333 /* Offsets from q for retrieving bytes in the right order. */
2334#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2335 int iorder[] = {0, 1, 2, 3};
2336#else
2337 int iorder[] = {3, 2, 1, 0};
2338#endif
2339 PyObject *errorHandler = NULL;
2340 PyObject *exc = NULL;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002341 /* On narrow builds we split characters outside the BMP into two
2342 codepoints => count how much extra space we need. */
2343#ifndef Py_UNICODE_WIDE
2344 for (i = pairs = 0; i < size/4; i++)
2345 if (((Py_UCS4 *)s)[i] >= 0x10000)
2346 pairs++;
2347#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00002348
2349 /* This might be one to much, because of a BOM */
2350 unicode = _PyUnicode_New((size+3)/4+pairs);
2351 if (!unicode)
2352 return NULL;
2353 if (size == 0)
2354 return (PyObject *)unicode;
2355
2356 /* Unpack UTF-32 encoded data */
2357 p = unicode->str;
2358 q = (unsigned char *)s;
2359 e = q + size;
2360
2361 if (byteorder)
2362 bo = *byteorder;
2363
2364 /* Check for BOM marks (U+FEFF) in the input and adjust current
2365 byte order setting accordingly. In native mode, the leading BOM
2366 mark is skipped, in all other modes, it is copied to the output
2367 stream as-is (giving a ZWNBSP character). */
2368 if (bo == 0) {
2369 if (size >= 4) {
2370 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2371 (q[iorder[1]] << 8) | q[iorder[0]];
2372#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2373 if (bom == 0x0000FEFF) {
2374 q += 4;
2375 bo = -1;
2376 }
2377 else if (bom == 0xFFFE0000) {
2378 q += 4;
2379 bo = 1;
2380 }
2381#else
2382 if (bom == 0x0000FEFF) {
2383 q += 4;
2384 bo = 1;
2385 }
2386 else if (bom == 0xFFFE0000) {
2387 q += 4;
2388 bo = -1;
2389 }
2390#endif
2391 }
2392 }
2393
2394 if (bo == -1) {
2395 /* force LE */
2396 iorder[0] = 0;
2397 iorder[1] = 1;
2398 iorder[2] = 2;
2399 iorder[3] = 3;
2400 }
2401 else if (bo == 1) {
2402 /* force BE */
2403 iorder[0] = 3;
2404 iorder[1] = 2;
2405 iorder[2] = 1;
2406 iorder[3] = 0;
2407 }
2408
2409 while (q < e) {
2410 Py_UCS4 ch;
2411 /* remaining bytes at the end? (size should be divisible by 4) */
2412 if (e-q<4) {
2413 if (consumed)
2414 break;
2415 errmsg = "truncated data";
2416 startinpos = ((const char *)q)-starts;
2417 endinpos = ((const char *)e)-starts;
2418 goto utf32Error;
2419 /* The remaining input chars are ignored if the callback
2420 chooses to skip the input */
2421 }
2422 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2423 (q[iorder[1]] << 8) | q[iorder[0]];
2424
2425 if (ch >= 0x110000)
2426 {
2427 errmsg = "codepoint not in range(0x110000)";
2428 startinpos = ((const char *)q)-starts;
2429 endinpos = startinpos+4;
2430 goto utf32Error;
2431 }
2432#ifndef Py_UNICODE_WIDE
2433 if (ch >= 0x10000)
2434 {
2435 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2436 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2437 }
2438 else
2439#endif
2440 *p++ = ch;
2441 q += 4;
2442 continue;
2443 utf32Error:
2444 outpos = p-PyUnicode_AS_UNICODE(unicode);
2445 if (unicode_decode_call_errorhandler(
2446 errors, &errorHandler,
2447 "utf32", errmsg,
2448 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2449 (PyObject **)&unicode, &outpos, &p))
2450 goto onError;
2451 }
2452
2453 if (byteorder)
2454 *byteorder = bo;
2455
2456 if (consumed)
2457 *consumed = (const char *)q-starts;
2458
2459 /* Adjust length */
2460 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2461 goto onError;
2462
2463 Py_XDECREF(errorHandler);
2464 Py_XDECREF(exc);
2465 return (PyObject *)unicode;
2466
2467onError:
2468 Py_DECREF(unicode);
2469 Py_XDECREF(errorHandler);
2470 Py_XDECREF(exc);
2471 return NULL;
2472}
2473
2474PyObject *
2475PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2476 Py_ssize_t size,
2477 const char *errors,
2478 int byteorder)
2479{
Guido van Rossum98297ee2007-11-06 21:34:58 +00002480 PyObject *v, *result;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002481 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002482 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002483#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002484 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002485#else
2486 const int pairs = 0;
2487#endif
2488 /* Offsets from p for storing byte pairs in the right order. */
2489#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2490 int iorder[] = {0, 1, 2, 3};
2491#else
2492 int iorder[] = {3, 2, 1, 0};
2493#endif
2494
2495#define STORECHAR(CH) \
2496 do { \
2497 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2498 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2499 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2500 p[iorder[0]] = (CH) & 0xff; \
2501 p += 4; \
2502 } while(0)
2503
2504 /* In narrow builds we can output surrogate pairs as one codepoint,
2505 so we need less space. */
2506#ifndef Py_UNICODE_WIDE
2507 for (i = pairs = 0; i < size-1; i++)
2508 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2509 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2510 pairs++;
2511#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002512 nsize = (size - pairs + (byteorder == 0));
2513 bytesize = nsize * 4;
2514 if (bytesize / 4 != nsize)
2515 return PyErr_NoMemory();
2516 v = PyByteArray_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002517 if (v == NULL)
2518 return NULL;
2519
Christian Heimes9c4756e2008-05-26 13:22:05 +00002520 p = (unsigned char *)PyByteArray_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002521 if (byteorder == 0)
2522 STORECHAR(0xFEFF);
2523 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002524 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002525
2526 if (byteorder == -1) {
2527 /* force LE */
2528 iorder[0] = 0;
2529 iorder[1] = 1;
2530 iorder[2] = 2;
2531 iorder[3] = 3;
2532 }
2533 else if (byteorder == 1) {
2534 /* force BE */
2535 iorder[0] = 3;
2536 iorder[1] = 2;
2537 iorder[2] = 1;
2538 iorder[3] = 0;
2539 }
2540
2541 while (size-- > 0) {
2542 Py_UCS4 ch = *s++;
2543#ifndef Py_UNICODE_WIDE
2544 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2545 Py_UCS4 ch2 = *s;
2546 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2547 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2548 s++;
2549 size--;
2550 }
2551 }
2552#endif
2553 STORECHAR(ch);
2554 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002555
2556 done:
Christian Heimes72b710a2008-05-26 13:28:38 +00002557 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
Guido van Rossum98297ee2007-11-06 21:34:58 +00002558 Py_DECREF(v);
2559 return result;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002560#undef STORECHAR
2561}
2562
2563PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2564{
2565 if (!PyUnicode_Check(unicode)) {
2566 PyErr_BadArgument();
2567 return NULL;
2568 }
2569 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2570 PyUnicode_GET_SIZE(unicode),
2571 NULL,
2572 0);
2573}
2574
Guido van Rossumd57fd912000-03-10 22:53:23 +00002575/* --- UTF-16 Codec ------------------------------------------------------- */
2576
Tim Peters772747b2001-08-09 22:21:55 +00002577PyObject *
2578PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002579 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002580 const char *errors,
2581 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002582{
Walter Dörwald69652032004-09-07 20:24:22 +00002583 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2584}
2585
2586PyObject *
2587PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002588 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002589 const char *errors,
2590 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002591 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002592{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002593 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002594 Py_ssize_t startinpos;
2595 Py_ssize_t endinpos;
2596 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002597 PyUnicodeObject *unicode;
2598 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002599 const unsigned char *q, *e;
2600 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002601 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002602 /* Offsets from q for retrieving byte pairs in the right order. */
2603#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2604 int ihi = 1, ilo = 0;
2605#else
2606 int ihi = 0, ilo = 1;
2607#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002608 PyObject *errorHandler = NULL;
2609 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002610
2611 /* Note: size will always be longer than the resulting Unicode
2612 character count */
2613 unicode = _PyUnicode_New(size);
2614 if (!unicode)
2615 return NULL;
2616 if (size == 0)
2617 return (PyObject *)unicode;
2618
2619 /* Unpack UTF-16 encoded data */
2620 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002621 q = (unsigned char *)s;
2622 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002623
2624 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002625 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002626
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002627 /* Check for BOM marks (U+FEFF) in the input and adjust current
2628 byte order setting accordingly. In native mode, the leading BOM
2629 mark is skipped, in all other modes, it is copied to the output
2630 stream as-is (giving a ZWNBSP character). */
2631 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002632 if (size >= 2) {
2633 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002634#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002635 if (bom == 0xFEFF) {
2636 q += 2;
2637 bo = -1;
2638 }
2639 else if (bom == 0xFFFE) {
2640 q += 2;
2641 bo = 1;
2642 }
Tim Petersced69f82003-09-16 20:30:58 +00002643#else
Walter Dörwald69652032004-09-07 20:24:22 +00002644 if (bom == 0xFEFF) {
2645 q += 2;
2646 bo = 1;
2647 }
2648 else if (bom == 0xFFFE) {
2649 q += 2;
2650 bo = -1;
2651 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002652#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002653 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002654 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002655
Tim Peters772747b2001-08-09 22:21:55 +00002656 if (bo == -1) {
2657 /* force LE */
2658 ihi = 1;
2659 ilo = 0;
2660 }
2661 else if (bo == 1) {
2662 /* force BE */
2663 ihi = 0;
2664 ilo = 1;
2665 }
2666
2667 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002668 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002669 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002670 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002671 if (consumed)
2672 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002673 errmsg = "truncated data";
2674 startinpos = ((const char *)q)-starts;
2675 endinpos = ((const char *)e)-starts;
2676 goto utf16Error;
2677 /* The remaining input chars are ignored if the callback
2678 chooses to skip the input */
2679 }
2680 ch = (q[ihi] << 8) | q[ilo];
2681
Tim Peters772747b2001-08-09 22:21:55 +00002682 q += 2;
2683
Guido van Rossumd57fd912000-03-10 22:53:23 +00002684 if (ch < 0xD800 || ch > 0xDFFF) {
2685 *p++ = ch;
2686 continue;
2687 }
2688
2689 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002690 if (q >= e) {
2691 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002692 startinpos = (((const char *)q)-2)-starts;
2693 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002694 goto utf16Error;
2695 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002696 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002697 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2698 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002699 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002700#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002701 *p++ = ch;
2702 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002703#else
2704 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002705#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002706 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002707 }
2708 else {
2709 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002710 startinpos = (((const char *)q)-4)-starts;
2711 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002712 goto utf16Error;
2713 }
2714
Guido van Rossumd57fd912000-03-10 22:53:23 +00002715 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002716 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002717 startinpos = (((const char *)q)-2)-starts;
2718 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002719 /* Fall through to report the error */
2720
2721 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002722 outpos = p-PyUnicode_AS_UNICODE(unicode);
2723 if (unicode_decode_call_errorhandler(
2724 errors, &errorHandler,
2725 "utf16", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002726 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002727 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002728 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002729 }
2730
2731 if (byteorder)
2732 *byteorder = bo;
2733
Walter Dörwald69652032004-09-07 20:24:22 +00002734 if (consumed)
2735 *consumed = (const char *)q-starts;
2736
Guido van Rossumd57fd912000-03-10 22:53:23 +00002737 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002738 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002739 goto onError;
2740
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002741 Py_XDECREF(errorHandler);
2742 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002743 return (PyObject *)unicode;
2744
2745onError:
2746 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002747 Py_XDECREF(errorHandler);
2748 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002749 return NULL;
2750}
2751
Tim Peters772747b2001-08-09 22:21:55 +00002752PyObject *
2753PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002754 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002755 const char *errors,
2756 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002757{
Guido van Rossum98297ee2007-11-06 21:34:58 +00002758 PyObject *v, *result;
Tim Peters772747b2001-08-09 22:21:55 +00002759 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002760 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002761#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002762 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002763#else
2764 const int pairs = 0;
2765#endif
Tim Peters772747b2001-08-09 22:21:55 +00002766 /* Offsets from p for storing byte pairs in the right order. */
2767#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2768 int ihi = 1, ilo = 0;
2769#else
2770 int ihi = 0, ilo = 1;
2771#endif
2772
2773#define STORECHAR(CH) \
2774 do { \
2775 p[ihi] = ((CH) >> 8) & 0xff; \
2776 p[ilo] = (CH) & 0xff; \
2777 p += 2; \
2778 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002779
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002780#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002781 for (i = pairs = 0; i < size; i++)
2782 if (s[i] >= 0x10000)
2783 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002784#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002785 /* 2 * (size + pairs + (byteorder == 0)) */
2786 if (size > PY_SSIZE_T_MAX ||
2787 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
2788 return PyErr_NoMemory();
2789 nsize = size + pairs + (byteorder == 0);
2790 bytesize = nsize * 2;
2791 if (bytesize / 2 != nsize)
2792 return PyErr_NoMemory();
2793 v = PyByteArray_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002794 if (v == NULL)
2795 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002796
Christian Heimes9c4756e2008-05-26 13:22:05 +00002797 p = (unsigned char *)PyByteArray_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002798 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002799 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002800 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002801 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00002802
2803 if (byteorder == -1) {
2804 /* force LE */
2805 ihi = 1;
2806 ilo = 0;
2807 }
2808 else if (byteorder == 1) {
2809 /* force BE */
2810 ihi = 0;
2811 ilo = 1;
2812 }
2813
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002814 while (size-- > 0) {
2815 Py_UNICODE ch = *s++;
2816 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002817#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002818 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002819 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2820 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002821 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002822#endif
Tim Peters772747b2001-08-09 22:21:55 +00002823 STORECHAR(ch);
2824 if (ch2)
2825 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002826 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002827
2828 done:
Christian Heimes72b710a2008-05-26 13:28:38 +00002829 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
Guido van Rossum98297ee2007-11-06 21:34:58 +00002830 Py_DECREF(v);
2831 return result;
Tim Peters772747b2001-08-09 22:21:55 +00002832#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002833}
2834
2835PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2836{
2837 if (!PyUnicode_Check(unicode)) {
2838 PyErr_BadArgument();
2839 return NULL;
2840 }
2841 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2842 PyUnicode_GET_SIZE(unicode),
2843 NULL,
2844 0);
2845}
2846
2847/* --- Unicode Escape Codec ----------------------------------------------- */
2848
/* Cached pointer to the character-name lookup C API.  It stays NULL until
   PyUnicode_DecodeUnicodeEscape() first meets a \N{...} escape, at which
   point it is loaded from the "ucnhash_CAPI" attribute of the unicodedata
   module. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00002849static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002850
Guido van Rossumd57fd912000-03-10 22:53:23 +00002851PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002852 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002853 const char *errors)
2854{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002855 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002856 Py_ssize_t startinpos;
2857 Py_ssize_t endinpos;
2858 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002859 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002860 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002861 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002862 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002863 char* message;
2864 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002865 PyObject *errorHandler = NULL;
2866 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002867
Guido van Rossumd57fd912000-03-10 22:53:23 +00002868 /* Escaped strings will always be longer than the resulting
2869 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002870 length after conversion to the true value.
2871 (but if the error callback returns a long replacement string
2872 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002873 v = _PyUnicode_New(size);
2874 if (v == NULL)
2875 goto onError;
2876 if (size == 0)
2877 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002878
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002879 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002880 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002881
Guido van Rossumd57fd912000-03-10 22:53:23 +00002882 while (s < end) {
2883 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002884 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002885 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002886
2887 /* Non-escape characters are interpreted as Unicode ordinals */
2888 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002889 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002890 continue;
2891 }
2892
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002893 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002894 /* \ - Escapes */
2895 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002896 c = *s++;
2897 if (s > end)
2898 c = '\0'; /* Invalid after \ */
2899 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002900
2901 /* \x escapes */
2902 case '\n': break;
2903 case '\\': *p++ = '\\'; break;
2904 case '\'': *p++ = '\''; break;
2905 case '\"': *p++ = '\"'; break;
2906 case 'b': *p++ = '\b'; break;
2907 case 'f': *p++ = '\014'; break; /* FF */
2908 case 't': *p++ = '\t'; break;
2909 case 'n': *p++ = '\n'; break;
2910 case 'r': *p++ = '\r'; break;
2911 case 'v': *p++ = '\013'; break; /* VT */
2912 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2913
2914 /* \OOO (octal) escapes */
2915 case '0': case '1': case '2': case '3':
2916 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002917 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002918 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002919 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002920 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002921 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002922 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002923 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002924 break;
2925
Fredrik Lundhccc74732001-02-18 22:13:49 +00002926 /* hex escapes */
2927 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002928 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002929 digits = 2;
2930 message = "truncated \\xXX escape";
2931 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002932
Fredrik Lundhccc74732001-02-18 22:13:49 +00002933 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002934 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002935 digits = 4;
2936 message = "truncated \\uXXXX escape";
2937 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002938
Fredrik Lundhccc74732001-02-18 22:13:49 +00002939 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002940 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002941 digits = 8;
2942 message = "truncated \\UXXXXXXXX escape";
2943 hexescape:
2944 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002945 outpos = p-PyUnicode_AS_UNICODE(v);
2946 if (s+digits>end) {
2947 endinpos = size;
2948 if (unicode_decode_call_errorhandler(
2949 errors, &errorHandler,
2950 "unicodeescape", "end of string in escape sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002951 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002952 (PyObject **)&v, &outpos, &p))
2953 goto onError;
2954 goto nextByte;
2955 }
2956 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002957 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00002958 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002959 endinpos = (s+i+1)-starts;
2960 if (unicode_decode_call_errorhandler(
2961 errors, &errorHandler,
2962 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002963 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002964 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002965 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002966 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002967 }
2968 chr = (chr<<4) & ~0xF;
2969 if (c >= '0' && c <= '9')
2970 chr += c - '0';
2971 else if (c >= 'a' && c <= 'f')
2972 chr += 10 + c - 'a';
2973 else
2974 chr += 10 + c - 'A';
2975 }
2976 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002977 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002978 /* _decoding_error will have already written into the
2979 target buffer. */
2980 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002981 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002982 /* when we get here, chr is a 32-bit unicode character */
2983 if (chr <= 0xffff)
2984 /* UCS-2 character */
2985 *p++ = (Py_UNICODE) chr;
2986 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002987 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002988 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002989#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002990 *p++ = chr;
2991#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002992 chr -= 0x10000L;
2993 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002994 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002995#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002996 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002997 endinpos = s-starts;
2998 outpos = p-PyUnicode_AS_UNICODE(v);
2999 if (unicode_decode_call_errorhandler(
3000 errors, &errorHandler,
3001 "unicodeescape", "illegal Unicode character",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003002 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003003 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003004 goto onError;
3005 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003006 break;
3007
3008 /* \N{name} */
3009 case 'N':
3010 message = "malformed \\N character escape";
3011 if (ucnhash_CAPI == NULL) {
3012 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003013 PyObject *m, *api;
Christian Heimes072c0f12008-01-03 23:01:04 +00003014 m = PyImport_ImportModuleNoBlock("unicodedata");
Fredrik Lundhccc74732001-02-18 22:13:49 +00003015 if (m == NULL)
3016 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003017 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00003018 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003019 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00003020 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003021 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003022 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003023 if (ucnhash_CAPI == NULL)
3024 goto ucnhashError;
3025 }
3026 if (*s == '{') {
3027 const char *start = s+1;
3028 /* look for the closing brace */
3029 while (*s != '}' && s < end)
3030 s++;
3031 if (s > start && s < end && *s == '}') {
3032 /* found a name. look it up in the unicode database */
3033 message = "unknown Unicode character name";
3034 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003035 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003036 goto store;
3037 }
3038 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003039 endinpos = s-starts;
3040 outpos = p-PyUnicode_AS_UNICODE(v);
3041 if (unicode_decode_call_errorhandler(
3042 errors, &errorHandler,
3043 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003044 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003045 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003046 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003047 break;
3048
3049 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003050 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003051 message = "\\ at end of string";
3052 s--;
3053 endinpos = s-starts;
3054 outpos = p-PyUnicode_AS_UNICODE(v);
3055 if (unicode_decode_call_errorhandler(
3056 errors, &errorHandler,
3057 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003058 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003059 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003060 goto onError;
3061 }
3062 else {
3063 *p++ = '\\';
3064 *p++ = (unsigned char)s[-1];
3065 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003066 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003067 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003068 nextByte:
3069 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003070 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003071 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003072 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003073 Py_XDECREF(errorHandler);
3074 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003075 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003076
Fredrik Lundhccc74732001-02-18 22:13:49 +00003077ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003078 PyErr_SetString(
3079 PyExc_UnicodeError,
3080 "\\N escapes not supported (can't load unicodedata module)"
3081 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003082 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003083 Py_XDECREF(errorHandler);
3084 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003085 return NULL;
3086
Fredrik Lundhccc74732001-02-18 22:13:49 +00003087onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003088 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003089 Py_XDECREF(errorHandler);
3090 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003091 return NULL;
3092}
3093
3094/* Return a Unicode-Escape string version of the Unicode object.
3095
3096 If quotes is true, the string is enclosed in u"" or u'' quotes as
3097 appropriate.
3098
3099*/
3100
Thomas Wouters477c8d52006-05-27 19:21:47 +00003101Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
3102 Py_ssize_t size,
3103 Py_UNICODE ch)
3104{
3105 /* like wcschr, but doesn't stop at NULL characters */
3106
3107 while (size-- > 0) {
3108 if (*s == ch)
3109 return s;
3110 s++;
3111 }
3112
3113 return NULL;
3114}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003115
Walter Dörwald79e913e2007-05-12 11:08:06 +00003116static const char *hexdigits = "0123456789abcdef";
3117
3118PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
3119 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003120{
Guido van Rossum98297ee2007-11-06 21:34:58 +00003121 PyObject *repr, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003122 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003123
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003124#ifdef Py_UNICODE_WIDE
3125 const Py_ssize_t expandsize = 10;
3126#else
3127 const Py_ssize_t expandsize = 6;
3128#endif
3129
Thomas Wouters89f507f2006-12-13 04:49:30 +00003130 /* XXX(nnorwitz): rather than over-allocating, it would be
3131 better to choose a different scheme. Perhaps scan the
3132 first N-chars of the string and allocate based on that size.
3133 */
3134 /* Initial allocation is based on the longest-possible unichr
3135 escape.
3136
3137 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3138 unichr, so in this case it's the longest unichr escape. In
3139 narrow (UTF-16) builds this is five chars per source unichr
3140 since there are two unichrs in the surrogate pair, so in narrow
3141 (UTF-16) builds it's not the longest unichr escape.
3142
3143 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3144 so in the narrow (UTF-16) build case it's the longest unichr
3145 escape.
3146 */
3147
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003148 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
3149 return PyErr_NoMemory();
3150
Christian Heimes9c4756e2008-05-26 13:22:05 +00003151 repr = PyByteArray_FromStringAndSize(NULL,
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003152 2
3153 + expandsize*size
Thomas Wouters89f507f2006-12-13 04:49:30 +00003154 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003155 if (repr == NULL)
3156 return NULL;
3157
Christian Heimes9c4756e2008-05-26 13:22:05 +00003158 p = PyByteArray_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003159
Guido van Rossumd57fd912000-03-10 22:53:23 +00003160 while (size-- > 0) {
3161 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003162
Walter Dörwald79e913e2007-05-12 11:08:06 +00003163 /* Escape backslashes */
3164 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003165 *p++ = '\\';
3166 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003167 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003168 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003169
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003170#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003171 /* Map 21-bit characters to '\U00xxxxxx' */
3172 else if (ch >= 0x10000) {
3173 *p++ = '\\';
3174 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003175 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3176 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3177 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3178 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3179 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3180 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3181 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3182 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003183 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003184 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003185#else
3186 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003187 else if (ch >= 0xD800 && ch < 0xDC00) {
3188 Py_UNICODE ch2;
3189 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003190
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003191 ch2 = *s++;
3192 size--;
3193 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3194 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3195 *p++ = '\\';
3196 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003197 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3198 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3199 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3200 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3201 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3202 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3203 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3204 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003205 continue;
3206 }
3207 /* Fall through: isolated surrogates are copied as-is */
3208 s--;
3209 size++;
3210 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003211#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003212
Guido van Rossumd57fd912000-03-10 22:53:23 +00003213 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003214 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003215 *p++ = '\\';
3216 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003217 *p++ = hexdigits[(ch >> 12) & 0x000F];
3218 *p++ = hexdigits[(ch >> 8) & 0x000F];
3219 *p++ = hexdigits[(ch >> 4) & 0x000F];
3220 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003221 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003222
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003223 /* Map special whitespace to '\t', \n', '\r' */
3224 else if (ch == '\t') {
3225 *p++ = '\\';
3226 *p++ = 't';
3227 }
3228 else if (ch == '\n') {
3229 *p++ = '\\';
3230 *p++ = 'n';
3231 }
3232 else if (ch == '\r') {
3233 *p++ = '\\';
3234 *p++ = 'r';
3235 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003236
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003237 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003238 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003239 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003240 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003241 *p++ = hexdigits[(ch >> 4) & 0x000F];
3242 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003243 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003244
Guido van Rossumd57fd912000-03-10 22:53:23 +00003245 /* Copy everything else as-is */
3246 else
3247 *p++ = (char) ch;
3248 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003249
Christian Heimes72b710a2008-05-26 13:28:38 +00003250 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(repr),
Christian Heimes9c4756e2008-05-26 13:22:05 +00003251 p - PyByteArray_AS_STRING(repr));
Guido van Rossum98297ee2007-11-06 21:34:58 +00003252 Py_DECREF(repr);
3253 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003254}
3255
Guido van Rossumd57fd912000-03-10 22:53:23 +00003256PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3257{
Walter Dörwald79e913e2007-05-12 11:08:06 +00003258 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003259 if (!PyUnicode_Check(unicode)) {
3260 PyErr_BadArgument();
3261 return NULL;
3262 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003263 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3264 PyUnicode_GET_SIZE(unicode));
3265
3266 if (!s)
3267 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00003268 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(s),
Christian Heimes9c4756e2008-05-26 13:22:05 +00003269 PyByteArray_GET_SIZE(s));
Walter Dörwald79e913e2007-05-12 11:08:06 +00003270 Py_DECREF(s);
3271 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003272}
3273
3274/* --- Raw Unicode Escape Codec ------------------------------------------- */
3275
3276PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003277 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003278 const char *errors)
3279{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003280 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003281 Py_ssize_t startinpos;
3282 Py_ssize_t endinpos;
3283 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003284 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003285 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003286 const char *end;
3287 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003288 PyObject *errorHandler = NULL;
3289 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003290
Guido van Rossumd57fd912000-03-10 22:53:23 +00003291 /* Escaped strings will always be longer than the resulting
3292 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003293 length after conversion to the true value. (But decoding error
3294 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003295 v = _PyUnicode_New(size);
3296 if (v == NULL)
3297 goto onError;
3298 if (size == 0)
3299 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003300 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003301 end = s + size;
3302 while (s < end) {
3303 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00003304 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003305 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003306 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003307
3308 /* Non-escape characters are interpreted as Unicode ordinals */
3309 if (*s != '\\') {
3310 *p++ = (unsigned char)*s++;
3311 continue;
3312 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003313 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003314
3315 /* \u-escapes are only interpreted iff the number of leading
3316 backslashes if odd */
3317 bs = s;
3318 for (;s < end;) {
3319 if (*s != '\\')
3320 break;
3321 *p++ = (unsigned char)*s++;
3322 }
3323 if (((s - bs) & 1) == 0 ||
3324 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003325 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003326 continue;
3327 }
3328 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003329 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003330 s++;
3331
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003332 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003333 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003334 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003335 c = (unsigned char)*s;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003336 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003337 endinpos = s-starts;
3338 if (unicode_decode_call_errorhandler(
3339 errors, &errorHandler,
3340 "rawunicodeescape", "truncated \\uXXXX",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003341 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003342 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003343 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003344 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003345 }
3346 x = (x<<4) & ~0xF;
3347 if (c >= '0' && c <= '9')
3348 x += c - '0';
3349 else if (c >= 'a' && c <= 'f')
3350 x += 10 + c - 'a';
3351 else
3352 x += 10 + c - 'A';
3353 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00003354 if (x <= 0xffff)
3355 /* UCS-2 character */
3356 *p++ = (Py_UNICODE) x;
3357 else if (x <= 0x10ffff) {
3358 /* UCS-4 character. Either store directly, or as
3359 surrogate pair. */
3360#ifdef Py_UNICODE_WIDE
Christian Heimescc47b052008-03-25 14:56:36 +00003361 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003362#else
3363 x -= 0x10000L;
3364 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3365 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3366#endif
3367 } else {
3368 endinpos = s-starts;
3369 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003370 if (unicode_decode_call_errorhandler(
3371 errors, &errorHandler,
3372 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003373 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003374 (PyObject **)&v, &outpos, &p))
3375 goto onError;
3376 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003377 nextByte:
3378 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003379 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003380 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003381 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003382 Py_XDECREF(errorHandler);
3383 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003384 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003385
Guido van Rossumd57fd912000-03-10 22:53:23 +00003386 onError:
3387 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003388 Py_XDECREF(errorHandler);
3389 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003390 return NULL;
3391}
3392
3393PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003394 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003395{
Guido van Rossum98297ee2007-11-06 21:34:58 +00003396 PyObject *repr, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003397 char *p;
3398 char *q;
3399
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003400#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003401 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003402#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003403 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003404#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003405
3406 if (size > PY_SSIZE_T_MAX / expandsize)
3407 return PyErr_NoMemory();
3408
3409 repr = PyByteArray_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003410 if (repr == NULL)
3411 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003412 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003413 goto done;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003414
Christian Heimes9c4756e2008-05-26 13:22:05 +00003415 p = q = PyByteArray_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003416 while (size-- > 0) {
3417 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003418#ifdef Py_UNICODE_WIDE
3419 /* Map 32-bit characters to '\Uxxxxxxxx' */
3420 if (ch >= 0x10000) {
3421 *p++ = '\\';
3422 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003423 *p++ = hexdigits[(ch >> 28) & 0xf];
3424 *p++ = hexdigits[(ch >> 24) & 0xf];
3425 *p++ = hexdigits[(ch >> 20) & 0xf];
3426 *p++ = hexdigits[(ch >> 16) & 0xf];
3427 *p++ = hexdigits[(ch >> 12) & 0xf];
3428 *p++ = hexdigits[(ch >> 8) & 0xf];
3429 *p++ = hexdigits[(ch >> 4) & 0xf];
3430 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003431 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003432 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00003433#else
3434 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3435 if (ch >= 0xD800 && ch < 0xDC00) {
3436 Py_UNICODE ch2;
3437 Py_UCS4 ucs;
3438
3439 ch2 = *s++;
3440 size--;
3441 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3442 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3443 *p++ = '\\';
3444 *p++ = 'U';
3445 *p++ = hexdigits[(ucs >> 28) & 0xf];
3446 *p++ = hexdigits[(ucs >> 24) & 0xf];
3447 *p++ = hexdigits[(ucs >> 20) & 0xf];
3448 *p++ = hexdigits[(ucs >> 16) & 0xf];
3449 *p++ = hexdigits[(ucs >> 12) & 0xf];
3450 *p++ = hexdigits[(ucs >> 8) & 0xf];
3451 *p++ = hexdigits[(ucs >> 4) & 0xf];
3452 *p++ = hexdigits[ucs & 0xf];
3453 continue;
3454 }
3455 /* Fall through: isolated surrogates are copied as-is */
3456 s--;
3457 size++;
3458 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003459#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003460 /* Map 16-bit characters to '\uxxxx' */
3461 if (ch >= 256) {
3462 *p++ = '\\';
3463 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003464 *p++ = hexdigits[(ch >> 12) & 0xf];
3465 *p++ = hexdigits[(ch >> 8) & 0xf];
3466 *p++ = hexdigits[(ch >> 4) & 0xf];
3467 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003468 }
3469 /* Copy everything else as-is */
3470 else
3471 *p++ = (char) ch;
3472 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003473 size = p - q;
3474
3475 done:
Christian Heimes72b710a2008-05-26 13:28:38 +00003476 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(repr), size);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003477 Py_DECREF(repr);
3478 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003479}
3480
3481PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3482{
Walter Dörwald711005d2007-05-12 12:03:26 +00003483 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003484 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003485 PyErr_BadArgument();
3486 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003487 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003488 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3489 PyUnicode_GET_SIZE(unicode));
3490
3491 if (!s)
3492 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00003493 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(s),
Christian Heimes9c4756e2008-05-26 13:22:05 +00003494 PyByteArray_GET_SIZE(s));
Walter Dörwald711005d2007-05-12 12:03:26 +00003495 Py_DECREF(s);
3496 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003497}
3498
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003499/* --- Unicode Internal Codec ------------------------------------------- */
3500
3501PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003502 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003503 const char *errors)
3504{
3505 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003506 Py_ssize_t startinpos;
3507 Py_ssize_t endinpos;
3508 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003509 PyUnicodeObject *v;
3510 Py_UNICODE *p;
3511 const char *end;
3512 const char *reason;
3513 PyObject *errorHandler = NULL;
3514 PyObject *exc = NULL;
3515
Neal Norwitzd43069c2006-01-08 01:12:10 +00003516#ifdef Py_UNICODE_WIDE
3517 Py_UNICODE unimax = PyUnicode_GetMax();
3518#endif
3519
Thomas Wouters89f507f2006-12-13 04:49:30 +00003520 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003521 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3522 if (v == NULL)
3523 goto onError;
3524 if (PyUnicode_GetSize((PyObject *)v) == 0)
3525 return (PyObject *)v;
3526 p = PyUnicode_AS_UNICODE(v);
3527 end = s + size;
3528
3529 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00003530 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003531 /* We have to sanity check the raw data, otherwise doom looms for
3532 some malformed UCS-4 data. */
3533 if (
3534 #ifdef Py_UNICODE_WIDE
3535 *p > unimax || *p < 0 ||
3536 #endif
3537 end-s < Py_UNICODE_SIZE
3538 )
3539 {
3540 startinpos = s - starts;
3541 if (end-s < Py_UNICODE_SIZE) {
3542 endinpos = end-starts;
3543 reason = "truncated input";
3544 }
3545 else {
3546 endinpos = s - starts + Py_UNICODE_SIZE;
3547 reason = "illegal code point (> 0x10FFFF)";
3548 }
3549 outpos = p - PyUnicode_AS_UNICODE(v);
3550 if (unicode_decode_call_errorhandler(
3551 errors, &errorHandler,
3552 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003553 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003554 (PyObject **)&v, &outpos, &p)) {
3555 goto onError;
3556 }
3557 }
3558 else {
3559 p++;
3560 s += Py_UNICODE_SIZE;
3561 }
3562 }
3563
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003564 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003565 goto onError;
3566 Py_XDECREF(errorHandler);
3567 Py_XDECREF(exc);
3568 return (PyObject *)v;
3569
3570 onError:
3571 Py_XDECREF(v);
3572 Py_XDECREF(errorHandler);
3573 Py_XDECREF(exc);
3574 return NULL;
3575}
3576
Guido van Rossumd57fd912000-03-10 22:53:23 +00003577/* --- Latin-1 Codec ------------------------------------------------------ */
3578
3579PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003580 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003581 const char *errors)
3582{
3583 PyUnicodeObject *v;
3584 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003585
Guido van Rossumd57fd912000-03-10 22:53:23 +00003586 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003587 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003588 Py_UNICODE r = *(unsigned char*)s;
3589 return PyUnicode_FromUnicode(&r, 1);
3590 }
3591
Guido van Rossumd57fd912000-03-10 22:53:23 +00003592 v = _PyUnicode_New(size);
3593 if (v == NULL)
3594 goto onError;
3595 if (size == 0)
3596 return (PyObject *)v;
3597 p = PyUnicode_AS_UNICODE(v);
3598 while (size-- > 0)
3599 *p++ = (unsigned char)*s++;
3600 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003601
Guido van Rossumd57fd912000-03-10 22:53:23 +00003602 onError:
3603 Py_XDECREF(v);
3604 return NULL;
3605}
3606
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003607/* create or adjust a UnicodeEncodeError */
3608static void make_encode_exception(PyObject **exceptionObject,
3609 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003610 const Py_UNICODE *unicode, Py_ssize_t size,
3611 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003612 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003613{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003614 if (*exceptionObject == NULL) {
3615 *exceptionObject = PyUnicodeEncodeError_Create(
3616 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003617 }
3618 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003619 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3620 goto onError;
3621 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3622 goto onError;
3623 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3624 goto onError;
3625 return;
3626 onError:
3627 Py_DECREF(*exceptionObject);
3628 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003629 }
3630}
3631
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003632/* raises a UnicodeEncodeError */
3633static void raise_encode_exception(PyObject **exceptionObject,
3634 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003635 const Py_UNICODE *unicode, Py_ssize_t size,
3636 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003637 const char *reason)
3638{
3639 make_encode_exception(exceptionObject,
3640 encoding, unicode, size, startpos, endpos, reason);
3641 if (*exceptionObject != NULL)
3642 PyCodec_StrictErrors(*exceptionObject);
3643}
3644
3645/* error handling callback helper:
3646 build arguments, call the callback and check the arguments,
3647 put the result into newpos and return the replacement string, which
3648 has to be freed by the caller */
3649static PyObject *unicode_encode_call_errorhandler(const char *errors,
3650 PyObject **errorHandler,
3651 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003652 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3653 Py_ssize_t startpos, Py_ssize_t endpos,
3654 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003655{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003656 static char *argparse = "O!n;encoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003657
3658 PyObject *restuple;
3659 PyObject *resunicode;
3660
3661 if (*errorHandler == NULL) {
3662 *errorHandler = PyCodec_LookupError(errors);
3663 if (*errorHandler == NULL)
3664 return NULL;
3665 }
3666
3667 make_encode_exception(exceptionObject,
3668 encoding, unicode, size, startpos, endpos, reason);
3669 if (*exceptionObject == NULL)
3670 return NULL;
3671
3672 restuple = PyObject_CallFunctionObjArgs(
3673 *errorHandler, *exceptionObject, NULL);
3674 if (restuple == NULL)
3675 return NULL;
3676 if (!PyTuple_Check(restuple)) {
3677 PyErr_Format(PyExc_TypeError, &argparse[4]);
3678 Py_DECREF(restuple);
3679 return NULL;
3680 }
3681 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3682 &resunicode, newpos)) {
3683 Py_DECREF(restuple);
3684 return NULL;
3685 }
3686 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003687 *newpos = size+*newpos;
3688 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003689 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003690 Py_DECREF(restuple);
3691 return NULL;
3692 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003693 Py_INCREF(resunicode);
3694 Py_DECREF(restuple);
3695 return resunicode;
3696}
3697
3698static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003699 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003700 const char *errors,
3701 int limit)
3702{
3703 /* output object */
3704 PyObject *res;
3705 /* pointers to the beginning and end+1 of input */
3706 const Py_UNICODE *startp = p;
3707 const Py_UNICODE *endp = p + size;
3708 /* pointer to the beginning of the unencodable characters */
3709 /* const Py_UNICODE *badp = NULL; */
3710 /* pointer into the output */
3711 char *str;
3712 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003713 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003714 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3715 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003716 PyObject *errorHandler = NULL;
3717 PyObject *exc = NULL;
Guido van Rossum98297ee2007-11-06 21:34:58 +00003718 PyObject *result = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003719 /* the following variable is used for caching string comparisons
3720 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3721 int known_errorHandler = -1;
3722
3723 /* allocate enough for a simple encoding without
3724 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00003725 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00003726 return PyBytes_FromStringAndSize(NULL, 0);
Christian Heimes9c4756e2008-05-26 13:22:05 +00003727 res = PyByteArray_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003728 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003729 return NULL;
Christian Heimes9c4756e2008-05-26 13:22:05 +00003730 str = PyByteArray_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003731 ressize = size;
3732
3733 while (p<endp) {
3734 Py_UNICODE c = *p;
3735
3736 /* can we encode this? */
3737 if (c<limit) {
3738 /* no overflow check, because we know that the space is enough */
3739 *str++ = (char)c;
3740 ++p;
3741 }
3742 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003743 Py_ssize_t unicodepos = p-startp;
3744 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003745 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003746 Py_ssize_t repsize;
3747 Py_ssize_t newpos;
3748 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003749 Py_UNICODE *uni2;
3750 /* startpos for collecting unencodable chars */
3751 const Py_UNICODE *collstart = p;
3752 const Py_UNICODE *collend = p;
3753 /* find all unecodable characters */
3754 while ((collend < endp) && ((*collend)>=limit))
3755 ++collend;
3756 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3757 if (known_errorHandler==-1) {
3758 if ((errors==NULL) || (!strcmp(errors, "strict")))
3759 known_errorHandler = 1;
3760 else if (!strcmp(errors, "replace"))
3761 known_errorHandler = 2;
3762 else if (!strcmp(errors, "ignore"))
3763 known_errorHandler = 3;
3764 else if (!strcmp(errors, "xmlcharrefreplace"))
3765 known_errorHandler = 4;
3766 else
3767 known_errorHandler = 0;
3768 }
3769 switch (known_errorHandler) {
3770 case 1: /* strict */
3771 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3772 goto onError;
3773 case 2: /* replace */
3774 while (collstart++<collend)
3775 *str++ = '?'; /* fall through */
3776 case 3: /* ignore */
3777 p = collend;
3778 break;
3779 case 4: /* xmlcharrefreplace */
Christian Heimes9c4756e2008-05-26 13:22:05 +00003780 respos = str - PyByteArray_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003781 /* determine replacement size (temporarily (mis)uses p) */
3782 for (p = collstart, repsize = 0; p < collend; ++p) {
3783 if (*p<10)
3784 repsize += 2+1+1;
3785 else if (*p<100)
3786 repsize += 2+2+1;
3787 else if (*p<1000)
3788 repsize += 2+3+1;
3789 else if (*p<10000)
3790 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003791#ifndef Py_UNICODE_WIDE
3792 else
3793 repsize += 2+5+1;
3794#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003795 else if (*p<100000)
3796 repsize += 2+5+1;
3797 else if (*p<1000000)
3798 repsize += 2+6+1;
3799 else
3800 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003801#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003802 }
3803 requiredsize = respos+repsize+(endp-collend);
3804 if (requiredsize > ressize) {
3805 if (requiredsize<2*ressize)
3806 requiredsize = 2*ressize;
Christian Heimes9c4756e2008-05-26 13:22:05 +00003807 if (PyByteArray_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003808 goto onError;
Christian Heimes9c4756e2008-05-26 13:22:05 +00003809 str = PyByteArray_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003810 ressize = requiredsize;
3811 }
3812 /* generate replacement (temporarily (mis)uses p) */
3813 for (p = collstart; p < collend; ++p) {
3814 str += sprintf(str, "&#%d;", (int)*p);
3815 }
3816 p = collend;
3817 break;
3818 default:
3819 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3820 encoding, reason, startp, size, &exc,
3821 collstart-startp, collend-startp, &newpos);
3822 if (repunicode == NULL)
3823 goto onError;
3824 /* need more space? (at least enough for what we
3825 have+the replacement+the rest of the string, so
3826 we won't have to check space for encodable characters) */
Christian Heimes9c4756e2008-05-26 13:22:05 +00003827 respos = str - PyByteArray_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003828 repsize = PyUnicode_GET_SIZE(repunicode);
3829 requiredsize = respos+repsize+(endp-collend);
3830 if (requiredsize > ressize) {
3831 if (requiredsize<2*ressize)
3832 requiredsize = 2*ressize;
Christian Heimes9c4756e2008-05-26 13:22:05 +00003833 if (PyByteArray_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003834 Py_DECREF(repunicode);
3835 goto onError;
3836 }
Christian Heimes9c4756e2008-05-26 13:22:05 +00003837 str = PyByteArray_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003838 ressize = requiredsize;
3839 }
3840 /* check if there is anything unencodable in the replacement
3841 and copy it to the output */
3842 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3843 c = *uni2;
3844 if (c >= limit) {
3845 raise_encode_exception(&exc, encoding, startp, size,
3846 unicodepos, unicodepos+1, reason);
3847 Py_DECREF(repunicode);
3848 goto onError;
3849 }
3850 *str = (char)c;
3851 }
3852 p = startp + newpos;
3853 Py_DECREF(repunicode);
3854 }
3855 }
3856 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003857 result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(res),
Christian Heimes9c4756e2008-05-26 13:22:05 +00003858 str - PyByteArray_AS_STRING(res));
Guido van Rossum98297ee2007-11-06 21:34:58 +00003859 onError:
3860 Py_DECREF(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003861 Py_XDECREF(errorHandler);
3862 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003863 return result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003864}
3865
Guido van Rossumd57fd912000-03-10 22:53:23 +00003866PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003867 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003868 const char *errors)
3869{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003870 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003871}
3872
3873PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3874{
3875 if (!PyUnicode_Check(unicode)) {
3876 PyErr_BadArgument();
3877 return NULL;
3878 }
3879 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3880 PyUnicode_GET_SIZE(unicode),
3881 NULL);
3882}
3883
3884/* --- 7-bit ASCII Codec -------------------------------------------------- */
3885
Guido van Rossumd57fd912000-03-10 22:53:23 +00003886PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003887 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003888 const char *errors)
3889{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003890 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003891 PyUnicodeObject *v;
3892 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003893 Py_ssize_t startinpos;
3894 Py_ssize_t endinpos;
3895 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003896 const char *e;
3897 PyObject *errorHandler = NULL;
3898 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003899
Guido van Rossumd57fd912000-03-10 22:53:23 +00003900 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003901 if (size == 1 && *(unsigned char*)s < 128) {
3902 Py_UNICODE r = *(unsigned char*)s;
3903 return PyUnicode_FromUnicode(&r, 1);
3904 }
Tim Petersced69f82003-09-16 20:30:58 +00003905
Guido van Rossumd57fd912000-03-10 22:53:23 +00003906 v = _PyUnicode_New(size);
3907 if (v == NULL)
3908 goto onError;
3909 if (size == 0)
3910 return (PyObject *)v;
3911 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003912 e = s + size;
3913 while (s < e) {
3914 register unsigned char c = (unsigned char)*s;
3915 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003916 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003917 ++s;
3918 }
3919 else {
3920 startinpos = s-starts;
3921 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003922 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003923 if (unicode_decode_call_errorhandler(
3924 errors, &errorHandler,
3925 "ascii", "ordinal not in range(128)",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003926 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003927 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003928 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003929 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003930 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00003931 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003932 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003933 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003934 Py_XDECREF(errorHandler);
3935 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003936 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003937
Guido van Rossumd57fd912000-03-10 22:53:23 +00003938 onError:
3939 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003940 Py_XDECREF(errorHandler);
3941 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003942 return NULL;
3943}
3944
Guido van Rossumd57fd912000-03-10 22:53:23 +00003945PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003946 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003947 const char *errors)
3948{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003949 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003950}
3951
3952PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3953{
3954 if (!PyUnicode_Check(unicode)) {
3955 PyErr_BadArgument();
3956 return NULL;
3957 }
3958 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3959 PyUnicode_GET_SIZE(unicode),
3960 NULL);
3961}
3962
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003963#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003964
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003965/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003966
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003967#if SIZEOF_INT < SIZEOF_SSIZE_T
3968#define NEED_RETRY
3969#endif
3970
3971/* XXX This code is limited to "true" double-byte encodings, as
3972 a) it assumes an incomplete character consists of a single byte, and
3973 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3974 encodings, see IsDBCSLeadByteEx documentation. */
3975
/* Return non-zero if the byte at s[offset] acts as a DBCS lead byte.
   A byte that IsDBCSLeadByte() accepts only counts when the character
   before it (located with CharPrev) does not swallow it as a trail byte. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;               /* CharPrev did not move back */
    if (!IsDBCSLeadByte(*prev))
        return 1;               /* previous character is not a lead byte */
    return curr - prev == 2;    /* previous character is two bytes long */
}
3986
3987/*
3988 * Decode MBCS string into unicode object. If 'final' is set, converts
3989 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3990 */
3991static int decode_mbcs(PyUnicodeObject **v,
3992 const char *s, /* MBCS string */
3993 int size, /* sizeof MBCS string */
3994 int final)
3995{
3996 Py_UNICODE *p;
3997 Py_ssize_t n = 0;
3998 int usize = 0;
3999
4000 assert(size >= 0);
4001
4002 /* Skip trailing lead-byte unless 'final' is set */
4003 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
4004 --size;
4005
4006 /* First get the size of the result */
4007 if (size > 0) {
4008 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
4009 if (usize == 0) {
4010 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4011 return -1;
4012 }
4013 }
4014
4015 if (*v == NULL) {
4016 /* Create unicode object */
4017 *v = _PyUnicode_New(usize);
4018 if (*v == NULL)
4019 return -1;
4020 }
4021 else {
4022 /* Extend unicode object */
4023 n = PyUnicode_GET_SIZE(*v);
4024 if (_PyUnicode_Resize(v, n + usize) < 0)
4025 return -1;
4026 }
4027
4028 /* Do the conversion */
4029 if (size > 0) {
4030 p = PyUnicode_AS_UNICODE(*v) + n;
4031 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
4032 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4033 return -1;
4034 }
4035 }
4036
4037 return size;
4038}
4039
4040PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
4041 Py_ssize_t size,
4042 const char *errors,
4043 Py_ssize_t *consumed)
4044{
4045 PyUnicodeObject *v = NULL;
4046 int done;
4047
4048 if (consumed)
4049 *consumed = 0;
4050
4051#ifdef NEED_RETRY
4052 retry:
4053 if (size > INT_MAX)
4054 done = decode_mbcs(&v, s, INT_MAX, 0);
4055 else
4056#endif
4057 done = decode_mbcs(&v, s, (int)size, !consumed);
4058
4059 if (done < 0) {
4060 Py_XDECREF(v);
4061 return NULL;
4062 }
4063
4064 if (consumed)
4065 *consumed += done;
4066
4067#ifdef NEED_RETRY
4068 if (size > INT_MAX) {
4069 s += done;
4070 size -= done;
4071 goto retry;
4072 }
4073#endif
4074
4075 return (PyObject *)v;
4076}
4077
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004078PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004079 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004080 const char *errors)
4081{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004082 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4083}
4084
4085/*
4086 * Convert unicode into string object (MBCS).
4087 * Returns 0 if succeed, -1 otherwise.
4088 */
4089static int encode_mbcs(PyObject **repr,
4090 const Py_UNICODE *p, /* unicode */
4091 int size) /* size of unicode */
4092{
4093 int mbcssize = 0;
4094 Py_ssize_t n = 0;
4095
4096 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004097
4098 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004099 if (size > 0) {
4100 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4101 if (mbcssize == 0) {
4102 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4103 return -1;
4104 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004105 }
4106
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004107 if (*repr == NULL) {
4108 /* Create string object */
Christian Heimes72b710a2008-05-26 13:28:38 +00004109 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004110 if (*repr == NULL)
4111 return -1;
4112 }
4113 else {
4114 /* Extend string object */
Christian Heimes72b710a2008-05-26 13:28:38 +00004115 n = PyBytes_Size(*repr);
4116 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004117 return -1;
4118 }
4119
4120 /* Do the conversion */
4121 if (size > 0) {
Christian Heimes72b710a2008-05-26 13:28:38 +00004122 char *s = PyBytes_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004123 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4124 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4125 return -1;
4126 }
4127 }
4128
4129 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004130}
4131
4132PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004133 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004134 const char *errors)
4135{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004136 PyObject *repr = NULL;
4137 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004138
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004139#ifdef NEED_RETRY
4140 retry:
4141 if (size > INT_MAX)
4142 ret = encode_mbcs(&repr, p, INT_MAX);
4143 else
4144#endif
4145 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004146
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004147 if (ret < 0) {
4148 Py_XDECREF(repr);
4149 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004150 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004151
4152#ifdef NEED_RETRY
4153 if (size > INT_MAX) {
4154 p += INT_MAX;
4155 size -= INT_MAX;
4156 goto retry;
4157 }
4158#endif
4159
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004160 return repr;
4161}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004162
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004163PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4164{
4165 if (!PyUnicode_Check(unicode)) {
4166 PyErr_BadArgument();
4167 return NULL;
4168 }
4169 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
4170 PyUnicode_GET_SIZE(unicode),
4171 NULL);
4172}
4173
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004174#undef NEED_RETRY
4175
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004176#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004177
Guido van Rossumd57fd912000-03-10 22:53:23 +00004178/* --- Character Mapping Codec -------------------------------------------- */
4179
/* Decode `size` bytes at `s` through a character mapping.

   mapping == NULL   -> plain Latin-1 decoding.
   exact unicode str -> used as a lookup table indexed by byte value; a byte
                        beyond the table or mapped to 0xfffe is undefined.
   any other object  -> subscripted with the byte value as an int; the item
                        may be an int (0..65535), a unicode string (of any
                        length, including empty) or None; a LookupError is
                        treated like None (undefined).

   Undefined mappings are passed to the error handler selected by `errors`.
   Returns a new reference, or NULL with an exception set. */
PyObject *PyUnicode_DecodeCharmap(const char *s,
                                  Py_ssize_t size,
                                  PyObject *mapping,
                                  const char *errors)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyUnicodeObject *v;
    Py_UNICODE *p;
    /* spare output characters left over from the last 1-n resize */
    Py_ssize_t extrachars = 0;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    Py_UNICODE *mapstring = NULL;
    Py_ssize_t maplen = 0;

    /* Default to Latin-1 */
    if (mapping == NULL)
        return PyUnicode_DecodeLatin1(s, size, errors);

    /* start with one output character per input byte */
    v = _PyUnicode_New(size);
    if (v == NULL)
        goto onError;
    if (size == 0)
        return (PyObject *)v;
    p = PyUnicode_AS_UNICODE(v);
    e = s + size;
    if (PyUnicode_CheckExact(mapping)) {
        /* fast path: the mapping is a string used as a lookup table */
        mapstring = PyUnicode_AS_UNICODE(mapping);
        maplen = PyUnicode_GET_SIZE(mapping);
        while (s < e) {
            unsigned char ch = *s;
            Py_UNICODE x = 0xfffe; /* illegal value */

            if (ch < maplen)
                x = mapstring[ch];

            if (x == 0xfffe) {
                /* undefined mapping */
                outpos = p-PyUnicode_AS_UNICODE(v);
                startinpos = s-starts;
                endinpos = startinpos+1;
                /* the handler receives pointers to the input bounds, the
                   read position, the output object and the write position
                   so that it can update them */
                if (unicode_decode_call_errorhandler(
                        errors, &errorHandler,
                        "charmap", "character maps to <undefined>",
                        &starts, &e, &startinpos, &endinpos, &exc, &s,
                        (PyObject **)&v, &outpos, &p)) {
                    goto onError;
                }
                continue;
            }
            *p++ = x;
            ++s;
        }
    }
    else {
        /* generic path: look every byte up in an arbitrary mapping object */
        while (s < e) {
            unsigned char ch = *s;
            PyObject *w, *x;

            /* Get mapping (char ordinal -> integer, Unicode char or None) */
            w = PyLong_FromLong((long)ch);
            if (w == NULL)
                goto onError;
            x = PyObject_GetItem(mapping, w);
            Py_DECREF(w);
            if (x == NULL) {
                if (PyErr_ExceptionMatches(PyExc_LookupError)) {
                    /* No mapping found means: mapping is undefined. */
                    PyErr_Clear();
                    x = Py_None;
                    Py_INCREF(x);
                } else
                    goto onError;
            }

            /* Apply mapping; x is an owned reference from here on */
            if (PyLong_Check(x)) {
                long value = PyLong_AS_LONG(x);
                if (value < 0 || value > 65535) {
                    PyErr_SetString(PyExc_TypeError,
                                    "character mapping must be in range(65536)");
                    Py_DECREF(x);
                    goto onError;
                }
                *p++ = (Py_UNICODE)value;
            }
            else if (x == Py_None) {
                /* undefined mapping */
                outpos = p-PyUnicode_AS_UNICODE(v);
                startinpos = s-starts;
                endinpos = startinpos+1;
                if (unicode_decode_call_errorhandler(
                        errors, &errorHandler,
                        "charmap", "character maps to <undefined>",
                        &starts, &e, &startinpos, &endinpos, &exc, &s,
                        (PyObject **)&v, &outpos, &p)) {
                    Py_DECREF(x);
                    goto onError;
                }
                Py_DECREF(x);
                continue;
            }
            else if (PyUnicode_Check(x)) {
                Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);

                if (targetsize == 1)
                    /* 1-1 mapping */
                    *p++ = *PyUnicode_AS_UNICODE(x);

                else if (targetsize > 1) {
                    /* 1-n mapping */
                    if (targetsize > extrachars) {
                        /* resize first; over-allocate by 4*targetsize so
                           that later 1-n mappings can reuse the slack */
                        Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
                        Py_ssize_t needed = (targetsize - extrachars) + \
                            (targetsize << 2);
                        extrachars += needed;
                        /* XXX overflow detection missing */
                        if (_PyUnicode_Resize(&v,
                                              PyUnicode_GET_SIZE(v) + needed) < 0) {
                            Py_DECREF(x);
                            goto onError;
                        }
                        /* the buffer may have moved: recompute the write pointer */
                        p = PyUnicode_AS_UNICODE(v) + oldpos;
                    }
                    Py_UNICODE_COPY(p,
                                    PyUnicode_AS_UNICODE(x),
                                    targetsize);
                    p += targetsize;
                    extrachars -= targetsize;
                }
                /* 1-0 mapping: skip the character */
            }
            else {
                /* wrong return value */
                PyErr_SetString(PyExc_TypeError,
                                "character mapping must return integer, None or str");
                Py_DECREF(x);
                goto onError;
            }
            Py_DECREF(x);
            ++s;
        }
    }
    /* shrink the result if fewer characters were written than allocated */
    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
        if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
            goto onError;
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)v;

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_XDECREF(v);
    return NULL;
}
4340
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004341/* Charmap encoding: the lookup table */
4342
/* Three-level trie mapping a 16-bit character to a byte value.
   The character is split into its top 5 bits (index into level1),
   the next 4 bits (index into a 16-entry level-2 block) and the low
   7 bits (index into a 128-entry level-3 block). */
struct encoding_map{
    PyObject_HEAD
    /* level-2 block number for each top-5-bit prefix; 0xFF = no block */
    unsigned char level1[32];
    /* number of level-2 and level-3 blocks stored in level23 */
    int count2, count3;
    /* variable-length tail: 16*count2 bytes of level-2 data followed by
       128*count3 bytes of level-3 data; declared with one element, the
       object is allocated with the extra space */
    unsigned char level23[1];
};
4349
4350static PyObject*
4351encoding_map_size(PyObject *obj, PyObject* args)
4352{
4353 struct encoding_map *map = (struct encoding_map*)obj;
Christian Heimes217cfd12007-12-02 14:31:20 +00004354 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004355 128*map->count3);
4356}
4357
/* Method table of the EncodingMap type: a single argument-less size()
   method, followed by the zeroed sentinel entry. */
static PyMethodDef encoding_map_methods[] = {
    {"size", encoding_map_size, METH_NOARGS,
     PyDoc_STR("Return the size (in bytes) of this object") },
    { 0 }
};
4363
4364static void
4365encoding_map_dealloc(PyObject* o)
4366{
4367 PyObject_FREE(o);
4368}
4369
/* Type object for the trie built by PyUnicode_BuildEncodingMap().
   Only deallocation and the method table are provided; tp_new is 0. */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,   /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_compare*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};
4413
/* Build an encoding table from a 256-character decoding string, where
   string[i] is the character that byte i decodes to (0xFFFE marks an
   unmapped byte).

   Returns a new reference to either an EncodingMap trie or, when the trie
   cannot represent the mapping, a dict {character ordinal: byte value}.
   Returns NULL with an exception set on error (including an argument that
   is not a unicode string of length 256). */
PyObject*
PyUnicode_BuildEncodingMap(PyObject* string)
{
    Py_UNICODE *decode;
    PyObject *result;
    struct encoding_map *mresult;
    int i;
    int need_dict = 0;
    /* scratch tables used only to count the level-2/level-3 blocks needed */
    unsigned char level1[32];
    unsigned char level2[512];
    unsigned char *mlevel1, *mlevel2, *mlevel3;
    int count2 = 0, count3 = 0;

    if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
        PyErr_BadArgument();
        return NULL;
    }
    decode = PyUnicode_AS_UNICODE(string);
    /* 0xFF marks "no block assigned yet" */
    memset(level1, 0xFF, sizeof level1);
    memset(level2, 0xFF, sizeof level2);

    /* If there isn't a one-to-one mapping of NULL to \0,
       or if there are non-BMP characters, we need to use
       a mapping dictionary. */
    if (decode[0] != 0)
        need_dict = 1;
    for (i = 1; i < 256; i++) {
        int l1, l2;
        if (decode[i] == 0
#ifdef Py_UNICODE_WIDE
            || decode[i] > 0xFFFF
#endif
            ) {
            need_dict = 1;
            break;
        }
        if (decode[i] == 0xFFFE)
            /* unmapped character */
            continue;
        /* top 5 bits select the level-1 slot, top 9 bits the level-2 slot */
        l1 = decode[i] >> 11;
        l2 = decode[i] >> 7;
        if (level1[l1] == 0xFF)
            level1[l1] = count2++;
        if (level2[l2] == 0xFF)
            level2[l2] = count3++;
    }

    /* block numbers are stored in a byte and 0xFF is reserved as "empty" */
    if (count2 >= 0xFF || count3 >= 0xFF)
        need_dict = 1;

    if (need_dict) {
        /* fallback: plain dict from character ordinal to byte value
           (this inner `result` shadows the outer variable of the same name) */
        PyObject *result = PyDict_New();
        PyObject *key, *value;
        if (!result)
            return NULL;
        for (i = 0; i < 256; i++) {
            key = value = NULL;
            key = PyLong_FromLong(decode[i]);
            value = PyLong_FromLong(i);
            if (!key || !value)
                goto failed1;
            if (PyDict_SetItem(result, key, value) == -1)
                goto failed1;
            Py_DECREF(key);
            Py_DECREF(value);
        }
        return result;
      failed1:
        Py_XDECREF(key);
        Py_XDECREF(value);
        Py_DECREF(result);
        return NULL;
    }

    /* Create a three-level trie */
    /* the "- 1" accounts for the one-element level23 placeholder that is
       already included in sizeof(struct encoding_map) */
    result = PyObject_MALLOC(sizeof(struct encoding_map) +
                             16*count2 + 128*count3 - 1);
    if (!result)
        return PyErr_NoMemory();
    PyObject_Init(result, &EncodingMapType);
    mresult = (struct encoding_map*)result;
    mresult->count2 = count2;
    mresult->count3 = count3;
    mlevel1 = mresult->level1;
    mlevel2 = mresult->level23;
    mlevel3 = mresult->level23 + 16*count2;
    memcpy(mlevel1, level1, 32);
    /* level 2 starts out empty (0xFF); level 3 starts out unmapped (0) */
    memset(mlevel2, 0xFF, 16*count2);
    memset(mlevel3, 0, 128*count3);
    /* count3 is reused as the next free level-3 block number */
    count3 = 0;
    for (i = 1; i < 256; i++) {
        int o1, o2, o3, i2, i3;
        if (decode[i] == 0xFFFE)
            /* unmapped character */
            continue;
        o1 = decode[i]>>11;
        o2 = (decode[i]>>7) & 0xF;
        i2 = 16*mlevel1[o1] + o2;
        if (mlevel2[i2] == 0xFF)
            mlevel2[i2] = count3++;
        o3 = decode[i] & 0x7F;
        i3 = 128*mlevel2[i2] + o3;
        /* the leaf holds the byte value that encodes this character */
        mlevel3[i3] = i;
    }
    return result;
}
4520
4521static int
4522encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4523{
4524 struct encoding_map *map = (struct encoding_map*)mapping;
4525 int l1 = c>>11;
4526 int l2 = (c>>7) & 0xF;
4527 int l3 = c & 0x7F;
4528 int i;
4529
4530#ifdef Py_UNICODE_WIDE
4531 if (c > 0xFFFF) {
4532 return -1;
4533 }
4534#endif
4535 if (c == 0)
4536 return 0;
4537 /* level 1*/
4538 i = map->level1[l1];
4539 if (i == 0xFF) {
4540 return -1;
4541 }
4542 /* level 2*/
4543 i = map->level23[16*i+l2];
4544 if (i == 0xFF) {
4545 return -1;
4546 }
4547 /* level 3 */
4548 i = map->level23[16*map->count2 + 128*i + l3];
4549 if (i == 0) {
4550 return -1;
4551 }
4552 return i;
4553}
4554
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004555/* Lookup the character ch in the mapping. If the character
4556 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004557 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004558static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004559{
Christian Heimes217cfd12007-12-02 14:31:20 +00004560 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004561 PyObject *x;
4562
4563 if (w == NULL)
4564 return NULL;
4565 x = PyObject_GetItem(mapping, w);
4566 Py_DECREF(w);
4567 if (x == NULL) {
4568 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4569 /* No mapping found means: mapping is undefined. */
4570 PyErr_Clear();
4571 x = Py_None;
4572 Py_INCREF(x);
4573 return x;
4574 } else
4575 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004576 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004577 else if (x == Py_None)
4578 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00004579 else if (PyLong_Check(x)) {
4580 long value = PyLong_AS_LONG(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004581 if (value < 0 || value > 255) {
4582 PyErr_SetString(PyExc_TypeError,
4583 "character mapping must be in range(256)");
4584 Py_DECREF(x);
4585 return NULL;
4586 }
4587 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004588 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004589 else if (PyBytes_Check(x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004590 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004591 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004592 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00004593 PyErr_Format(PyExc_TypeError,
Christian Heimesf3863112007-11-22 07:46:41 +00004594 "character mapping must return integer, bytes or None, not %.400s",
Walter Dörwald580ceed2007-05-09 10:39:19 +00004595 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004596 Py_DECREF(x);
4597 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004598 }
4599}
4600
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004601static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00004602charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004603{
Christian Heimes72b710a2008-05-26 13:28:38 +00004604 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004605 /* exponentially overallocate to minimize reallocations */
4606 if (requiredsize < 2*outsize)
4607 requiredsize = 2*outsize;
Christian Heimes72b710a2008-05-26 13:28:38 +00004608 if (_PyBytes_Resize(outobj, requiredsize))
Walter Dörwald827b0552007-05-12 13:23:53 +00004609 return -1;
Walter Dörwald827b0552007-05-12 13:23:53 +00004610 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004611}
4612
/* Outcome of trying to emit one character through a charmap encoder. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping has no encoding for the character */
    enc_EXCEPTION   /* an error occurred while looking up or writing */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004616/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00004617 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004618 space is available. Return a new reference to the object that
4619 was put in the output buffer, or Py_None, if the mapping was undefined
4620 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004621 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004622static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004623charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004624 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004625{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004626 PyObject *rep;
4627 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00004628 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004629
Christian Heimes90aa7642007-12-19 02:45:37 +00004630 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004631 int res = encoding_map_lookup(c, mapping);
4632 Py_ssize_t requiredsize = *outpos+1;
4633 if (res == -1)
4634 return enc_FAILED;
Guido van Rossum98297ee2007-11-06 21:34:58 +00004635 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004636 if (charmapencode_resize(outobj, outpos, requiredsize))
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004637 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00004638 outstart = PyBytes_AS_STRING(*outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004639 outstart[(*outpos)++] = (char)res;
4640 return enc_SUCCESS;
4641 }
4642
4643 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004644 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004645 return enc_EXCEPTION;
4646 else if (rep==Py_None) {
4647 Py_DECREF(rep);
4648 return enc_FAILED;
4649 } else {
Christian Heimes217cfd12007-12-02 14:31:20 +00004650 if (PyLong_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004651 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004652 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004653 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004654 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004655 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004656 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004657 outstart = PyBytes_AS_STRING(*outobj);
Christian Heimes217cfd12007-12-02 14:31:20 +00004658 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004659 }
4660 else {
Christian Heimes72b710a2008-05-26 13:28:38 +00004661 const char *repchars = PyBytes_AS_STRING(rep);
4662 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004663 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004664 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004665 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004666 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004667 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004668 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004669 outstart = PyBytes_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004670 memcpy(outstart + *outpos, repchars, repsize);
4671 *outpos += repsize;
4672 }
4673 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004674 Py_DECREF(rep);
4675 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004676}
4677
4678/* handle an error in PyUnicode_EncodeCharmap
4679 Return 0 on success, -1 on error */
4680static
4681int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004682 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004683 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004684 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004685 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004686{
4687 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004688 Py_ssize_t repsize;
4689 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004690 Py_UNICODE *uni2;
4691 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004692 Py_ssize_t collstartpos = *inpos;
4693 Py_ssize_t collendpos = *inpos+1;
4694 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004695 char *encoding = "charmap";
4696 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004697 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004698
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004699 /* find all unencodable characters */
4700 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004701 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00004702 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004703 int res = encoding_map_lookup(p[collendpos], mapping);
4704 if (res != -1)
4705 break;
4706 ++collendpos;
4707 continue;
4708 }
4709
4710 rep = charmapencode_lookup(p[collendpos], mapping);
4711 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004712 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004713 else if (rep!=Py_None) {
4714 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004715 break;
4716 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004717 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004718 ++collendpos;
4719 }
4720 /* cache callback name lookup
4721 * (if not done yet, i.e. it's the first error) */
4722 if (*known_errorHandler==-1) {
4723 if ((errors==NULL) || (!strcmp(errors, "strict")))
4724 *known_errorHandler = 1;
4725 else if (!strcmp(errors, "replace"))
4726 *known_errorHandler = 2;
4727 else if (!strcmp(errors, "ignore"))
4728 *known_errorHandler = 3;
4729 else if (!strcmp(errors, "xmlcharrefreplace"))
4730 *known_errorHandler = 4;
4731 else
4732 *known_errorHandler = 0;
4733 }
4734 switch (*known_errorHandler) {
4735 case 1: /* strict */
4736 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4737 return -1;
4738 case 2: /* replace */
4739 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4740 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004741 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004742 return -1;
4743 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004744 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004745 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4746 return -1;
4747 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004748 }
4749 /* fall through */
4750 case 3: /* ignore */
4751 *inpos = collendpos;
4752 break;
4753 case 4: /* xmlcharrefreplace */
4754 /* generate replacement (temporarily (mis)uses p) */
4755 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4756 char buffer[2+29+1+1];
4757 char *cp;
4758 sprintf(buffer, "&#%d;", (int)p[collpos]);
4759 for (cp = buffer; *cp; ++cp) {
4760 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004761 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004762 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004763 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004764 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4765 return -1;
4766 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004767 }
4768 }
4769 *inpos = collendpos;
4770 break;
4771 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004772 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004773 encoding, reason, p, size, exceptionObject,
4774 collstartpos, collendpos, &newpos);
4775 if (repunicode == NULL)
4776 return -1;
4777 /* generate replacement */
4778 repsize = PyUnicode_GET_SIZE(repunicode);
4779 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4780 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004781 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004782 return -1;
4783 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004784 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004785 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004786 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4787 return -1;
4788 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004789 }
4790 *inpos = newpos;
4791 Py_DECREF(repunicode);
4792 }
4793 return 0;
4794}
4795
Guido van Rossumd57fd912000-03-10 22:53:23 +00004796PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004797 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004798 PyObject *mapping,
4799 const char *errors)
4800{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004801 /* output object */
4802 PyObject *res = NULL;
4803 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004804 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004805 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004806 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004807 PyObject *errorHandler = NULL;
4808 PyObject *exc = NULL;
4809 /* the following variable is used for caching string comparisons
4810 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4811 * 3=ignore, 4=xmlcharrefreplace */
4812 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004813
4814 /* Default to Latin-1 */
4815 if (mapping == NULL)
4816 return PyUnicode_EncodeLatin1(p, size, errors);
4817
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004818 /* allocate enough for a simple encoding without
4819 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00004820 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004821 if (res == NULL)
4822 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004823 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004824 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004825
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004826 while (inpos<size) {
4827 /* try to encode it */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004828 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004829 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004830 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004831 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004832 if (charmap_encoding_error(p, size, &inpos, mapping,
4833 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004834 &known_errorHandler, &errorHandler, errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004835 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004836 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004837 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004838 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004839 else
4840 /* done with this character => adjust input position */
4841 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004842 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004843
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004844 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00004845 if (respos<PyBytes_GET_SIZE(res))
4846 _PyBytes_Resize(&res, respos);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004847
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004848 Py_XDECREF(exc);
4849 Py_XDECREF(errorHandler);
4850 return res;
4851
4852 onError:
4853 Py_XDECREF(res);
4854 Py_XDECREF(exc);
4855 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004856 return NULL;
4857}
4858
4859PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4860 PyObject *mapping)
4861{
4862 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4863 PyErr_BadArgument();
4864 return NULL;
4865 }
4866 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4867 PyUnicode_GET_SIZE(unicode),
4868 mapping,
4869 NULL);
4870}
4871
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004872/* create or adjust a UnicodeTranslateError */
4873static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004874 const Py_UNICODE *unicode, Py_ssize_t size,
4875 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004876 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004877{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004878 if (*exceptionObject == NULL) {
4879 *exceptionObject = PyUnicodeTranslateError_Create(
4880 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004881 }
4882 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004883 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4884 goto onError;
4885 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4886 goto onError;
4887 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4888 goto onError;
4889 return;
4890 onError:
4891 Py_DECREF(*exceptionObject);
4892 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004893 }
4894}
4895
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004896/* raises a UnicodeTranslateError */
4897static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004898 const Py_UNICODE *unicode, Py_ssize_t size,
4899 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004900 const char *reason)
4901{
4902 make_translate_exception(exceptionObject,
4903 unicode, size, startpos, endpos, reason);
4904 if (*exceptionObject != NULL)
4905 PyCodec_StrictErrors(*exceptionObject);
4906}
4907
4908/* error handling callback helper:
4909 build arguments, call the callback and check the arguments,
4910 put the result into newpos and return the replacement string, which
4911 has to be freed by the caller */
4912static PyObject *unicode_translate_call_errorhandler(const char *errors,
4913 PyObject **errorHandler,
4914 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004915 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4916 Py_ssize_t startpos, Py_ssize_t endpos,
4917 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004918{
Benjamin Peterson142957c2008-07-04 19:55:29 +00004919 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004920
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004921 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004922 PyObject *restuple;
4923 PyObject *resunicode;
4924
4925 if (*errorHandler == NULL) {
4926 *errorHandler = PyCodec_LookupError(errors);
4927 if (*errorHandler == NULL)
4928 return NULL;
4929 }
4930
4931 make_translate_exception(exceptionObject,
4932 unicode, size, startpos, endpos, reason);
4933 if (*exceptionObject == NULL)
4934 return NULL;
4935
4936 restuple = PyObject_CallFunctionObjArgs(
4937 *errorHandler, *exceptionObject, NULL);
4938 if (restuple == NULL)
4939 return NULL;
4940 if (!PyTuple_Check(restuple)) {
4941 PyErr_Format(PyExc_TypeError, &argparse[4]);
4942 Py_DECREF(restuple);
4943 return NULL;
4944 }
4945 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004946 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004947 Py_DECREF(restuple);
4948 return NULL;
4949 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004950 if (i_newpos<0)
4951 *newpos = size+i_newpos;
4952 else
4953 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004954 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004955 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004956 Py_DECREF(restuple);
4957 return NULL;
4958 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004959 Py_INCREF(resunicode);
4960 Py_DECREF(restuple);
4961 return resunicode;
4962}
4963
4964/* Lookup the character ch in the mapping and put the result in result,
4965 which must be decrefed by the caller.
4966 Return 0 on success, -1 on error */
4967static
4968int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4969{
Christian Heimes217cfd12007-12-02 14:31:20 +00004970 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004971 PyObject *x;
4972
4973 if (w == NULL)
4974 return -1;
4975 x = PyObject_GetItem(mapping, w);
4976 Py_DECREF(w);
4977 if (x == NULL) {
4978 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4979 /* No mapping found means: use 1:1 mapping. */
4980 PyErr_Clear();
4981 *result = NULL;
4982 return 0;
4983 } else
4984 return -1;
4985 }
4986 else if (x == Py_None) {
4987 *result = x;
4988 return 0;
4989 }
Christian Heimes217cfd12007-12-02 14:31:20 +00004990 else if (PyLong_Check(x)) {
4991 long value = PyLong_AS_LONG(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004992 long max = PyUnicode_GetMax();
4993 if (value < 0 || value > max) {
4994 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00004995 "character mapping must be in range(0x%x)", max+1);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004996 Py_DECREF(x);
4997 return -1;
4998 }
4999 *result = x;
5000 return 0;
5001 }
5002 else if (PyUnicode_Check(x)) {
5003 *result = x;
5004 return 0;
5005 }
5006 else {
5007 /* wrong return value */
5008 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00005009 "character mapping must return integer, None or str");
Walter Dörwald150523e2003-08-15 16:52:19 +00005010 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005011 return -1;
5012 }
5013}
5014/* ensure that *outobj is at least requiredsize characters long,
5015if not reallocate and adjust various state variables.
5016Return 0 on success, -1 on error */
5017static
Walter Dörwald4894c302003-10-24 14:25:28 +00005018int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005019 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005020{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005021 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00005022 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005023 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005024 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005025 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00005026 if (requiredsize < 2 * oldsize)
5027 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00005028 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005029 return -1;
5030 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005031 }
5032 return 0;
5033}
5034/* lookup the character, put the result in the output string and adjust
5035 various state variables. Return a new reference to the object that
5036 was put in the output buffer in *result, or Py_None, if the mapping was
5037 undefined (in which case no character was written).
5038 The called must decref result.
5039 Return 0 on success, -1 on error. */
5040static
Walter Dörwald4894c302003-10-24 14:25:28 +00005041int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005042 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00005043 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005044{
Walter Dörwald4894c302003-10-24 14:25:28 +00005045 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005046 return -1;
5047 if (*res==NULL) {
5048 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00005049 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005050 }
5051 else if (*res==Py_None)
5052 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00005053 else if (PyLong_Check(*res)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005054 /* no overflow check, because we know that the space is enough */
Christian Heimes217cfd12007-12-02 14:31:20 +00005055 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005056 }
5057 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005058 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005059 if (repsize==1) {
5060 /* no overflow check, because we know that the space is enough */
5061 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5062 }
5063 else if (repsize!=0) {
5064 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005065 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00005066 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00005067 repsize - 1;
5068 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005069 return -1;
5070 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5071 *outp += repsize;
5072 }
5073 }
5074 else
5075 return -1;
5076 return 0;
5077}
5078
5079PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005080 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005081 PyObject *mapping,
5082 const char *errors)
5083{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005084 /* output object */
5085 PyObject *res = NULL;
5086 /* pointers to the beginning and end+1 of input */
5087 const Py_UNICODE *startp = p;
5088 const Py_UNICODE *endp = p + size;
5089 /* pointer into the output */
5090 Py_UNICODE *str;
5091 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005092 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005093 char *reason = "character maps to <undefined>";
5094 PyObject *errorHandler = NULL;
5095 PyObject *exc = NULL;
5096 /* the following variable is used for caching string comparisons
5097 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5098 * 3=ignore, 4=xmlcharrefreplace */
5099 int known_errorHandler = -1;
5100
Guido van Rossumd57fd912000-03-10 22:53:23 +00005101 if (mapping == NULL) {
5102 PyErr_BadArgument();
5103 return NULL;
5104 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005105
5106 /* allocate enough for a simple 1:1 translation without
5107 replacements, if we need more, we'll resize */
5108 res = PyUnicode_FromUnicode(NULL, size);
5109 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00005110 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005111 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005112 return res;
5113 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005114
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005115 while (p<endp) {
5116 /* try to encode it */
5117 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00005118 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005119 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005120 goto onError;
5121 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005122 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005123 if (x!=Py_None) /* it worked => adjust input pointer */
5124 ++p;
5125 else { /* untranslatable character */
5126 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005127 Py_ssize_t repsize;
5128 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005129 Py_UNICODE *uni2;
5130 /* startpos for collecting untranslatable chars */
5131 const Py_UNICODE *collstart = p;
5132 const Py_UNICODE *collend = p+1;
5133 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005134
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005135 /* find all untranslatable characters */
5136 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00005137 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005138 goto onError;
5139 Py_XDECREF(x);
5140 if (x!=Py_None)
5141 break;
5142 ++collend;
5143 }
5144 /* cache callback name lookup
5145 * (if not done yet, i.e. it's the first error) */
5146 if (known_errorHandler==-1) {
5147 if ((errors==NULL) || (!strcmp(errors, "strict")))
5148 known_errorHandler = 1;
5149 else if (!strcmp(errors, "replace"))
5150 known_errorHandler = 2;
5151 else if (!strcmp(errors, "ignore"))
5152 known_errorHandler = 3;
5153 else if (!strcmp(errors, "xmlcharrefreplace"))
5154 known_errorHandler = 4;
5155 else
5156 known_errorHandler = 0;
5157 }
5158 switch (known_errorHandler) {
5159 case 1: /* strict */
5160 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
5161 goto onError;
5162 case 2: /* replace */
5163 /* No need to check for space, this is a 1:1 replacement */
5164 for (coll = collstart; coll<collend; ++coll)
5165 *str++ = '?';
5166 /* fall through */
5167 case 3: /* ignore */
5168 p = collend;
5169 break;
5170 case 4: /* xmlcharrefreplace */
5171 /* generate replacement (temporarily (mis)uses p) */
5172 for (p = collstart; p < collend; ++p) {
5173 char buffer[2+29+1+1];
5174 char *cp;
5175 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00005176 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005177 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5178 goto onError;
5179 for (cp = buffer; *cp; ++cp)
5180 *str++ = *cp;
5181 }
5182 p = collend;
5183 break;
5184 default:
5185 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5186 reason, startp, size, &exc,
5187 collstart-startp, collend-startp, &newpos);
5188 if (repunicode == NULL)
5189 goto onError;
5190 /* generate replacement */
5191 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00005192 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005193 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5194 Py_DECREF(repunicode);
5195 goto onError;
5196 }
5197 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5198 *str++ = *uni2;
5199 p = startp + newpos;
5200 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005201 }
5202 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005203 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005204 /* Resize if we allocated to much */
5205 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005206 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00005207 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005208 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005209 }
5210 Py_XDECREF(exc);
5211 Py_XDECREF(errorHandler);
5212 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005213
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005214 onError:
5215 Py_XDECREF(res);
5216 Py_XDECREF(exc);
5217 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005218 return NULL;
5219}
5220
5221PyObject *PyUnicode_Translate(PyObject *str,
5222 PyObject *mapping,
5223 const char *errors)
5224{
5225 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005226
Guido van Rossumd57fd912000-03-10 22:53:23 +00005227 str = PyUnicode_FromObject(str);
5228 if (str == NULL)
5229 goto onError;
5230 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
5231 PyUnicode_GET_SIZE(str),
5232 mapping,
5233 errors);
5234 Py_DECREF(str);
5235 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005236
Guido van Rossumd57fd912000-03-10 22:53:23 +00005237 onError:
5238 Py_XDECREF(str);
5239 return NULL;
5240}
Tim Petersced69f82003-09-16 20:30:58 +00005241
Guido van Rossum9e896b32000-04-05 20:11:21 +00005242/* --- Decimal Encoder ---------------------------------------------------- */
5243
5244int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005245 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00005246 char *output,
5247 const char *errors)
5248{
5249 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005250 PyObject *errorHandler = NULL;
5251 PyObject *exc = NULL;
5252 const char *encoding = "decimal";
5253 const char *reason = "invalid decimal Unicode string";
5254 /* the following variable is used for caching string comparisons
5255 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5256 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005257
5258 if (output == NULL) {
5259 PyErr_BadArgument();
5260 return -1;
5261 }
5262
5263 p = s;
5264 end = s + length;
5265 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005266 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005267 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005268 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005269 Py_ssize_t repsize;
5270 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005271 Py_UNICODE *uni2;
5272 Py_UNICODE *collstart;
5273 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005274
Guido van Rossum9e896b32000-04-05 20:11:21 +00005275 if (Py_UNICODE_ISSPACE(ch)) {
5276 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005277 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005278 continue;
5279 }
5280 decimal = Py_UNICODE_TODECIMAL(ch);
5281 if (decimal >= 0) {
5282 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005283 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005284 continue;
5285 }
Guido van Rossumba477042000-04-06 18:18:10 +00005286 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00005287 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005288 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005289 continue;
5290 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005291 /* All other characters are considered unencodable */
5292 collstart = p;
5293 collend = p+1;
5294 while (collend < end) {
5295 if ((0 < *collend && *collend < 256) ||
5296 !Py_UNICODE_ISSPACE(*collend) ||
5297 Py_UNICODE_TODECIMAL(*collend))
5298 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005299 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005300 /* cache callback name lookup
5301 * (if not done yet, i.e. it's the first error) */
5302 if (known_errorHandler==-1) {
5303 if ((errors==NULL) || (!strcmp(errors, "strict")))
5304 known_errorHandler = 1;
5305 else if (!strcmp(errors, "replace"))
5306 known_errorHandler = 2;
5307 else if (!strcmp(errors, "ignore"))
5308 known_errorHandler = 3;
5309 else if (!strcmp(errors, "xmlcharrefreplace"))
5310 known_errorHandler = 4;
5311 else
5312 known_errorHandler = 0;
5313 }
5314 switch (known_errorHandler) {
5315 case 1: /* strict */
5316 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5317 goto onError;
5318 case 2: /* replace */
5319 for (p = collstart; p < collend; ++p)
5320 *output++ = '?';
5321 /* fall through */
5322 case 3: /* ignore */
5323 p = collend;
5324 break;
5325 case 4: /* xmlcharrefreplace */
5326 /* generate replacement (temporarily (mis)uses p) */
5327 for (p = collstart; p < collend; ++p)
5328 output += sprintf(output, "&#%d;", (int)*p);
5329 p = collend;
5330 break;
5331 default:
5332 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5333 encoding, reason, s, length, &exc,
5334 collstart-s, collend-s, &newpos);
5335 if (repunicode == NULL)
5336 goto onError;
5337 /* generate replacement */
5338 repsize = PyUnicode_GET_SIZE(repunicode);
5339 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5340 Py_UNICODE ch = *uni2;
5341 if (Py_UNICODE_ISSPACE(ch))
5342 *output++ = ' ';
5343 else {
5344 decimal = Py_UNICODE_TODECIMAL(ch);
5345 if (decimal >= 0)
5346 *output++ = '0' + decimal;
5347 else if (0 < ch && ch < 256)
5348 *output++ = (char)ch;
5349 else {
5350 Py_DECREF(repunicode);
5351 raise_encode_exception(&exc, encoding,
5352 s, length, collstart-s, collend-s, reason);
5353 goto onError;
5354 }
5355 }
5356 }
5357 p = s + newpos;
5358 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005359 }
5360 }
5361 /* 0-terminate the output string */
5362 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005363 Py_XDECREF(exc);
5364 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005365 return 0;
5366
5367 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005368 Py_XDECREF(exc);
5369 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005370 return -1;
5371}
5372
Guido van Rossumd57fd912000-03-10 22:53:23 +00005373/* --- Helpers ------------------------------------------------------------ */
5374
Eric Smith8c663262007-08-25 02:26:07 +00005375#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005376#include "stringlib/fastsearch.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005377#include "stringlib/count.h"
Christian Heimes9cd17752007-11-18 19:35:23 +00005378/* Include _ParseTupleFinds from find.h */
5379#define FROM_UNICODE
Thomas Wouters477c8d52006-05-27 19:21:47 +00005380#include "stringlib/find.h"
5381#include "stringlib/partition.h"
5382
Eric Smith5807c412008-05-11 21:00:57 +00005383#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
5384#include "stringlib/localeutil.h"
5385
/* Helper macro: normalise the local slice bounds `start` and `end`
   against (obj)->length.  A negative bound is taken relative to the end
   of the object and then clamped at 0; `end` is additionally capped at
   the object's length.  `start` is NOT capped at the length.  Expects
   variables named `start` and `end` to be in scope. */
#define FIX_START_END(obj)                      \
    if (start < 0) {                            \
        start += (obj)->length;                 \
        if (start < 0)                          \
            start = 0;                          \
    }                                           \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    if (end < 0) {                              \
        end += (obj)->length;                   \
        if (end < 0)                            \
            end = 0;                            \
    }
5398
Martin v. Löwis18e16552006-02-15 17:27:45 +00005399Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005400 PyObject *substr,
5401 Py_ssize_t start,
5402 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005403{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005404 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005405 PyUnicodeObject* str_obj;
5406 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005407
Thomas Wouters477c8d52006-05-27 19:21:47 +00005408 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5409 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005410 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005411 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5412 if (!sub_obj) {
5413 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005414 return -1;
5415 }
Tim Petersced69f82003-09-16 20:30:58 +00005416
Thomas Wouters477c8d52006-05-27 19:21:47 +00005417 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005418
Thomas Wouters477c8d52006-05-27 19:21:47 +00005419 result = stringlib_count(
5420 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5421 );
5422
5423 Py_DECREF(sub_obj);
5424 Py_DECREF(str_obj);
5425
Guido van Rossumd57fd912000-03-10 22:53:23 +00005426 return result;
5427}
5428
Martin v. Löwis18e16552006-02-15 17:27:45 +00005429Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005430 PyObject *sub,
5431 Py_ssize_t start,
5432 Py_ssize_t end,
5433 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005434{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005435 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005436
Guido van Rossumd57fd912000-03-10 22:53:23 +00005437 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005438 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005439 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005440 sub = PyUnicode_FromObject(sub);
5441 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00005442 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005443 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005444 }
Tim Petersced69f82003-09-16 20:30:58 +00005445
Thomas Wouters477c8d52006-05-27 19:21:47 +00005446 if (direction > 0)
5447 result = stringlib_find_slice(
5448 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5449 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5450 start, end
5451 );
5452 else
5453 result = stringlib_rfind_slice(
5454 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5455 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5456 start, end
5457 );
5458
Guido van Rossumd57fd912000-03-10 22:53:23 +00005459 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005460 Py_DECREF(sub);
5461
Guido van Rossumd57fd912000-03-10 22:53:23 +00005462 return result;
5463}
5464
Tim Petersced69f82003-09-16 20:30:58 +00005465static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005466int tailmatch(PyUnicodeObject *self,
5467 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005468 Py_ssize_t start,
5469 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005470 int direction)
5471{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005472 if (substring->length == 0)
5473 return 1;
5474
Thomas Wouters477c8d52006-05-27 19:21:47 +00005475 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005476
5477 end -= substring->length;
5478 if (end < start)
5479 return 0;
5480
5481 if (direction > 0) {
5482 if (Py_UNICODE_MATCH(self, end, substring))
5483 return 1;
5484 } else {
5485 if (Py_UNICODE_MATCH(self, start, substring))
5486 return 1;
5487 }
5488
5489 return 0;
5490}
5491
Martin v. Löwis18e16552006-02-15 17:27:45 +00005492Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005493 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005494 Py_ssize_t start,
5495 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005496 int direction)
5497{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005498 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005499
Guido van Rossumd57fd912000-03-10 22:53:23 +00005500 str = PyUnicode_FromObject(str);
5501 if (str == NULL)
5502 return -1;
5503 substr = PyUnicode_FromObject(substr);
5504 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005505 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005506 return -1;
5507 }
Tim Petersced69f82003-09-16 20:30:58 +00005508
Guido van Rossumd57fd912000-03-10 22:53:23 +00005509 result = tailmatch((PyUnicodeObject *)str,
5510 (PyUnicodeObject *)substr,
5511 start, end, direction);
5512 Py_DECREF(str);
5513 Py_DECREF(substr);
5514 return result;
5515}
5516
Guido van Rossumd57fd912000-03-10 22:53:23 +00005517/* Apply fixfct filter to the Unicode object self and return a
5518 reference to the modified object */
5519
Tim Petersced69f82003-09-16 20:30:58 +00005520static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005521PyObject *fixup(PyUnicodeObject *self,
5522 int (*fixfct)(PyUnicodeObject *s))
5523{
5524
5525 PyUnicodeObject *u;
5526
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005527 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005528 if (u == NULL)
5529 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005530
5531 Py_UNICODE_COPY(u->str, self->str, self->length);
5532
Tim Peters7a29bd52001-09-12 03:03:31 +00005533 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005534 /* fixfct should return TRUE if it modified the buffer. If
5535 FALSE, return a reference to the original buffer instead
5536 (to save space, not time) */
5537 Py_INCREF(self);
5538 Py_DECREF(u);
5539 return (PyObject*) self;
5540 }
5541 return (PyObject*) u;
5542}
5543
Tim Petersced69f82003-09-16 20:30:58 +00005544static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005545int fixupper(PyUnicodeObject *self)
5546{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005547 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005548 Py_UNICODE *s = self->str;
5549 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005550
Guido van Rossumd57fd912000-03-10 22:53:23 +00005551 while (len-- > 0) {
5552 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005553
Guido van Rossumd57fd912000-03-10 22:53:23 +00005554 ch = Py_UNICODE_TOUPPER(*s);
5555 if (ch != *s) {
5556 status = 1;
5557 *s = ch;
5558 }
5559 s++;
5560 }
5561
5562 return status;
5563}
5564
Tim Petersced69f82003-09-16 20:30:58 +00005565static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005566int fixlower(PyUnicodeObject *self)
5567{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005568 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005569 Py_UNICODE *s = self->str;
5570 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005571
Guido van Rossumd57fd912000-03-10 22:53:23 +00005572 while (len-- > 0) {
5573 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005574
Guido van Rossumd57fd912000-03-10 22:53:23 +00005575 ch = Py_UNICODE_TOLOWER(*s);
5576 if (ch != *s) {
5577 status = 1;
5578 *s = ch;
5579 }
5580 s++;
5581 }
5582
5583 return status;
5584}
5585
Tim Petersced69f82003-09-16 20:30:58 +00005586static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005587int fixswapcase(PyUnicodeObject *self)
5588{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005589 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005590 Py_UNICODE *s = self->str;
5591 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005592
Guido van Rossumd57fd912000-03-10 22:53:23 +00005593 while (len-- > 0) {
5594 if (Py_UNICODE_ISUPPER(*s)) {
5595 *s = Py_UNICODE_TOLOWER(*s);
5596 status = 1;
5597 } else if (Py_UNICODE_ISLOWER(*s)) {
5598 *s = Py_UNICODE_TOUPPER(*s);
5599 status = 1;
5600 }
5601 s++;
5602 }
5603
5604 return status;
5605}
5606
Tim Petersced69f82003-09-16 20:30:58 +00005607static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005608int fixcapitalize(PyUnicodeObject *self)
5609{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005610 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005611 Py_UNICODE *s = self->str;
5612 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005613
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005614 if (len == 0)
5615 return 0;
5616 if (Py_UNICODE_ISLOWER(*s)) {
5617 *s = Py_UNICODE_TOUPPER(*s);
5618 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005619 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005620 s++;
5621 while (--len > 0) {
5622 if (Py_UNICODE_ISUPPER(*s)) {
5623 *s = Py_UNICODE_TOLOWER(*s);
5624 status = 1;
5625 }
5626 s++;
5627 }
5628 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005629}
5630
5631static
5632int fixtitle(PyUnicodeObject *self)
5633{
5634 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5635 register Py_UNICODE *e;
5636 int previous_is_cased;
5637
5638 /* Shortcut for single character strings */
5639 if (PyUnicode_GET_SIZE(self) == 1) {
5640 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5641 if (*p != ch) {
5642 *p = ch;
5643 return 1;
5644 }
5645 else
5646 return 0;
5647 }
Tim Petersced69f82003-09-16 20:30:58 +00005648
Guido van Rossumd57fd912000-03-10 22:53:23 +00005649 e = p + PyUnicode_GET_SIZE(self);
5650 previous_is_cased = 0;
5651 for (; p < e; p++) {
5652 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005653
Guido van Rossumd57fd912000-03-10 22:53:23 +00005654 if (previous_is_cased)
5655 *p = Py_UNICODE_TOLOWER(ch);
5656 else
5657 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005658
5659 if (Py_UNICODE_ISLOWER(ch) ||
5660 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005661 Py_UNICODE_ISTITLE(ch))
5662 previous_is_cased = 1;
5663 else
5664 previous_is_cased = 0;
5665 }
5666 return 1;
5667}
5668
Tim Peters8ce9f162004-08-27 01:49:32 +00005669PyObject *
5670PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005671{
Skip Montanaro6543b452004-09-16 03:28:13 +00005672 const Py_UNICODE blank = ' ';
5673 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005674 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005675 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00005676 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5677 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005678 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
5679 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00005680 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005681 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005682
Tim Peters05eba1f2004-08-27 21:32:02 +00005683 fseq = PySequence_Fast(seq, "");
5684 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005685 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005686 }
5687
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005688 /* NOTE: the following code can't call back into Python code,
5689 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00005690 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005691
Tim Peters05eba1f2004-08-27 21:32:02 +00005692 seqlen = PySequence_Fast_GET_SIZE(fseq);
5693 /* If empty sequence, return u"". */
5694 if (seqlen == 0) {
5695 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5696 goto Done;
5697 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005698 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005699 /* If singleton sequence with an exact Unicode, return that. */
5700 if (seqlen == 1) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005701 item = items[0];
Tim Peters05eba1f2004-08-27 21:32:02 +00005702 if (PyUnicode_CheckExact(item)) {
5703 Py_INCREF(item);
5704 res = (PyUnicodeObject *)item;
5705 goto Done;
5706 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005707 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005708 else {
5709 /* Set up sep and seplen */
5710 if (separator == NULL) {
5711 sep = &blank;
5712 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005713 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005714 else {
5715 if (!PyUnicode_Check(separator)) {
5716 PyErr_Format(PyExc_TypeError,
5717 "separator: expected str instance,"
5718 " %.80s found",
5719 Py_TYPE(separator)->tp_name);
5720 goto onError;
5721 }
5722 sep = PyUnicode_AS_UNICODE(separator);
5723 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005724 }
5725 }
5726
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005727 /* There are at least two things to join, or else we have a subclass
5728 * of str in the sequence.
5729 * Do a pre-pass to figure out the total amount of space we'll
5730 * need (sz), and see whether all argument are strings.
5731 */
5732 sz = 0;
5733 for (i = 0; i < seqlen; i++) {
5734 const Py_ssize_t old_sz = sz;
5735 item = items[i];
Guido van Rossum98297ee2007-11-06 21:34:58 +00005736 if (!PyUnicode_Check(item)) {
5737 PyErr_Format(PyExc_TypeError,
5738 "sequence item %zd: expected str instance,"
5739 " %.80s found",
Christian Heimes90aa7642007-12-19 02:45:37 +00005740 i, Py_TYPE(item)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005741 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005742 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005743 sz += PyUnicode_GET_SIZE(item);
5744 if (i != 0)
5745 sz += seplen;
5746 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
5747 PyErr_SetString(PyExc_OverflowError,
5748 "join() result is too long for a Python string");
5749 goto onError;
5750 }
5751 }
Tim Petersced69f82003-09-16 20:30:58 +00005752
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005753 res = _PyUnicode_New(sz);
5754 if (res == NULL)
5755 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00005756
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005757 /* Catenate everything. */
5758 res_p = PyUnicode_AS_UNICODE(res);
5759 for (i = 0; i < seqlen; ++i) {
5760 Py_ssize_t itemlen;
5761 item = items[i];
5762 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005763 /* Copy item, and maybe the separator. */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005764 if (i) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005765 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005766 res_p += seplen;
5767 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005768 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5769 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00005770 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005771
Tim Peters8ce9f162004-08-27 01:49:32 +00005772 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00005773 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005774 return (PyObject *)res;
5775
5776 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00005777 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005778 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005779 return NULL;
5780}
5781
Tim Petersced69f82003-09-16 20:30:58 +00005782static
5783PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005784 Py_ssize_t left,
5785 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005786 Py_UNICODE fill)
5787{
5788 PyUnicodeObject *u;
5789
5790 if (left < 0)
5791 left = 0;
5792 if (right < 0)
5793 right = 0;
5794
Tim Peters7a29bd52001-09-12 03:03:31 +00005795 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005796 Py_INCREF(self);
5797 return self;
5798 }
5799
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005800 if (left > PY_SSIZE_T_MAX - self->length ||
5801 right > PY_SSIZE_T_MAX - (left + self->length)) {
5802 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5803 return NULL;
5804 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005805 u = _PyUnicode_New(left + self->length + right);
5806 if (u) {
5807 if (left)
5808 Py_UNICODE_FILL(u->str, fill, left);
5809 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5810 if (right)
5811 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5812 }
5813
5814 return u;
5815}
5816
/* Build a string from data[left..right) and append it to `list`.
   Expects a `PyObject *str` scratch variable, a `list` variable and an
   `onError` label in the enclosing function; jumps to onError if either
   the string creation or the append fails.  The temporary reference
   held in `str` is always released. */
#define SPLIT_APPEND(data, left, right)                                 \
    str = PyUnicode_FromUnicode((data) + (left), (right) - (left));     \
    if (str == NULL)                                                    \
        goto onError;                                                   \
    if (PyList_Append(list, str) != 0) {                                \
        Py_DECREF(str);                                                 \
        goto onError;                                                   \
    }                                                                   \
    Py_DECREF(str);
5827
5828static
5829PyObject *split_whitespace(PyUnicodeObject *self,
5830 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005831 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005832{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005833 register Py_ssize_t i;
5834 register Py_ssize_t j;
5835 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005836 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005837 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005838
5839 for (i = j = 0; i < len; ) {
5840 /* find a token */
Christian Heimes190d79e2008-01-30 11:58:22 +00005841 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005842 i++;
5843 j = i;
Christian Heimes190d79e2008-01-30 11:58:22 +00005844 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005845 i++;
5846 if (j < i) {
5847 if (maxcount-- <= 0)
5848 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005849 SPLIT_APPEND(buf, j, i);
5850 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005851 i++;
5852 j = i;
5853 }
5854 }
5855 if (j < len) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005856 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005857 }
5858 return list;
5859
5860 onError:
5861 Py_DECREF(list);
5862 return NULL;
5863}
5864
5865PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005866 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005867{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005868 register Py_ssize_t i;
5869 register Py_ssize_t j;
5870 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005871 PyObject *list;
5872 PyObject *str;
5873 Py_UNICODE *data;
5874
5875 string = PyUnicode_FromObject(string);
5876 if (string == NULL)
5877 return NULL;
5878 data = PyUnicode_AS_UNICODE(string);
5879 len = PyUnicode_GET_SIZE(string);
5880
Guido van Rossumd57fd912000-03-10 22:53:23 +00005881 list = PyList_New(0);
5882 if (!list)
5883 goto onError;
5884
5885 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005886 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005887
Guido van Rossumd57fd912000-03-10 22:53:23 +00005888 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005889 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005890 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005891
5892 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005893 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005894 if (i < len) {
5895 if (data[i] == '\r' && i + 1 < len &&
5896 data[i+1] == '\n')
5897 i += 2;
5898 else
5899 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005900 if (keepends)
5901 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005902 }
Guido van Rossum86662912000-04-11 15:38:46 +00005903 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005904 j = i;
5905 }
5906 if (j < len) {
5907 SPLIT_APPEND(data, j, len);
5908 }
5909
5910 Py_DECREF(string);
5911 return list;
5912
5913 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005914 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005915 Py_DECREF(string);
5916 return NULL;
5917}
5918
Tim Petersced69f82003-09-16 20:30:58 +00005919static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005920PyObject *split_char(PyUnicodeObject *self,
5921 PyObject *list,
5922 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005923 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005924{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005925 register Py_ssize_t i;
5926 register Py_ssize_t j;
5927 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005928 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005929 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005930
5931 for (i = j = 0; i < len; ) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005932 if (buf[i] == ch) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005933 if (maxcount-- <= 0)
5934 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00005935 SPLIT_APPEND(buf, j, i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005936 i = j = i + 1;
5937 } else
5938 i++;
5939 }
5940 if (j <= len) {
Christian Heimes190d79e2008-01-30 11:58:22 +00005941 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005942 }
5943 return list;
5944
5945 onError:
5946 Py_DECREF(list);
5947 return NULL;
5948}
5949
Tim Petersced69f82003-09-16 20:30:58 +00005950static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005951PyObject *split_substring(PyUnicodeObject *self,
5952 PyObject *list,
5953 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005954 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005955{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005956 register Py_ssize_t i;
5957 register Py_ssize_t j;
5958 Py_ssize_t len = self->length;
5959 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005960 PyObject *str;
5961
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005962 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005963 if (Py_UNICODE_MATCH(self, i, substring)) {
5964 if (maxcount-- <= 0)
5965 break;
5966 SPLIT_APPEND(self->str, j, i);
5967 i = j = i + sublen;
5968 } else
5969 i++;
5970 }
5971 if (j <= len) {
5972 SPLIT_APPEND(self->str, j, len);
5973 }
5974 return list;
5975
5976 onError:
5977 Py_DECREF(list);
5978 return NULL;
5979}
5980
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005981static
5982PyObject *rsplit_whitespace(PyUnicodeObject *self,
5983 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005984 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005985{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005986 register Py_ssize_t i;
5987 register Py_ssize_t j;
5988 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005989 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00005990 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005991
5992 for (i = j = len - 1; i >= 0; ) {
5993 /* find a token */
Christian Heimes190d79e2008-01-30 11:58:22 +00005994 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005995 i--;
5996 j = i;
Christian Heimes190d79e2008-01-30 11:58:22 +00005997 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005998 i--;
5999 if (j > i) {
6000 if (maxcount-- <= 0)
6001 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00006002 SPLIT_APPEND(buf, i + 1, j + 1);
6003 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006004 i--;
6005 j = i;
6006 }
6007 }
6008 if (j >= 0) {
Christian Heimes190d79e2008-01-30 11:58:22 +00006009 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006010 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006011 if (PyList_Reverse(list) < 0)
6012 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006013 return list;
6014
6015 onError:
6016 Py_DECREF(list);
6017 return NULL;
6018}
6019
6020static
6021PyObject *rsplit_char(PyUnicodeObject *self,
6022 PyObject *list,
6023 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006024 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006025{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006026 register Py_ssize_t i;
6027 register Py_ssize_t j;
6028 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006029 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006030 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006031
6032 for (i = j = len - 1; i >= 0; ) {
Christian Heimes190d79e2008-01-30 11:58:22 +00006033 if (buf[i] == ch) {
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006034 if (maxcount-- <= 0)
6035 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00006036 SPLIT_APPEND(buf, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006037 j = i = i - 1;
6038 } else
6039 i--;
6040 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00006041 if (j >= -1) {
Christian Heimes190d79e2008-01-30 11:58:22 +00006042 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006043 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006044 if (PyList_Reverse(list) < 0)
6045 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006046 return list;
6047
6048 onError:
6049 Py_DECREF(list);
6050 return NULL;
6051}
6052
6053static
6054PyObject *rsplit_substring(PyUnicodeObject *self,
6055 PyObject *list,
6056 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006057 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006058{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006059 register Py_ssize_t i;
6060 register Py_ssize_t j;
6061 Py_ssize_t len = self->length;
6062 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006063 PyObject *str;
6064
6065 for (i = len - sublen, j = len; i >= 0; ) {
6066 if (Py_UNICODE_MATCH(self, i, substring)) {
6067 if (maxcount-- <= 0)
6068 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006069 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006070 j = i;
6071 i -= sublen;
6072 } else
6073 i--;
6074 }
6075 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006076 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006077 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006078 if (PyList_Reverse(list) < 0)
6079 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006080 return list;
6081
6082 onError:
6083 Py_DECREF(list);
6084 return NULL;
6085}
6086
Guido van Rossumd57fd912000-03-10 22:53:23 +00006087#undef SPLIT_APPEND
6088
6089static
6090PyObject *split(PyUnicodeObject *self,
6091 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006092 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006093{
6094 PyObject *list;
6095
6096 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006097 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006098
6099 list = PyList_New(0);
6100 if (!list)
6101 return NULL;
6102
6103 if (substring == NULL)
6104 return split_whitespace(self,list,maxcount);
6105
6106 else if (substring->length == 1)
6107 return split_char(self,list,substring->str[0],maxcount);
6108
6109 else if (substring->length == 0) {
6110 Py_DECREF(list);
6111 PyErr_SetString(PyExc_ValueError, "empty separator");
6112 return NULL;
6113 }
6114 else
6115 return split_substring(self,list,substring,maxcount);
6116}
6117
Tim Petersced69f82003-09-16 20:30:58 +00006118static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006119PyObject *rsplit(PyUnicodeObject *self,
6120 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006121 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006122{
6123 PyObject *list;
6124
6125 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006126 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006127
6128 list = PyList_New(0);
6129 if (!list)
6130 return NULL;
6131
6132 if (substring == NULL)
6133 return rsplit_whitespace(self,list,maxcount);
6134
6135 else if (substring->length == 1)
6136 return rsplit_char(self,list,substring->str[0],maxcount);
6137
6138 else if (substring->length == 0) {
6139 Py_DECREF(list);
6140 PyErr_SetString(PyExc_ValueError, "empty separator");
6141 return NULL;
6142 }
6143 else
6144 return rsplit_substring(self,list,substring,maxcount);
6145}
6146
6147static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006148PyObject *replace(PyUnicodeObject *self,
6149 PyUnicodeObject *str1,
6150 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006151 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006152{
6153 PyUnicodeObject *u;
6154
6155 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006156 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006157
Thomas Wouters477c8d52006-05-27 19:21:47 +00006158 if (str1->length == str2->length) {
6159 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006160 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006161 if (str1->length == 1) {
6162 /* replace characters */
6163 Py_UNICODE u1, u2;
6164 if (!findchar(self->str, self->length, str1->str[0]))
6165 goto nothing;
6166 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6167 if (!u)
6168 return NULL;
6169 Py_UNICODE_COPY(u->str, self->str, self->length);
6170 u1 = str1->str[0];
6171 u2 = str2->str[0];
6172 for (i = 0; i < u->length; i++)
6173 if (u->str[i] == u1) {
6174 if (--maxcount < 0)
6175 break;
6176 u->str[i] = u2;
6177 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006178 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006179 i = fastsearch(
6180 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00006181 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006182 if (i < 0)
6183 goto nothing;
6184 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6185 if (!u)
6186 return NULL;
6187 Py_UNICODE_COPY(u->str, self->str, self->length);
6188 while (i <= self->length - str1->length)
6189 if (Py_UNICODE_MATCH(self, i, str1)) {
6190 if (--maxcount < 0)
6191 break;
6192 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6193 i += str1->length;
6194 } else
6195 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006196 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006197 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006198
6199 Py_ssize_t n, i, j, e;
6200 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006201 Py_UNICODE *p;
6202
6203 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006204 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006205 if (n > maxcount)
6206 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006207 if (n == 0)
6208 goto nothing;
6209 /* new_size = self->length + n * (str2->length - str1->length)); */
6210 delta = (str2->length - str1->length);
6211 if (delta == 0) {
6212 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006213 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006214 product = n * (str2->length - str1->length);
6215 if ((product / (str2->length - str1->length)) != n) {
6216 PyErr_SetString(PyExc_OverflowError,
6217 "replace string is too long");
6218 return NULL;
6219 }
6220 new_size = self->length + product;
6221 if (new_size < 0) {
6222 PyErr_SetString(PyExc_OverflowError,
6223 "replace string is too long");
6224 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006225 }
6226 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006227 u = _PyUnicode_New(new_size);
6228 if (!u)
6229 return NULL;
6230 i = 0;
6231 p = u->str;
6232 e = self->length - str1->length;
6233 if (str1->length > 0) {
6234 while (n-- > 0) {
6235 /* look for next match */
6236 j = i;
6237 while (j <= e) {
6238 if (Py_UNICODE_MATCH(self, j, str1))
6239 break;
6240 j++;
6241 }
6242 if (j > i) {
6243 if (j > e)
6244 break;
6245 /* copy unchanged part [i:j] */
6246 Py_UNICODE_COPY(p, self->str+i, j-i);
6247 p += j - i;
6248 }
6249 /* copy substitution string */
6250 if (str2->length > 0) {
6251 Py_UNICODE_COPY(p, str2->str, str2->length);
6252 p += str2->length;
6253 }
6254 i = j + str1->length;
6255 }
6256 if (i < self->length)
6257 /* copy tail [i:] */
6258 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6259 } else {
6260 /* interleave */
6261 while (n > 0) {
6262 Py_UNICODE_COPY(p, str2->str, str2->length);
6263 p += str2->length;
6264 if (--n <= 0)
6265 break;
6266 *p++ = self->str[i++];
6267 }
6268 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6269 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006270 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006271 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006272
6273nothing:
6274 /* nothing to replace; return original string (when possible) */
6275 if (PyUnicode_CheckExact(self)) {
6276 Py_INCREF(self);
6277 return (PyObject *) self;
6278 }
6279 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006280}
6281
6282/* --- Unicode Object Methods --------------------------------------------- */
6283
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006284PyDoc_STRVAR(title__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006285"S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006286\n\
6287Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006288characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006289
6290static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006291unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006292{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006293 return fixup(self, fixtitle);
6294}
6295
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006296PyDoc_STRVAR(capitalize__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006297"S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006298\n\
6299Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006300have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006301
6302static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006303unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006304{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006305 return fixup(self, fixcapitalize);
6306}
6307
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the surrounding #if 0).
   Splits self on whitespace, replaces every word by its capitalized
   form and joins the words again.  Returns NULL on failure. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *old = PyList_GET_ITEM(words, idx);
        PyObject *fixed = fixup((PyUnicodeObject *)old, fixcapitalize);
        if (fixed == NULL)
            goto done;
        Py_DECREF(old);
        PyList_SET_ITEM(words, idx, fixed);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return result;
}
#endif
6345
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006346/* Argument converter. Coerces to a single unicode character */
6347
6348static int
6349convert_uc(PyObject *obj, void *addr)
6350{
6351 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6352 PyObject *uniobj;
6353 Py_UNICODE *unistr;
6354
6355 uniobj = PyUnicode_FromObject(obj);
6356 if (uniobj == NULL) {
6357 PyErr_SetString(PyExc_TypeError,
6358 "The fill character cannot be converted to Unicode");
6359 return 0;
6360 }
6361 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6362 PyErr_SetString(PyExc_TypeError,
6363 "The fill character must be exactly one character long");
6364 Py_DECREF(uniobj);
6365 return 0;
6366 }
6367 unistr = PyUnicode_AS_UNICODE(uniobj);
6368 *fillcharloc = unistr[0];
6369 Py_DECREF(uniobj);
6370 return 1;
6371}
6372
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006373PyDoc_STRVAR(center__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006374"S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006375\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006376Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006377done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006378
6379static PyObject *
6380unicode_center(PyUnicodeObject *self, PyObject *args)
6381{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006382 Py_ssize_t marg, left;
6383 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006384 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006385
Thomas Woutersde017742006-02-16 19:34:37 +00006386 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006387 return NULL;
6388
Tim Peters7a29bd52001-09-12 03:03:31 +00006389 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006390 Py_INCREF(self);
6391 return (PyObject*) self;
6392 }
6393
6394 marg = width - self->length;
6395 left = marg / 2 + (marg & width & 1);
6396
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006397 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006398}
6399
Marc-André Lemburge5034372000-08-08 08:04:29 +00006400#if 0
6401
6402/* This code should go into some future Unicode collation support
6403 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006404 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006405
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006406/* speedy UTF-16 code point order comparison */
6407/* gleaned from: */
6408/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6409
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006410static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006411{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006412 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006413 0, 0, 0, 0, 0, 0, 0, 0,
6414 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006415 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006416};
6417
Guido van Rossumd57fd912000-03-10 22:53:23 +00006418static int
6419unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6420{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006421 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006422
Guido van Rossumd57fd912000-03-10 22:53:23 +00006423 Py_UNICODE *s1 = str1->str;
6424 Py_UNICODE *s2 = str2->str;
6425
6426 len1 = str1->length;
6427 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006428
Guido van Rossumd57fd912000-03-10 22:53:23 +00006429 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006430 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006431
6432 c1 = *s1++;
6433 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006434
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006435 if (c1 > (1<<11) * 26)
6436 c1 += utf16Fixup[c1>>11];
6437 if (c2 > (1<<11) * 26)
6438 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006439 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006440
6441 if (c1 != c2)
6442 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006443
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006444 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006445 }
6446
6447 return (len1 < len2) ? -1 : (len1 != len2);
6448}
6449
Marc-André Lemburge5034372000-08-08 08:04:29 +00006450#else
6451
6452static int
6453unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6454{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006455 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006456
6457 Py_UNICODE *s1 = str1->str;
6458 Py_UNICODE *s2 = str2->str;
6459
6460 len1 = str1->length;
6461 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006462
Marc-André Lemburge5034372000-08-08 08:04:29 +00006463 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006464 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006465
Fredrik Lundh45714e92001-06-26 16:39:36 +00006466 c1 = *s1++;
6467 c2 = *s2++;
6468
6469 if (c1 != c2)
6470 return (c1 < c2) ? -1 : 1;
6471
Marc-André Lemburge5034372000-08-08 08:04:29 +00006472 len1--; len2--;
6473 }
6474
6475 return (len1 < len2) ? -1 : (len1 != len2);
6476}
6477
6478#endif
6479
Guido van Rossumd57fd912000-03-10 22:53:23 +00006480int PyUnicode_Compare(PyObject *left,
6481 PyObject *right)
6482{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006483 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6484 return unicode_compare((PyUnicodeObject *)left,
6485 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006486 PyErr_Format(PyExc_TypeError,
6487 "Can't compare %.100s and %.100s",
6488 left->ob_type->tp_name,
6489 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006490 return -1;
6491}
6492
Martin v. Löwis5b222132007-06-10 09:51:05 +00006493int
6494PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6495{
6496 int i;
6497 Py_UNICODE *id;
6498 assert(PyUnicode_Check(uni));
6499 id = PyUnicode_AS_UNICODE(uni);
6500 /* Compare Unicode string and source character set string */
6501 for (i = 0; id[i] && str[i]; i++)
6502 if (id[i] != str[i])
6503 return ((int)id[i] < (int)str[i]) ? -1 : 1;
6504 if (id[i])
6505 return 1; /* uni is longer */
6506 if (str[i])
6507 return -1; /* str is longer */
6508 return 0;
6509}
6510
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006511
6512#define TEST_COND(cond) \
6513 ((cond) ? Py_True : Py_False)
6514
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006515PyObject *PyUnicode_RichCompare(PyObject *left,
6516 PyObject *right,
6517 int op)
6518{
6519 int result;
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006520
6521 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
6522 PyObject *v;
6523 if (((PyUnicodeObject *) left)->length !=
6524 ((PyUnicodeObject *) right)->length) {
6525 if (op == Py_EQ) {
6526 Py_INCREF(Py_False);
6527 return Py_False;
6528 }
6529 if (op == Py_NE) {
6530 Py_INCREF(Py_True);
6531 return Py_True;
6532 }
6533 }
6534 if (left == right)
6535 result = 0;
6536 else
6537 result = unicode_compare((PyUnicodeObject *)left,
6538 (PyUnicodeObject *)right);
6539
6540 /* Convert the return value to a Boolean */
6541 switch (op) {
6542 case Py_EQ:
6543 v = TEST_COND(result == 0);
6544 break;
6545 case Py_NE:
6546 v = TEST_COND(result != 0);
6547 break;
6548 case Py_LE:
6549 v = TEST_COND(result <= 0);
6550 break;
6551 case Py_GE:
6552 v = TEST_COND(result >= 0);
6553 break;
6554 case Py_LT:
6555 v = TEST_COND(result == -1);
6556 break;
6557 case Py_GT:
6558 v = TEST_COND(result == 1);
6559 break;
6560 default:
6561 PyErr_BadArgument();
6562 return NULL;
6563 }
6564 Py_INCREF(v);
6565 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006566 }
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006567
6568 Py_INCREF(Py_NotImplemented);
6569 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006570}
6571
Guido van Rossum403d68b2000-03-13 15:55:09 +00006572int PyUnicode_Contains(PyObject *container,
6573 PyObject *element)
6574{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006575 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006576 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006577
6578 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006579 sub = PyUnicode_FromObject(element);
6580 if (!sub) {
Walter Dörwald26e0f512007-06-12 16:51:31 +00006581 PyErr_Format(PyExc_TypeError,
6582 "'in <string>' requires string as left operand, not %s",
6583 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006584 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006585 }
6586
Thomas Wouters477c8d52006-05-27 19:21:47 +00006587 str = PyUnicode_FromObject(container);
6588 if (!str) {
6589 Py_DECREF(sub);
6590 return -1;
6591 }
6592
6593 result = stringlib_contains_obj(str, sub);
6594
6595 Py_DECREF(str);
6596 Py_DECREF(sub);
6597
Guido van Rossum403d68b2000-03-13 15:55:09 +00006598 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006599}
6600
Guido van Rossumd57fd912000-03-10 22:53:23 +00006601/* Concat to string or Unicode object giving a new Unicode object. */
6602
6603PyObject *PyUnicode_Concat(PyObject *left,
6604 PyObject *right)
6605{
6606 PyUnicodeObject *u = NULL, *v = NULL, *w;
6607
6608 /* Coerce the two arguments */
6609 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6610 if (u == NULL)
6611 goto onError;
6612 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6613 if (v == NULL)
6614 goto onError;
6615
6616 /* Shortcuts */
6617 if (v == unicode_empty) {
6618 Py_DECREF(v);
6619 return (PyObject *)u;
6620 }
6621 if (u == unicode_empty) {
6622 Py_DECREF(u);
6623 return (PyObject *)v;
6624 }
6625
6626 /* Concat the two Unicode strings */
6627 w = _PyUnicode_New(u->length + v->length);
6628 if (w == NULL)
6629 goto onError;
6630 Py_UNICODE_COPY(w->str, u->str, u->length);
6631 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6632
6633 Py_DECREF(u);
6634 Py_DECREF(v);
6635 return (PyObject *)w;
6636
6637onError:
6638 Py_XDECREF(u);
6639 Py_XDECREF(v);
6640 return NULL;
6641}
6642
Walter Dörwald1ab83302007-05-18 17:15:44 +00006643void
6644PyUnicode_Append(PyObject **pleft, PyObject *right)
6645{
6646 PyObject *new;
6647 if (*pleft == NULL)
6648 return;
6649 if (right == NULL || !PyUnicode_Check(*pleft)) {
6650 Py_DECREF(*pleft);
6651 *pleft = NULL;
6652 return;
6653 }
6654 new = PyUnicode_Concat(*pleft, right);
6655 Py_DECREF(*pleft);
6656 *pleft = new;
6657}
6658
6659void
6660PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6661{
6662 PyUnicode_Append(pleft, right);
6663 Py_XDECREF(right);
6664}
6665
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006666PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006667"S.count(sub[, start[, end]]) -> int\n\
6668\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006669Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006670string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006671interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006672
6673static PyObject *
6674unicode_count(PyUnicodeObject *self, PyObject *args)
6675{
6676 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006677 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006678 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006679 PyObject *result;
6680
Guido van Rossumb8872e62000-05-09 14:14:27 +00006681 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6682 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006683 return NULL;
6684
6685 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006686 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006687 if (substring == NULL)
6688 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006689
Thomas Wouters477c8d52006-05-27 19:21:47 +00006690 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006691
Christian Heimes217cfd12007-12-02 14:31:20 +00006692 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006693 stringlib_count(self->str + start, end - start,
6694 substring->str, substring->length)
6695 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006696
6697 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006698
Guido van Rossumd57fd912000-03-10 22:53:23 +00006699 return result;
6700}
6701
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006702PyDoc_STRVAR(encode__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006703"S.encode([encoding[, errors]]) -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006704\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00006705Encode S using the codec registered for encoding. encoding defaults\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006706to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006707handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006708a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6709'xmlcharrefreplace' as well as any other name registered with\n\
6710codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006711
6712static PyObject *
6713unicode_encode(PyUnicodeObject *self, PyObject *args)
6714{
6715 char *encoding = NULL;
6716 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006717 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00006718
Guido van Rossumd57fd912000-03-10 22:53:23 +00006719 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6720 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00006721 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006722 if (v == NULL)
6723 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00006724 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006725 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006726 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006727 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00006728 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006729 Py_DECREF(v);
6730 return NULL;
6731 }
6732 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006733
6734 onError:
6735 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006736}
6737
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006738PyDoc_STRVAR(expandtabs__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006739"S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006740\n\
6741Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006742If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006743
6744static PyObject*
6745unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6746{
6747 Py_UNICODE *e;
6748 Py_UNICODE *p;
6749 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006750 Py_UNICODE *qe;
6751 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006752 PyUnicodeObject *u;
6753 int tabsize = 8;
6754
6755 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6756 return NULL;
6757
Thomas Wouters7e474022000-07-16 12:04:32 +00006758 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006759 i = 0; /* chars up to and including most recent \n or \r */
6760 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6761 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006762 for (p = self->str; p < e; p++)
6763 if (*p == '\t') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006764 if (tabsize > 0) {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006765 incr = tabsize - (j % tabsize); /* cannot overflow */
6766 if (j > PY_SSIZE_T_MAX - incr)
6767 goto overflow1;
6768 j += incr;
6769 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006770 }
6771 else {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006772 if (j > PY_SSIZE_T_MAX - 1)
6773 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006774 j++;
6775 if (*p == '\n' || *p == '\r') {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006776 if (i > PY_SSIZE_T_MAX - j)
6777 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006778 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006779 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006780 }
6781 }
6782
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006783 if (i > PY_SSIZE_T_MAX - j)
6784 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006785
Guido van Rossumd57fd912000-03-10 22:53:23 +00006786 /* Second pass: create output string and fill it */
6787 u = _PyUnicode_New(i + j);
6788 if (!u)
6789 return NULL;
6790
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006791 j = 0; /* same as in first pass */
6792 q = u->str; /* next output char */
6793 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006794
6795 for (p = self->str; p < e; p++)
6796 if (*p == '\t') {
6797 if (tabsize > 0) {
6798 i = tabsize - (j % tabsize);
6799 j += i;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006800 while (i--) {
6801 if (q >= qe)
6802 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006803 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006804 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006805 }
6806 }
6807 else {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006808 if (q >= qe)
6809 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006810 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006811 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006812 if (*p == '\n' || *p == '\r')
6813 j = 0;
6814 }
6815
6816 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006817
6818 overflow2:
6819 Py_DECREF(u);
6820 overflow1:
6821 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6822 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006823}
6824
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006825PyDoc_STRVAR(find__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006826"S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006827\n\
6828Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00006829such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006830arguments start and end are interpreted as in slice notation.\n\
6831\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006832Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006833
6834static PyObject *
6835unicode_find(PyUnicodeObject *self, PyObject *args)
6836{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006837 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00006838 Py_ssize_t start;
6839 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006840 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006841
Christian Heimes9cd17752007-11-18 19:35:23 +00006842 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006843 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006844
Thomas Wouters477c8d52006-05-27 19:21:47 +00006845 result = stringlib_find_slice(
6846 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6847 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6848 start, end
6849 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006850
6851 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006852
Christian Heimes217cfd12007-12-02 14:31:20 +00006853 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006854}
6855
6856static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006857unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006858{
6859 if (index < 0 || index >= self->length) {
6860 PyErr_SetString(PyExc_IndexError, "string index out of range");
6861 return NULL;
6862 }
6863
6864 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6865}
6866
Guido van Rossumc2504932007-09-18 19:42:40 +00006867/* Believe it or not, this produces the same value for ASCII strings
6868 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006869static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00006870unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006871{
Guido van Rossumc2504932007-09-18 19:42:40 +00006872 Py_ssize_t len;
6873 Py_UNICODE *p;
6874 long x;
6875
6876 if (self->hash != -1)
6877 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00006878 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00006879 p = self->str;
6880 x = *p << 7;
6881 while (--len >= 0)
6882 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00006883 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00006884 if (x == -1)
6885 x = -2;
6886 self->hash = x;
6887 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006888}
6889
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006890PyDoc_STRVAR(index__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006891"S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006892\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006893Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006894
6895static PyObject *
6896unicode_index(PyUnicodeObject *self, PyObject *args)
6897{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006898 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006899 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00006900 Py_ssize_t start;
6901 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006902
Christian Heimes9cd17752007-11-18 19:35:23 +00006903 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006904 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006905
Thomas Wouters477c8d52006-05-27 19:21:47 +00006906 result = stringlib_find_slice(
6907 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6908 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6909 start, end
6910 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006911
6912 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006913
Guido van Rossumd57fd912000-03-10 22:53:23 +00006914 if (result < 0) {
6915 PyErr_SetString(PyExc_ValueError, "substring not found");
6916 return NULL;
6917 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006918
Christian Heimes217cfd12007-12-02 14:31:20 +00006919 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006920}
6921
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006922PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006923"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006924\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006925Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006926at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006927
6928static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006929unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006930{
6931 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6932 register const Py_UNICODE *e;
6933 int cased;
6934
Guido van Rossumd57fd912000-03-10 22:53:23 +00006935 /* Shortcut for single character strings */
6936 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006937 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006938
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006939 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006940 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006941 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006942
Guido van Rossumd57fd912000-03-10 22:53:23 +00006943 e = p + PyUnicode_GET_SIZE(self);
6944 cased = 0;
6945 for (; p < e; p++) {
6946 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006947
Guido van Rossumd57fd912000-03-10 22:53:23 +00006948 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006949 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006950 else if (!cased && Py_UNICODE_ISLOWER(ch))
6951 cased = 1;
6952 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006953 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006954}
6955
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006956PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006957"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006958\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006959Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006960at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006961
6962static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006963unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006964{
6965 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6966 register const Py_UNICODE *e;
6967 int cased;
6968
Guido van Rossumd57fd912000-03-10 22:53:23 +00006969 /* Shortcut for single character strings */
6970 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006971 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006972
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006973 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006974 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006975 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006976
Guido van Rossumd57fd912000-03-10 22:53:23 +00006977 e = p + PyUnicode_GET_SIZE(self);
6978 cased = 0;
6979 for (; p < e; p++) {
6980 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006981
Guido van Rossumd57fd912000-03-10 22:53:23 +00006982 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006983 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006984 else if (!cased && Py_UNICODE_ISUPPER(ch))
6985 cased = 1;
6986 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006987 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006988}
6989
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006990PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006991"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006992\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006993Return True if S is a titlecased string and there is at least one\n\
6994character in S, i.e. upper- and titlecase characters may only\n\
6995follow uncased characters and lowercase characters only cased ones.\n\
6996Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006997
6998static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006999unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007000{
7001 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7002 register const Py_UNICODE *e;
7003 int cased, previous_is_cased;
7004
Guido van Rossumd57fd912000-03-10 22:53:23 +00007005 /* Shortcut for single character strings */
7006 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007007 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7008 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007009
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007010 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007011 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007012 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007013
Guido van Rossumd57fd912000-03-10 22:53:23 +00007014 e = p + PyUnicode_GET_SIZE(self);
7015 cased = 0;
7016 previous_is_cased = 0;
7017 for (; p < e; p++) {
7018 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007019
Guido van Rossumd57fd912000-03-10 22:53:23 +00007020 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7021 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007022 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007023 previous_is_cased = 1;
7024 cased = 1;
7025 }
7026 else if (Py_UNICODE_ISLOWER(ch)) {
7027 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007028 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007029 previous_is_cased = 1;
7030 cased = 1;
7031 }
7032 else
7033 previous_is_cased = 0;
7034 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007035 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007036}
7037
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007038PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007039"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007040\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007041Return True if all characters in S are whitespace\n\
7042and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007043
7044static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007045unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007046{
7047 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7048 register const Py_UNICODE *e;
7049
Guido van Rossumd57fd912000-03-10 22:53:23 +00007050 /* Shortcut for single character strings */
7051 if (PyUnicode_GET_SIZE(self) == 1 &&
7052 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007053 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007054
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007055 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007056 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007057 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007058
Guido van Rossumd57fd912000-03-10 22:53:23 +00007059 e = p + PyUnicode_GET_SIZE(self);
7060 for (; p < e; p++) {
7061 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007062 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007063 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007064 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007065}
7066
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007067PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007068"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007069\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007070Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007071and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007072
7073static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007074unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007075{
7076 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7077 register const Py_UNICODE *e;
7078
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007079 /* Shortcut for single character strings */
7080 if (PyUnicode_GET_SIZE(self) == 1 &&
7081 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007082 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007083
7084 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007085 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007086 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007087
7088 e = p + PyUnicode_GET_SIZE(self);
7089 for (; p < e; p++) {
7090 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007091 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007092 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007093 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007094}
7095
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007096PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007097"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007098\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007099Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007100and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007101
7102static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007103unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007104{
7105 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7106 register const Py_UNICODE *e;
7107
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007108 /* Shortcut for single character strings */
7109 if (PyUnicode_GET_SIZE(self) == 1 &&
7110 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007111 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007112
7113 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007114 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007115 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007116
7117 e = p + PyUnicode_GET_SIZE(self);
7118 for (; p < e; p++) {
7119 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007120 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007121 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007122 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007123}
7124
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007125PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007126"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007127\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007128Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007129False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007130
7131static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007132unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007133{
7134 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7135 register const Py_UNICODE *e;
7136
Guido van Rossumd57fd912000-03-10 22:53:23 +00007137 /* Shortcut for single character strings */
7138 if (PyUnicode_GET_SIZE(self) == 1 &&
7139 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007140 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007141
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007142 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007143 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007144 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007145
Guido van Rossumd57fd912000-03-10 22:53:23 +00007146 e = p + PyUnicode_GET_SIZE(self);
7147 for (; p < e; p++) {
7148 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007149 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007150 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007151 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007152}
7153
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007154PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007155"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007156\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007157Return True if all characters in S are digits\n\
7158and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007159
7160static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007161unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007162{
7163 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7164 register const Py_UNICODE *e;
7165
Guido van Rossumd57fd912000-03-10 22:53:23 +00007166 /* Shortcut for single character strings */
7167 if (PyUnicode_GET_SIZE(self) == 1 &&
7168 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007169 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007170
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007171 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007172 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007173 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007174
Guido van Rossumd57fd912000-03-10 22:53:23 +00007175 e = p + PyUnicode_GET_SIZE(self);
7176 for (; p < e; p++) {
7177 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007178 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007179 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007180 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007181}
7182
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007183PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007184"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007185\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007186Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007187False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007188
7189static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007190unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007191{
7192 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7193 register const Py_UNICODE *e;
7194
Guido van Rossumd57fd912000-03-10 22:53:23 +00007195 /* Shortcut for single character strings */
7196 if (PyUnicode_GET_SIZE(self) == 1 &&
7197 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007198 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007199
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007200 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007201 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007202 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007203
Guido van Rossumd57fd912000-03-10 22:53:23 +00007204 e = p + PyUnicode_GET_SIZE(self);
7205 for (; p < e; p++) {
7206 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007207 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007208 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007209 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007210}
7211
Martin v. Löwis47383402007-08-15 07:32:56 +00007212int
7213PyUnicode_IsIdentifier(PyObject *self)
7214{
7215 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7216 register const Py_UNICODE *e;
7217
7218 /* Special case for empty strings */
7219 if (PyUnicode_GET_SIZE(self) == 0)
7220 return 0;
7221
7222 /* PEP 3131 says that the first character must be in
7223 XID_Start and subsequent characters in XID_Continue,
7224 and for the ASCII range, the 2.x rules apply (i.e
7225 start with letters and underscore, continue with
7226 letters, digits, underscore). However, given the current
7227 definition of XID_Start and XID_Continue, it is sufficient
7228 to check just for these, except that _ must be allowed
7229 as starting an identifier. */
7230 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7231 return 0;
7232
7233 e = p + PyUnicode_GET_SIZE(self);
7234 for (p++; p < e; p++) {
7235 if (!_PyUnicode_IsXidContinue(*p))
7236 return 0;
7237 }
7238 return 1;
7239}
7240
7241PyDoc_STRVAR(isidentifier__doc__,
7242"S.isidentifier() -> bool\n\
7243\n\
7244Return True if S is a valid identifier according\n\
7245to the language definition.");
7246
7247static PyObject*
7248unicode_isidentifier(PyObject *self)
7249{
7250 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7251}
7252
Georg Brandl559e5d72008-06-11 18:37:52 +00007253PyDoc_STRVAR(isprintable__doc__,
7254"S.isprintable() -> bool\n\
7255\n\
7256Return True if all characters in S are considered\n\
7257printable in repr() or S is empty, False otherwise.");
7258
7259static PyObject*
7260unicode_isprintable(PyObject *self)
7261{
7262 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7263 register const Py_UNICODE *e;
7264
7265 /* Shortcut for single character strings */
7266 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7267 Py_RETURN_TRUE;
7268 }
7269
7270 e = p + PyUnicode_GET_SIZE(self);
7271 for (; p < e; p++) {
7272 if (!Py_UNICODE_ISPRINTABLE(*p)) {
7273 Py_RETURN_FALSE;
7274 }
7275 }
7276 Py_RETURN_TRUE;
7277}
7278
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007279PyDoc_STRVAR(join__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007280"S.join(sequence) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007281\n\
7282Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007283sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007284
7285static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007286unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007287{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007288 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007289}
7290
Martin v. Löwis18e16552006-02-15 17:27:45 +00007291static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007292unicode_length(PyUnicodeObject *self)
7293{
7294 return self->length;
7295}
7296
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007297PyDoc_STRVAR(ljust__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007298"S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007299\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007300Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007301done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007302
7303static PyObject *
7304unicode_ljust(PyUnicodeObject *self, PyObject *args)
7305{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007306 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007307 Py_UNICODE fillchar = ' ';
7308
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007309 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007310 return NULL;
7311
Tim Peters7a29bd52001-09-12 03:03:31 +00007312 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007313 Py_INCREF(self);
7314 return (PyObject*) self;
7315 }
7316
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007317 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007318}
7319
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007320PyDoc_STRVAR(lower__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007321"S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007322\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007323Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007324
7325static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007326unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007327{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007328 return fixup(self, fixlower);
7329}
7330
/* Selectors for which end(s) of the string a strip operation trims.
   The values double as indexes into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by the selectors above */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for selector i: the format string without its 3-character
   "|O:" prefix. */
#define STRIPNAME(i) (&stripformat[i][3])
7339
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007340/* externally visible for str.strip(unicode) */
7341PyObject *
7342_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7343{
7344 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007345 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007346 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007347 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7348 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007349
Thomas Wouters477c8d52006-05-27 19:21:47 +00007350 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7351
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007352 i = 0;
7353 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007354 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7355 i++;
7356 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007357 }
7358
7359 j = len;
7360 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007361 do {
7362 j--;
7363 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7364 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007365 }
7366
7367 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007368 Py_INCREF(self);
7369 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007370 }
7371 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00007372 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007373}
7374
Guido van Rossumd57fd912000-03-10 22:53:23 +00007375
7376static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007377do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007378{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007379 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007380 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007381
7382 i = 0;
7383 if (striptype != RIGHTSTRIP) {
7384 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7385 i++;
7386 }
7387 }
7388
7389 j = len;
7390 if (striptype != LEFTSTRIP) {
7391 do {
7392 j--;
7393 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7394 j++;
7395 }
7396
7397 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7398 Py_INCREF(self);
7399 return (PyObject*)self;
7400 }
7401 else
7402 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007403}
7404
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007405
7406static PyObject *
7407do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7408{
7409 PyObject *sep = NULL;
7410
7411 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7412 return NULL;
7413
7414 if (sep != NULL && sep != Py_None) {
7415 if (PyUnicode_Check(sep))
7416 return _PyUnicode_XStrip(self, striptype, sep);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007417 else {
7418 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00007419 "%s arg must be None or str",
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007420 STRIPNAME(striptype));
7421 return NULL;
7422 }
7423 }
7424
7425 return do_strip(self, striptype);
7426}
7427
7428
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007429PyDoc_STRVAR(strip__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007430"S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007431\n\
7432Return a copy of the string S with leading and trailing\n\
7433whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007434If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007435
7436static PyObject *
7437unicode_strip(PyUnicodeObject *self, PyObject *args)
7438{
7439 if (PyTuple_GET_SIZE(args) == 0)
7440 return do_strip(self, BOTHSTRIP); /* Common case */
7441 else
7442 return do_argstrip(self, BOTHSTRIP, args);
7443}
7444
7445
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007446PyDoc_STRVAR(lstrip__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007447"S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007448\n\
7449Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007450If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007451
7452static PyObject *
7453unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7454{
7455 if (PyTuple_GET_SIZE(args) == 0)
7456 return do_strip(self, LEFTSTRIP); /* Common case */
7457 else
7458 return do_argstrip(self, LEFTSTRIP, args);
7459}
7460
7461
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007462PyDoc_STRVAR(rstrip__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007463"S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007464\n\
7465Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007466If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007467
7468static PyObject *
7469unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7470{
7471 if (PyTuple_GET_SIZE(args) == 0)
7472 return do_strip(self, RIGHTSTRIP); /* Common case */
7473 else
7474 return do_argstrip(self, RIGHTSTRIP, args);
7475}
7476
7477
Guido van Rossumd57fd912000-03-10 22:53:23 +00007478static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007479unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007480{
7481 PyUnicodeObject *u;
7482 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007483 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007484 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007485
7486 if (len < 0)
7487 len = 0;
7488
Tim Peters7a29bd52001-09-12 03:03:31 +00007489 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007490 /* no repeat, return original string */
7491 Py_INCREF(str);
7492 return (PyObject*) str;
7493 }
Tim Peters8f422462000-09-09 06:13:41 +00007494
7495 /* ensure # of chars needed doesn't overflow int and # of bytes
7496 * needed doesn't overflow size_t
7497 */
7498 nchars = len * str->length;
7499 if (len && nchars / len != str->length) {
7500 PyErr_SetString(PyExc_OverflowError,
7501 "repeated string is too long");
7502 return NULL;
7503 }
7504 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7505 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7506 PyErr_SetString(PyExc_OverflowError,
7507 "repeated string is too long");
7508 return NULL;
7509 }
7510 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007511 if (!u)
7512 return NULL;
7513
7514 p = u->str;
7515
Thomas Wouters477c8d52006-05-27 19:21:47 +00007516 if (str->length == 1 && len > 0) {
7517 Py_UNICODE_FILL(p, str->str[0], len);
7518 } else {
7519 Py_ssize_t done = 0; /* number of characters copied this far */
7520 if (done < nchars) {
7521 Py_UNICODE_COPY(p, str->str, str->length);
7522 done = str->length;
7523 }
7524 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00007525 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007526 Py_UNICODE_COPY(p+done, p, n);
7527 done += n;
7528 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007529 }
7530
7531 return (PyObject*) u;
7532}
7533
7534PyObject *PyUnicode_Replace(PyObject *obj,
7535 PyObject *subobj,
7536 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007537 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007538{
7539 PyObject *self;
7540 PyObject *str1;
7541 PyObject *str2;
7542 PyObject *result;
7543
7544 self = PyUnicode_FromObject(obj);
7545 if (self == NULL)
7546 return NULL;
7547 str1 = PyUnicode_FromObject(subobj);
7548 if (str1 == NULL) {
7549 Py_DECREF(self);
7550 return NULL;
7551 }
7552 str2 = PyUnicode_FromObject(replobj);
7553 if (str2 == NULL) {
7554 Py_DECREF(self);
7555 Py_DECREF(str1);
7556 return NULL;
7557 }
Tim Petersced69f82003-09-16 20:30:58 +00007558 result = replace((PyUnicodeObject *)self,
7559 (PyUnicodeObject *)str1,
7560 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007561 maxcount);
7562 Py_DECREF(self);
7563 Py_DECREF(str1);
7564 Py_DECREF(str2);
7565 return result;
7566}
7567
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007568PyDoc_STRVAR(replace__doc__,
Georg Brandlf08a9dd2008-06-10 16:57:31 +00007569"S.replace (old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007570\n\
7571Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00007572old replaced by new. If the optional argument count is\n\
7573given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007574
7575static PyObject*
7576unicode_replace(PyUnicodeObject *self, PyObject *args)
7577{
7578 PyUnicodeObject *str1;
7579 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007580 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007581 PyObject *result;
7582
Martin v. Löwis18e16552006-02-15 17:27:45 +00007583 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007584 return NULL;
7585 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7586 if (str1 == NULL)
7587 return NULL;
7588 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007589 if (str2 == NULL) {
7590 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007591 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007592 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007593
7594 result = replace(self, str1, str2, maxcount);
7595
7596 Py_DECREF(str1);
7597 Py_DECREF(str2);
7598 return result;
7599}
7600
7601static
7602PyObject *unicode_repr(PyObject *unicode)
7603{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007604 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007605 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007606 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7607 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7608
7609 /* XXX(nnorwitz): rather than over-allocating, it would be
7610 better to choose a different scheme. Perhaps scan the
7611 first N-chars of the string and allocate based on that size.
7612 */
7613 /* Initial allocation is based on the longest-possible unichr
7614 escape.
7615
7616 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7617 unichr, so in this case it's the longest unichr escape. In
7618 narrow (UTF-16) builds this is five chars per source unichr
7619 since there are two unichrs in the surrogate pair, so in narrow
7620 (UTF-16) builds it's not the longest unichr escape.
7621
7622 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7623 so in the narrow (UTF-16) build case it's the longest unichr
7624 escape.
7625 */
7626
Walter Dörwald1ab83302007-05-18 17:15:44 +00007627 repr = PyUnicode_FromUnicode(NULL,
Walter Dörwald79e913e2007-05-12 11:08:06 +00007628 2 /* quotes */
7629#ifdef Py_UNICODE_WIDE
7630 + 10*size
7631#else
7632 + 6*size
7633#endif
7634 + 1);
7635 if (repr == NULL)
7636 return NULL;
7637
Walter Dörwald1ab83302007-05-18 17:15:44 +00007638 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007639
7640 /* Add quote */
7641 *p++ = (findchar(s, size, '\'') &&
7642 !findchar(s, size, '"')) ? '"' : '\'';
7643 while (size-- > 0) {
7644 Py_UNICODE ch = *s++;
7645
7646 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007647 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007648 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007649 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007650 continue;
7651 }
7652
Georg Brandl559e5d72008-06-11 18:37:52 +00007653 /* Map special whitespace to '\t', \n', '\r' */
7654 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007655 *p++ = '\\';
7656 *p++ = 't';
7657 }
7658 else if (ch == '\n') {
7659 *p++ = '\\';
7660 *p++ = 'n';
7661 }
7662 else if (ch == '\r') {
7663 *p++ = '\\';
7664 *p++ = 'r';
7665 }
7666
7667 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00007668 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007669 *p++ = '\\';
7670 *p++ = 'x';
7671 *p++ = hexdigits[(ch >> 4) & 0x000F];
7672 *p++ = hexdigits[ch & 0x000F];
7673 }
7674
Georg Brandl559e5d72008-06-11 18:37:52 +00007675 /* Copy ASCII characters as-is */
7676 else if (ch < 0x7F) {
7677 *p++ = ch;
7678 }
7679
7680 /* Non-ASCII characters */
7681 else {
7682 Py_UCS4 ucs = ch;
7683
7684#ifndef Py_UNICODE_WIDE
7685 Py_UNICODE ch2 = 0;
7686 /* Get code point from surrogate pair */
7687 if (size > 0) {
7688 ch2 = *s;
7689 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
7690 && ch2 <= 0xDFFF) {
7691 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
7692 + 0x00010000;
7693 s++;
7694 size--;
7695 }
7696 }
7697#endif
7698 /* Map Unicode whitespace and control characters
7699 (categories Z* and C* except ASCII space)
7700 */
7701 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
7702 /* Map 8-bit characters to '\xhh' */
7703 if (ucs <= 0xff) {
7704 *p++ = '\\';
7705 *p++ = 'x';
7706 *p++ = hexdigits[(ch >> 4) & 0x000F];
7707 *p++ = hexdigits[ch & 0x000F];
7708 }
7709 /* Map 21-bit characters to '\U00xxxxxx' */
7710 else if (ucs >= 0x10000) {
7711 *p++ = '\\';
7712 *p++ = 'U';
7713 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7714 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7715 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7716 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7717 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7718 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7719 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7720 *p++ = hexdigits[ucs & 0x0000000F];
7721 }
7722 /* Map 16-bit characters to '\uxxxx' */
7723 else {
7724 *p++ = '\\';
7725 *p++ = 'u';
7726 *p++ = hexdigits[(ucs >> 12) & 0x000F];
7727 *p++ = hexdigits[(ucs >> 8) & 0x000F];
7728 *p++ = hexdigits[(ucs >> 4) & 0x000F];
7729 *p++ = hexdigits[ucs & 0x000F];
7730 }
7731 }
7732 /* Copy characters as-is */
7733 else {
7734 *p++ = ch;
7735#ifndef Py_UNICODE_WIDE
7736 if (ucs >= 0x10000)
7737 *p++ = ch2;
7738#endif
7739 }
7740 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00007741 }
7742 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007743 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00007744
7745 *p = '\0';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007746 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00007747 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007748}
7749
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007750PyDoc_STRVAR(rfind__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007751"S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007752\n\
7753Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007754such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007755arguments start and end are interpreted as in slice notation.\n\
7756\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007757Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007758
7759static PyObject *
7760unicode_rfind(PyUnicodeObject *self, PyObject *args)
7761{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007762 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007763 Py_ssize_t start;
7764 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007765 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007766
Christian Heimes9cd17752007-11-18 19:35:23 +00007767 if (!_ParseTupleFinds(args, &substring, &start, &end))
7768 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007769
Thomas Wouters477c8d52006-05-27 19:21:47 +00007770 result = stringlib_rfind_slice(
7771 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7772 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7773 start, end
7774 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007775
7776 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007777
Christian Heimes217cfd12007-12-02 14:31:20 +00007778 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007779}
7780
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007781PyDoc_STRVAR(rindex__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007782"S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007783\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007784Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007785
7786static PyObject *
7787unicode_rindex(PyUnicodeObject *self, PyObject *args)
7788{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007789 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007790 Py_ssize_t start;
7791 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007792 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007793
Christian Heimes9cd17752007-11-18 19:35:23 +00007794 if (!_ParseTupleFinds(args, &substring, &start, &end))
7795 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007796
Thomas Wouters477c8d52006-05-27 19:21:47 +00007797 result = stringlib_rfind_slice(
7798 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7799 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7800 start, end
7801 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007802
7803 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007804
Guido van Rossumd57fd912000-03-10 22:53:23 +00007805 if (result < 0) {
7806 PyErr_SetString(PyExc_ValueError, "substring not found");
7807 return NULL;
7808 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007809 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007810}
7811
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007812PyDoc_STRVAR(rjust__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007813"S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007814\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007815Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007816done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007817
7818static PyObject *
7819unicode_rjust(PyUnicodeObject *self, PyObject *args)
7820{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007821 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007822 Py_UNICODE fillchar = ' ';
7823
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007824 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007825 return NULL;
7826
Tim Peters7a29bd52001-09-12 03:03:31 +00007827 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007828 Py_INCREF(self);
7829 return (PyObject*) self;
7830 }
7831
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007832 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007833}
7834
Guido van Rossumd57fd912000-03-10 22:53:23 +00007835PyObject *PyUnicode_Split(PyObject *s,
7836 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007837 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007838{
7839 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007840
Guido van Rossumd57fd912000-03-10 22:53:23 +00007841 s = PyUnicode_FromObject(s);
7842 if (s == NULL)
7843 return NULL;
7844 if (sep != NULL) {
7845 sep = PyUnicode_FromObject(sep);
7846 if (sep == NULL) {
7847 Py_DECREF(s);
7848 return NULL;
7849 }
7850 }
7851
7852 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7853
7854 Py_DECREF(s);
7855 Py_XDECREF(sep);
7856 return result;
7857}
7858
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007859PyDoc_STRVAR(split__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007860"S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007861\n\
7862Return a list of the words in S, using sep as the\n\
7863delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00007864splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00007865whitespace string is a separator and empty strings are\n\
7866removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007867
7868static PyObject*
7869unicode_split(PyUnicodeObject *self, PyObject *args)
7870{
7871 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007872 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007873
Martin v. Löwis18e16552006-02-15 17:27:45 +00007874 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007875 return NULL;
7876
7877 if (substring == Py_None)
7878 return split(self, NULL, maxcount);
7879 else if (PyUnicode_Check(substring))
7880 return split(self, (PyUnicodeObject *)substring, maxcount);
7881 else
7882 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7883}
7884
Thomas Wouters477c8d52006-05-27 19:21:47 +00007885PyObject *
7886PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7887{
7888 PyObject* str_obj;
7889 PyObject* sep_obj;
7890 PyObject* out;
7891
7892 str_obj = PyUnicode_FromObject(str_in);
7893 if (!str_obj)
7894 return NULL;
7895 sep_obj = PyUnicode_FromObject(sep_in);
7896 if (!sep_obj) {
7897 Py_DECREF(str_obj);
7898 return NULL;
7899 }
7900
7901 out = stringlib_partition(
7902 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7903 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7904 );
7905
7906 Py_DECREF(sep_obj);
7907 Py_DECREF(str_obj);
7908
7909 return out;
7910}
7911
7912
7913PyObject *
7914PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7915{
7916 PyObject* str_obj;
7917 PyObject* sep_obj;
7918 PyObject* out;
7919
7920 str_obj = PyUnicode_FromObject(str_in);
7921 if (!str_obj)
7922 return NULL;
7923 sep_obj = PyUnicode_FromObject(sep_in);
7924 if (!sep_obj) {
7925 Py_DECREF(str_obj);
7926 return NULL;
7927 }
7928
7929 out = stringlib_rpartition(
7930 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7931 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7932 );
7933
7934 Py_DECREF(sep_obj);
7935 Py_DECREF(str_obj);
7936
7937 return out;
7938}
7939
7940PyDoc_STRVAR(partition__doc__,
7941"S.partition(sep) -> (head, sep, tail)\n\
7942\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00007943Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007944the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007945found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007946
7947static PyObject*
7948unicode_partition(PyUnicodeObject *self, PyObject *separator)
7949{
7950 return PyUnicode_Partition((PyObject *)self, separator);
7951}
7952
7953PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007954"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007955\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00007956Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007957the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007958separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007959
7960static PyObject*
7961unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7962{
7963 return PyUnicode_RPartition((PyObject *)self, separator);
7964}
7965
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007966PyObject *PyUnicode_RSplit(PyObject *s,
7967 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007968 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007969{
7970 PyObject *result;
7971
7972 s = PyUnicode_FromObject(s);
7973 if (s == NULL)
7974 return NULL;
7975 if (sep != NULL) {
7976 sep = PyUnicode_FromObject(sep);
7977 if (sep == NULL) {
7978 Py_DECREF(s);
7979 return NULL;
7980 }
7981 }
7982
7983 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7984
7985 Py_DECREF(s);
7986 Py_XDECREF(sep);
7987 return result;
7988}
7989
7990PyDoc_STRVAR(rsplit__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007991"S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007992\n\
7993Return a list of the words in S, using sep as the\n\
7994delimiter string, starting at the end of the string and\n\
7995working to the front. If maxsplit is given, at most maxsplit\n\
7996splits are done. If sep is not specified, any whitespace string\n\
7997is a separator.");
7998
7999static PyObject*
8000unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8001{
8002 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008003 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008004
Martin v. Löwis18e16552006-02-15 17:27:45 +00008005 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008006 return NULL;
8007
8008 if (substring == Py_None)
8009 return rsplit(self, NULL, maxcount);
8010 else if (PyUnicode_Check(substring))
8011 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
8012 else
8013 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
8014}
8015
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008016PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson4469d0c2008-11-30 22:46:23 +00008017"S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008018\n\
8019Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008020Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008021is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008022
8023static PyObject*
8024unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8025{
Guido van Rossum86662912000-04-11 15:38:46 +00008026 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008027
Guido van Rossum86662912000-04-11 15:38:46 +00008028 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008029 return NULL;
8030
Guido van Rossum86662912000-04-11 15:38:46 +00008031 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008032}
8033
8034static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008035PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008036{
Walter Dörwald346737f2007-05-31 10:44:43 +00008037 if (PyUnicode_CheckExact(self)) {
8038 Py_INCREF(self);
8039 return self;
8040 } else
8041 /* Subtype -- return genuine unicode string with the same value. */
8042 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8043 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008044}
8045
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008046PyDoc_STRVAR(swapcase__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008047"S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008048\n\
8049Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008050and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008051
8052static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008053unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008054{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008055 return fixup(self, fixswapcase);
8056}
8057
Georg Brandlceee0772007-11-27 23:48:05 +00008058PyDoc_STRVAR(maketrans__doc__,
8059"str.maketrans(x[, y[, z]]) -> dict (static method)\n\
8060\n\
8061Return a translation table usable for str.translate().\n\
8062If there is only one argument, it must be a dictionary mapping Unicode\n\
8063ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008064Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008065If there are two arguments, they must be strings of equal length, and\n\
8066in the resulting dictionary, each character in x will be mapped to the\n\
8067character at the same position in y. If there is a third argument, it\n\
8068must be a string, whose characters will be mapped to None in the result.");
8069
8070static PyObject*
8071unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8072{
8073 PyObject *x, *y = NULL, *z = NULL;
8074 PyObject *new = NULL, *key, *value;
8075 Py_ssize_t i = 0;
8076 int res;
8077
8078 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8079 return NULL;
8080 new = PyDict_New();
8081 if (!new)
8082 return NULL;
8083 if (y != NULL) {
8084 /* x must be a string too, of equal length */
8085 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8086 if (!PyUnicode_Check(x)) {
8087 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8088 "be a string if there is a second argument");
8089 goto err;
8090 }
8091 if (PyUnicode_GET_SIZE(x) != ylen) {
8092 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8093 "arguments must have equal length");
8094 goto err;
8095 }
8096 /* create entries for translating chars in x to those in y */
8097 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008098 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8099 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008100 if (!key || !value)
8101 goto err;
8102 res = PyDict_SetItem(new, key, value);
8103 Py_DECREF(key);
8104 Py_DECREF(value);
8105 if (res < 0)
8106 goto err;
8107 }
8108 /* create entries for deleting chars in z */
8109 if (z != NULL) {
8110 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008111 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008112 if (!key)
8113 goto err;
8114 res = PyDict_SetItem(new, key, Py_None);
8115 Py_DECREF(key);
8116 if (res < 0)
8117 goto err;
8118 }
8119 }
8120 } else {
8121 /* x must be a dict */
8122 if (!PyDict_Check(x)) {
8123 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8124 "to maketrans it must be a dict");
8125 goto err;
8126 }
8127 /* copy entries into the new dict, converting string keys to int keys */
8128 while (PyDict_Next(x, &i, &key, &value)) {
8129 if (PyUnicode_Check(key)) {
8130 /* convert string keys to integer keys */
8131 PyObject *newkey;
8132 if (PyUnicode_GET_SIZE(key) != 1) {
8133 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8134 "table must be of length 1");
8135 goto err;
8136 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008137 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008138 if (!newkey)
8139 goto err;
8140 res = PyDict_SetItem(new, newkey, value);
8141 Py_DECREF(newkey);
8142 if (res < 0)
8143 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008144 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008145 /* just keep integer keys */
8146 if (PyDict_SetItem(new, key, value) < 0)
8147 goto err;
8148 } else {
8149 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8150 "be strings or integers");
8151 goto err;
8152 }
8153 }
8154 }
8155 return new;
8156 err:
8157 Py_DECREF(new);
8158 return NULL;
8159}
8160
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008161PyDoc_STRVAR(translate__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008162"S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008163\n\
8164Return a copy of the string S, where all characters have been mapped\n\
8165through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008166Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008167Unmapped characters are left untouched. Characters mapped to None\n\
8168are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008169
8170static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008171unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008172{
Georg Brandlceee0772007-11-27 23:48:05 +00008173 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008174}
8175
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008176PyDoc_STRVAR(upper__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008177"S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008178\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008179Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008180
8181static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008182unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008183{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008184 return fixup(self, fixupper);
8185}
8186
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008187PyDoc_STRVAR(zfill__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008188"S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008189\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008190Pad a numeric string S with zeros on the left, to fill a field\n\
8191of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008192
8193static PyObject *
8194unicode_zfill(PyUnicodeObject *self, PyObject *args)
8195{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008196 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008197 PyUnicodeObject *u;
8198
Martin v. Löwis18e16552006-02-15 17:27:45 +00008199 Py_ssize_t width;
8200 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008201 return NULL;
8202
8203 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008204 if (PyUnicode_CheckExact(self)) {
8205 Py_INCREF(self);
8206 return (PyObject*) self;
8207 }
8208 else
8209 return PyUnicode_FromUnicode(
8210 PyUnicode_AS_UNICODE(self),
8211 PyUnicode_GET_SIZE(self)
8212 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008213 }
8214
8215 fill = width - self->length;
8216
8217 u = pad(self, fill, 0, '0');
8218
Walter Dörwald068325e2002-04-15 13:36:47 +00008219 if (u == NULL)
8220 return NULL;
8221
Guido van Rossumd57fd912000-03-10 22:53:23 +00008222 if (u->str[fill] == '+' || u->str[fill] == '-') {
8223 /* move sign to beginning of string */
8224 u->str[0] = u->str[fill];
8225 u->str[fill] = '0';
8226 }
8227
8228 return (PyObject*) u;
8229}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008230
8231#if 0
8232static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008233unicode_freelistsize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008234{
Christian Heimes2202f872008-02-06 14:31:34 +00008235 return PyLong_FromLong(numfree);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008236}
8237#endif
8238
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008239PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00008240"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008241\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008242Return True if S starts with the specified prefix, False otherwise.\n\
8243With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008244With optional end, stop comparing S at that position.\n\
8245prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008246
8247static PyObject *
8248unicode_startswith(PyUnicodeObject *self,
8249 PyObject *args)
8250{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008251 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008252 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008253 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008254 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008255 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008256
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008257 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00008258 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008259 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008260 if (PyTuple_Check(subobj)) {
8261 Py_ssize_t i;
8262 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8263 substring = (PyUnicodeObject *)PyUnicode_FromObject(
8264 PyTuple_GET_ITEM(subobj, i));
8265 if (substring == NULL)
8266 return NULL;
8267 result = tailmatch(self, substring, start, end, -1);
8268 Py_DECREF(substring);
8269 if (result) {
8270 Py_RETURN_TRUE;
8271 }
8272 }
8273 /* nothing matched */
8274 Py_RETURN_FALSE;
8275 }
8276 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008277 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008278 return NULL;
8279 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008280 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008281 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008282}
8283
8284
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008285PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00008286"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008287\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008288Return True if S ends with the specified suffix, False otherwise.\n\
8289With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008290With optional end, stop comparing S at that position.\n\
8291suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008292
8293static PyObject *
8294unicode_endswith(PyUnicodeObject *self,
8295 PyObject *args)
8296{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008297 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008298 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008299 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008300 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008301 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008302
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008303 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
8304 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008305 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008306 if (PyTuple_Check(subobj)) {
8307 Py_ssize_t i;
8308 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8309 substring = (PyUnicodeObject *)PyUnicode_FromObject(
8310 PyTuple_GET_ITEM(subobj, i));
8311 if (substring == NULL)
8312 return NULL;
8313 result = tailmatch(self, substring, start, end, +1);
8314 Py_DECREF(substring);
8315 if (result) {
8316 Py_RETURN_TRUE;
8317 }
8318 }
8319 Py_RETURN_FALSE;
8320 }
8321 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008322 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008323 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008324
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008325 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008326 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008327 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008328}
8329
Eric Smith8c663262007-08-25 02:26:07 +00008330#include "stringlib/string_format.h"
8331
8332PyDoc_STRVAR(format__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008333"S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008334\n\
8335");
8336
Eric Smith4a7d76d2008-05-30 18:10:19 +00008337static PyObject *
8338unicode__format__(PyObject* self, PyObject* args)
8339{
8340 PyObject *format_spec;
8341
8342 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8343 return NULL;
8344
8345 return _PyUnicode_FormatAdvanced(self,
8346 PyUnicode_AS_UNICODE(format_spec),
8347 PyUnicode_GET_SIZE(format_spec));
8348}
8349
Eric Smith8c663262007-08-25 02:26:07 +00008350PyDoc_STRVAR(p_format__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008351"S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008352\n\
8353");
8354
8355static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008356unicode__sizeof__(PyUnicodeObject *v)
8357{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00008358 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8359 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008360}
8361
8362PyDoc_STRVAR(sizeof__doc__,
8363"S.__sizeof__() -> size of S in memory, in bytes");
8364
8365static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008366unicode_getnewargs(PyUnicodeObject *v)
8367{
8368 return Py_BuildValue("(u#)", v->str, v->length);
8369}
8370
8371
Guido van Rossumd57fd912000-03-10 22:53:23 +00008372static PyMethodDef unicode_methods[] = {
8373
8374 /* Order is according to common usage: often used methods should
8375 appear first, since lookup is done sequentially. */
8376
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008377 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
8378 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8379 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008380 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008381 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8382 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8383 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8384 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8385 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8386 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8387 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008388 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008389 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8390 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8391 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008392 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008393 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8394 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8395 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008396 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008397 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008398 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008399 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008400 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8401 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8402 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8403 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8404 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8405 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8406 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8407 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8408 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8409 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8410 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8411 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8412 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8413 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008414 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00008415 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008416 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008417 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00008418 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008419 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8420 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Georg Brandlceee0772007-11-27 23:48:05 +00008421 {"maketrans", (PyCFunction) unicode_maketrans,
8422 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008423 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008424#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008425 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008426#endif
8427
8428#if 0
8429 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008430 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008431#endif
8432
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008433 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008434 {NULL, NULL}
8435};
8436
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008437static PyObject *
8438unicode_mod(PyObject *v, PyObject *w)
8439{
8440 if (!PyUnicode_Check(v)) {
8441 Py_INCREF(Py_NotImplemented);
8442 return Py_NotImplemented;
8443 }
8444 return PyUnicode_Format(v, w);
8445}
8446
8447static PyNumberMethods unicode_as_number = {
8448 0, /*nb_add*/
8449 0, /*nb_subtract*/
8450 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008451 unicode_mod, /*nb_remainder*/
8452};
8453
Guido van Rossumd57fd912000-03-10 22:53:23 +00008454static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008455 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008456 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008457 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8458 (ssizeargfunc) unicode_getitem, /* sq_item */
Thomas Woutersd2cf20e2007-08-30 22:57:53 +00008459 0, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008460 0, /* sq_ass_item */
8461 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008462 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008463};
8464
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008465static PyObject*
8466unicode_subscript(PyUnicodeObject* self, PyObject* item)
8467{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008468 if (PyIndex_Check(item)) {
8469 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008470 if (i == -1 && PyErr_Occurred())
8471 return NULL;
8472 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008473 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008474 return unicode_getitem(self, i);
8475 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008476 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008477 Py_UNICODE* source_buf;
8478 Py_UNICODE* result_buf;
8479 PyObject* result;
8480
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008481 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008482 &start, &stop, &step, &slicelength) < 0) {
8483 return NULL;
8484 }
8485
8486 if (slicelength <= 0) {
8487 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008488 } else if (start == 0 && step == 1 && slicelength == self->length &&
8489 PyUnicode_CheckExact(self)) {
8490 Py_INCREF(self);
8491 return (PyObject *)self;
8492 } else if (step == 1) {
8493 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008494 } else {
8495 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00008496 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8497 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008498
8499 if (result_buf == NULL)
8500 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008501
8502 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8503 result_buf[i] = source_buf[cur];
8504 }
Tim Petersced69f82003-09-16 20:30:58 +00008505
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008506 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00008507 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008508 return result;
8509 }
8510 } else {
8511 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8512 return NULL;
8513 }
8514}
8515
8516static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008517 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008518 (binaryfunc)unicode_subscript, /* mp_subscript */
8519 (objobjargproc)0, /* mp_ass_subscript */
8520};
8521
Guido van Rossumd57fd912000-03-10 22:53:23 +00008522
Guido van Rossumd57fd912000-03-10 22:53:23 +00008523/* Helpers for PyUnicode_Format() */
8524
8525static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008526getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008527{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008528 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008529 if (argidx < arglen) {
8530 (*p_argidx)++;
8531 if (arglen < 0)
8532 return args;
8533 else
8534 return PyTuple_GetItem(args, argidx);
8535 }
8536 PyErr_SetString(PyExc_TypeError,
8537 "not enough arguments for format string");
8538 return NULL;
8539}
8540
Martin v. Löwis18e16552006-02-15 17:27:45 +00008541static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008542strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008543{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008544 register Py_ssize_t i;
8545 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008546 for (i = len - 1; i >= 0; i--)
8547 buffer[i] = (Py_UNICODE) charbuffer[i];
8548
Guido van Rossumd57fd912000-03-10 22:53:23 +00008549 return len;
8550}
8551
Neal Norwitzfc76d632006-01-10 06:03:13 +00008552static int
8553doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8554{
Tim Peters15231542006-02-16 01:08:01 +00008555 Py_ssize_t result;
8556
Neal Norwitzfc76d632006-01-10 06:03:13 +00008557 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008558 result = strtounicode(buffer, (char *)buffer);
8559 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008560}
8561
#if 0
/* Disabled: integer counterpart of doubletounicode().

   Formats the long `x` with `format` into `buffer` used as char storage
   (via PyOS_snprintf with `len` as the size), then widens the bytes in
   place.  Returns the number of characters produced, narrowed to int. */
static int
longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
{
    char *narrow = (char *)buffer;
    Py_ssize_t nchars;

    PyOS_snprintf(narrow, len, format, x);
    nchars = strtounicode(buffer, narrow);
    return Py_SAFE_DOWNCAST(nchars, Py_ssize_t, int);
}
#endif
Neal Norwitzfc76d632006-01-10 06:03:13 +00008573
Guido van Rossum078151d2002-08-11 04:24:12 +00008574/* XXX To save some code duplication, formatfloat/long/int could have been
8575 shared with stringobject.c, converting from 8-bit to Unicode after the
8576 formatting is done. */
8577
Guido van Rossumd57fd912000-03-10 22:53:23 +00008578static int
8579formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008580 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008581 int flags,
8582 int prec,
8583 int type,
8584 PyObject *v)
8585{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008586 /* fmt = '%#.' + `prec` + `type`
8587 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00008588 char fmt[20];
8589 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008590
Guido van Rossumd57fd912000-03-10 22:53:23 +00008591 x = PyFloat_AsDouble(v);
8592 if (x == -1.0 && PyErr_Occurred())
8593 return -1;
8594 if (prec < 0)
8595 prec = 6;
Eric Smith22b85b32008-07-17 19:18:29 +00008596 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8597 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008598 /* Worst case length calc to ensure no buffer overrun:
8599
8600 'g' formats:
8601 fmt = %#.<prec>g
8602 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8603 for any double rep.)
8604 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8605
8606 'f' formats:
8607 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8608 len = 1 + 50 + 1 + prec = 52 + prec
8609
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008610 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008611 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008612
8613 */
Guido van Rossumb5a755e2007-07-18 18:15:48 +00008614 if (((type == 'g' || type == 'G') &&
8615 buflen <= (size_t)10 + (size_t)prec) ||
Eric Smith22b85b32008-07-17 19:18:29 +00008616 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008617 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008618 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008619 return -1;
8620 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008621 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8622 (flags&F_ALT) ? "#" : "",
8623 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008624 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008625}
8626
Tim Peters38fd5b62000-09-21 05:43:11 +00008627static PyObject*
8628formatlong(PyObject *val, int flags, int prec, int type)
8629{
8630 char *buf;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008631 int len;
Tim Peters38fd5b62000-09-21 05:43:11 +00008632 PyObject *str; /* temporary string object. */
Walter Dörwald63a28be2007-06-20 15:11:12 +00008633 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008634
Christian Heimes72b710a2008-05-26 13:28:38 +00008635 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
Tim Peters38fd5b62000-09-21 05:43:11 +00008636 if (!str)
8637 return NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008638 result = PyUnicode_FromStringAndSize(buf, len);
Tim Peters38fd5b62000-09-21 05:43:11 +00008639 Py_DECREF(str);
Walter Dörwald63a28be2007-06-20 15:11:12 +00008640 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008641}
8642
#if 0
/* Disabled: format a Python integer that fits in a C long into `buf`.

   Returns the number of characters written, or -1 with an exception set
   (conversion failure, or OverflowError when the buffer is too small). */
static int
formatint(Py_UNICODE *buf,
          size_t buflen,
          int flags,
          int prec,
          int type,
          PyObject *v)
{
    /* fmt = '%#.' + `prec` + 'l' + `type`
     * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
     *                     + 1 + 1
     *                   = 24
     */
    char fmt[64];       /* plenty big enough! */
    char *sign;
    int radix_conv;     /* true for the %x, %X and %o conversions */
    long x;

    x = PyLong_AsLong(v);
    if (x == -1 && PyErr_Occurred())
        return -1;

    /* A negative value under %u is formatted as a signed decimal. */
    if (type == 'u' && x < 0)
        type = 'd';

    /* For hex/octal conversions of a negative value the minus sign is put
       into the format text and the magnitude is what gets formatted. */
    radix_conv = (type == 'x' || type == 'X' || type == 'o');
    sign = (radix_conv && x < 0) ? "-" : "";

    if (prec < 0)
        prec = 1;

    /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
     * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
     */
    if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
        PyErr_SetString(PyExc_OverflowError,
                        "formatted integer is too long (precision too large?)");
        return -1;
    }

    if (radix_conv && (flags & F_ALT)) {
        /* %#o, %#x and %#X are troublesome when left to the C library:
         * - %#o needs a different base marker than the one C produces
         * - the C standard omits '0x'/'0X' when the value is 0, which is
         *   inconsistent with other %#x/%#X output and with Python's
         *   hex() function
         * - some platforms violate the standard and do emit '0x'/'0X'
         *   for 0 (Metrowerks, Compaq Tru64)
         * - some platforms emit '0x' under %#X yet follow the standard
         *   for 0 (OS/2 EMX)
         *
         * Consistency is obtained by writing our own prefix into the
         * format and using plain %x/%X/%o instead of the '#' forms.
         *
         * Note that this is the same approach as used in
         * formatint() in stringobject.c
         */
        PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
                      sign, type, prec, type);
    }
    else {
        PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
                      sign, (flags&F_ALT) ? "#" : "",
                      prec, type);
    }

    if (sign[0])
        return longtounicode(buf, buflen, fmt, -x);
    return longtounicode(buf, buflen, fmt, x);
}
#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008720
8721static int
8722formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008723 size_t buflen,
8724 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008725{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008726 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008727 if (PyUnicode_Check(v)) {
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008728 if (PyUnicode_GET_SIZE(v) == 1) {
8729 buf[0] = PyUnicode_AS_UNICODE(v)[0];
8730 buf[1] = '\0';
8731 return 1;
8732 }
8733#ifndef Py_UNICODE_WIDE
8734 if (PyUnicode_GET_SIZE(v) == 2) {
8735 /* Decode a valid surrogate pair */
8736 int c0 = PyUnicode_AS_UNICODE(v)[0];
8737 int c1 = PyUnicode_AS_UNICODE(v)[1];
8738 if (0xD800 <= c0 && c0 <= 0xDBFF &&
8739 0xDC00 <= c1 && c1 <= 0xDFFF) {
8740 buf[0] = c0;
8741 buf[1] = c1;
8742 buf[2] = '\0';
8743 return 2;
8744 }
8745 }
8746#endif
8747 goto onError;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008748 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008749 else {
8750 /* Integer input truncated to a character */
8751 long x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008752 x = PyLong_AsLong(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008753 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008754 goto onError;
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008755
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008756 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008757 PyErr_SetString(PyExc_OverflowError,
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008758 "%c arg not in range(0x110000)");
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008759 return -1;
8760 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008761
8762#ifndef Py_UNICODE_WIDE
8763 if (x > 0xffff) {
8764 x -= 0x10000;
8765 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
8766 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
8767 return 2;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008768 }
8769#endif
8770 buf[0] = (Py_UNICODE) x;
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008771 buf[1] = '\0';
8772 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008773 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008774
8775 onError:
8776 PyErr_SetString(PyExc_TypeError,
8777 "%c requires int or char");
8778 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008779}
8780
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesised so the macro behaves as a single
   operand in any expression (e.g. `sizeof FORMATBUFLEN`).
*/
#define FORMATBUFLEN ((size_t)120)
8790
Guido van Rossumd57fd912000-03-10 22:53:23 +00008791PyObject *PyUnicode_Format(PyObject *format,
8792 PyObject *args)
8793{
8794 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008795 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008796 int args_owned = 0;
8797 PyUnicodeObject *result = NULL;
8798 PyObject *dict = NULL;
8799 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008800
Guido van Rossumd57fd912000-03-10 22:53:23 +00008801 if (format == NULL || args == NULL) {
8802 PyErr_BadInternalCall();
8803 return NULL;
8804 }
8805 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008806 if (uformat == NULL)
8807 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008808 fmt = PyUnicode_AS_UNICODE(uformat);
8809 fmtcnt = PyUnicode_GET_SIZE(uformat);
8810
8811 reslen = rescnt = fmtcnt + 100;
8812 result = _PyUnicode_New(reslen);
8813 if (result == NULL)
8814 goto onError;
8815 res = PyUnicode_AS_UNICODE(result);
8816
8817 if (PyTuple_Check(args)) {
8818 arglen = PyTuple_Size(args);
8819 argidx = 0;
8820 }
8821 else {
8822 arglen = -1;
8823 argidx = -2;
8824 }
Christian Heimes90aa7642007-12-19 02:45:37 +00008825 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00008826 !PyUnicode_Check(args))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008827 dict = args;
8828
8829 while (--fmtcnt >= 0) {
8830 if (*fmt != '%') {
8831 if (--rescnt < 0) {
8832 rescnt = fmtcnt + 100;
8833 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008834 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008835 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008836 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8837 --rescnt;
8838 }
8839 *res++ = *fmt++;
8840 }
8841 else {
8842 /* Got a format specifier */
8843 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008844 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008845 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008846 Py_UNICODE c = '\0';
8847 Py_UNICODE fill;
Christian Heimesa612dc02008-02-24 13:08:18 +00008848 int isnumok;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008849 PyObject *v = NULL;
8850 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008851 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008852 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008853 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008854 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008855
8856 fmt++;
8857 if (*fmt == '(') {
8858 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008859 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008860 PyObject *key;
8861 int pcount = 1;
8862
8863 if (dict == NULL) {
8864 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008865 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008866 goto onError;
8867 }
8868 ++fmt;
8869 --fmtcnt;
8870 keystart = fmt;
8871 /* Skip over balanced parentheses */
8872 while (pcount > 0 && --fmtcnt >= 0) {
8873 if (*fmt == ')')
8874 --pcount;
8875 else if (*fmt == '(')
8876 ++pcount;
8877 fmt++;
8878 }
8879 keylen = fmt - keystart - 1;
8880 if (fmtcnt < 0 || pcount > 0) {
8881 PyErr_SetString(PyExc_ValueError,
8882 "incomplete format key");
8883 goto onError;
8884 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008885#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008886 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008887 then looked up since Python uses strings to hold
8888 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008889 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008890 key = PyUnicode_EncodeUTF8(keystart,
8891 keylen,
8892 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008893#else
8894 key = PyUnicode_FromUnicode(keystart, keylen);
8895#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008896 if (key == NULL)
8897 goto onError;
8898 if (args_owned) {
8899 Py_DECREF(args);
8900 args_owned = 0;
8901 }
8902 args = PyObject_GetItem(dict, key);
8903 Py_DECREF(key);
8904 if (args == NULL) {
8905 goto onError;
8906 }
8907 args_owned = 1;
8908 arglen = -1;
8909 argidx = -2;
8910 }
8911 while (--fmtcnt >= 0) {
8912 switch (c = *fmt++) {
8913 case '-': flags |= F_LJUST; continue;
8914 case '+': flags |= F_SIGN; continue;
8915 case ' ': flags |= F_BLANK; continue;
8916 case '#': flags |= F_ALT; continue;
8917 case '0': flags |= F_ZERO; continue;
8918 }
8919 break;
8920 }
8921 if (c == '*') {
8922 v = getnextarg(args, arglen, &argidx);
8923 if (v == NULL)
8924 goto onError;
Christian Heimes217cfd12007-12-02 14:31:20 +00008925 if (!PyLong_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008926 PyErr_SetString(PyExc_TypeError,
8927 "* wants int");
8928 goto onError;
8929 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008930 width = PyLong_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008931 if (width == -1 && PyErr_Occurred())
8932 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008933 if (width < 0) {
8934 flags |= F_LJUST;
8935 width = -width;
8936 }
8937 if (--fmtcnt >= 0)
8938 c = *fmt++;
8939 }
8940 else if (c >= '0' && c <= '9') {
8941 width = c - '0';
8942 while (--fmtcnt >= 0) {
8943 c = *fmt++;
8944 if (c < '0' || c > '9')
8945 break;
8946 if ((width*10) / 10 != width) {
8947 PyErr_SetString(PyExc_ValueError,
8948 "width too big");
8949 goto onError;
8950 }
8951 width = width*10 + (c - '0');
8952 }
8953 }
8954 if (c == '.') {
8955 prec = 0;
8956 if (--fmtcnt >= 0)
8957 c = *fmt++;
8958 if (c == '*') {
8959 v = getnextarg(args, arglen, &argidx);
8960 if (v == NULL)
8961 goto onError;
Christian Heimes217cfd12007-12-02 14:31:20 +00008962 if (!PyLong_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008963 PyErr_SetString(PyExc_TypeError,
8964 "* wants int");
8965 goto onError;
8966 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008967 prec = PyLong_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008968 if (prec == -1 && PyErr_Occurred())
8969 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008970 if (prec < 0)
8971 prec = 0;
8972 if (--fmtcnt >= 0)
8973 c = *fmt++;
8974 }
8975 else if (c >= '0' && c <= '9') {
8976 prec = c - '0';
8977 while (--fmtcnt >= 0) {
8978 c = Py_CHARMASK(*fmt++);
8979 if (c < '0' || c > '9')
8980 break;
8981 if ((prec*10) / 10 != prec) {
8982 PyErr_SetString(PyExc_ValueError,
8983 "prec too big");
8984 goto onError;
8985 }
8986 prec = prec*10 + (c - '0');
8987 }
8988 }
8989 } /* prec */
8990 if (fmtcnt >= 0) {
8991 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008992 if (--fmtcnt >= 0)
8993 c = *fmt++;
8994 }
8995 }
8996 if (fmtcnt < 0) {
8997 PyErr_SetString(PyExc_ValueError,
8998 "incomplete format");
8999 goto onError;
9000 }
9001 if (c != '%') {
9002 v = getnextarg(args, arglen, &argidx);
9003 if (v == NULL)
9004 goto onError;
9005 }
9006 sign = 0;
9007 fill = ' ';
9008 switch (c) {
9009
9010 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009011 pbuf = formatbuf;
9012 /* presume that buffer length is at least 1 */
9013 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00009014 len = 1;
9015 break;
9016
9017 case 's':
9018 case 'r':
Georg Brandl559e5d72008-06-11 18:37:52 +00009019 case 'a':
Guido van Rossumd57fd912000-03-10 22:53:23 +00009020 if (PyUnicode_Check(v) && c == 's') {
9021 temp = v;
9022 Py_INCREF(temp);
9023 }
9024 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009025 if (c == 's')
Thomas Heller519a0422007-11-15 20:48:54 +00009026 temp = PyObject_Str(v);
Georg Brandl559e5d72008-06-11 18:37:52 +00009027 else if (c == 'r')
Guido van Rossumd57fd912000-03-10 22:53:23 +00009028 temp = PyObject_Repr(v);
Georg Brandl559e5d72008-06-11 18:37:52 +00009029 else
9030 temp = PyObject_ASCII(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009031 if (temp == NULL)
9032 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00009033 if (PyUnicode_Check(temp))
9034 /* nothing to do */;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00009035 else {
9036 Py_DECREF(temp);
9037 PyErr_SetString(PyExc_TypeError,
9038 "%s argument has non-string str()");
9039 goto onError;
9040 }
9041 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009042 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009043 len = PyUnicode_GET_SIZE(temp);
9044 if (prec >= 0 && len > prec)
9045 len = prec;
9046 break;
9047
9048 case 'i':
9049 case 'd':
9050 case 'u':
9051 case 'o':
9052 case 'x':
9053 case 'X':
9054 if (c == 'i')
9055 c = 'd';
Christian Heimesa612dc02008-02-24 13:08:18 +00009056 isnumok = 0;
9057 if (PyNumber_Check(v)) {
9058 PyObject *iobj=NULL;
9059
9060 if (PyLong_Check(v)) {
9061 iobj = v;
9062 Py_INCREF(iobj);
9063 }
9064 else {
9065 iobj = PyNumber_Long(v);
9066 }
9067 if (iobj!=NULL) {
9068 if (PyLong_Check(iobj)) {
9069 isnumok = 1;
9070 temp = formatlong(iobj, flags, prec, c);
9071 Py_DECREF(iobj);
9072 if (!temp)
9073 goto onError;
9074 pbuf = PyUnicode_AS_UNICODE(temp);
9075 len = PyUnicode_GET_SIZE(temp);
9076 sign = 1;
9077 }
9078 else {
9079 Py_DECREF(iobj);
9080 }
9081 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009082 }
Christian Heimesa612dc02008-02-24 13:08:18 +00009083 if (!isnumok) {
9084 PyErr_Format(PyExc_TypeError,
9085 "%%%c format: a number is required, "
Martin v. Löwis5a6f4582008-04-07 03:22:07 +00009086 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
Tim Peters38fd5b62000-09-21 05:43:11 +00009087 goto onError;
Tim Peters38fd5b62000-09-21 05:43:11 +00009088 }
9089 if (flags & F_ZERO)
9090 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00009091 break;
9092
9093 case 'e':
9094 case 'E':
9095 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00009096 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00009097 case 'g':
9098 case 'G':
Eric Smith22b85b32008-07-17 19:18:29 +00009099 if (c == 'F')
9100 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009101 pbuf = formatbuf;
9102 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
9103 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009104 if (len < 0)
9105 goto onError;
9106 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00009107 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009108 fill = '0';
9109 break;
9110
9111 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009112 pbuf = formatbuf;
9113 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009114 if (len < 0)
9115 goto onError;
9116 break;
9117
9118 default:
9119 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00009120 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00009121 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00009122 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00009123 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00009124 (Py_ssize_t)(fmt - 1 -
9125 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00009126 goto onError;
9127 }
9128 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009129 if (*pbuf == '-' || *pbuf == '+') {
9130 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009131 len--;
9132 }
9133 else if (flags & F_SIGN)
9134 sign = '+';
9135 else if (flags & F_BLANK)
9136 sign = ' ';
9137 else
9138 sign = 0;
9139 }
9140 if (width < len)
9141 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00009142 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009143 reslen -= rescnt;
9144 rescnt = width + fmtcnt + 100;
9145 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00009146 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00009147 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00009148 PyErr_NoMemory();
9149 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00009150 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00009151 if (_PyUnicode_Resize(&result, reslen) < 0) {
9152 Py_XDECREF(temp);
9153 goto onError;
9154 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009155 res = PyUnicode_AS_UNICODE(result)
9156 + reslen - rescnt;
9157 }
9158 if (sign) {
9159 if (fill != ' ')
9160 *res++ = sign;
9161 rescnt--;
9162 if (width > len)
9163 width--;
9164 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00009165 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00009166 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00009167 assert(pbuf[1] == c);
9168 if (fill != ' ') {
9169 *res++ = *pbuf++;
9170 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00009171 }
Tim Petersfff53252001-04-12 18:38:48 +00009172 rescnt -= 2;
9173 width -= 2;
9174 if (width < 0)
9175 width = 0;
9176 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00009177 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009178 if (width > len && !(flags & F_LJUST)) {
9179 do {
9180 --rescnt;
9181 *res++ = fill;
9182 } while (--width > len);
9183 }
Tim Peters38fd5b62000-09-21 05:43:11 +00009184 if (fill == ' ') {
9185 if (sign)
9186 *res++ = sign;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00009187 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00009188 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00009189 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00009190 *res++ = *pbuf++;
9191 *res++ = *pbuf++;
9192 }
9193 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009194 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009195 res += len;
9196 rescnt -= len;
9197 while (--width >= len) {
9198 --rescnt;
9199 *res++ = ' ';
9200 }
9201 if (dict && (argidx < arglen) && c != '%') {
9202 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00009203 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009204 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009205 goto onError;
9206 }
9207 Py_XDECREF(temp);
9208 } /* '%' */
9209 } /* until end */
9210 if (argidx < arglen && !dict) {
9211 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00009212 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009213 goto onError;
9214 }
9215
Thomas Woutersa96affe2006-03-12 00:29:36 +00009216 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
9217 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009218 if (args_owned) {
9219 Py_DECREF(args);
9220 }
9221 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009222 return (PyObject *)result;
9223
9224 onError:
9225 Py_XDECREF(result);
9226 Py_DECREF(uformat);
9227 if (args_owned) {
9228 Py_DECREF(args);
9229 }
9230 return NULL;
9231}
9232
Jeremy Hylton938ace62002-07-17 16:30:39 +00009233static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009234unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9235
Tim Peters6d6c1a32001-08-02 04:15:00 +00009236static PyObject *
9237unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9238{
9239 PyObject *x = NULL;
Guido van Rossum55b4a7b2007-07-11 09:28:11 +00009240 static char *kwlist[] = {"object", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00009241 char *encoding = NULL;
9242 char *errors = NULL;
9243
Guido van Rossume023fe02001-08-30 03:12:59 +00009244 if (type != &PyUnicode_Type)
9245 return unicode_subtype_new(type, args, kwds);
Alexandre Vassalotti999679a2008-05-03 04:42:16 +00009246 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Tim Peters6d6c1a32001-08-02 04:15:00 +00009247 kwlist, &x, &encoding, &errors))
9248 return NULL;
9249 if (x == NULL)
9250 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00009251 if (encoding == NULL && errors == NULL)
Thomas Heller519a0422007-11-15 20:48:54 +00009252 return PyObject_Str(x);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00009253 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00009254 return PyUnicode_FromEncodedObject(x, encoding, errors);
9255}
9256
Guido van Rossume023fe02001-08-30 03:12:59 +00009257static PyObject *
9258unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9259{
Tim Petersaf90b3e2001-09-12 05:18:58 +00009260 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009261 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009262
9263 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9264 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9265 if (tmp == NULL)
9266 return NULL;
9267 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00009268 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00009269 if (pnew == NULL) {
9270 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00009271 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00009272 }
Christian Heimesb186d002008-03-18 15:15:01 +00009273 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
Tim Petersaf90b3e2001-09-12 05:18:58 +00009274 if (pnew->str == NULL) {
9275 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00009276 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00009277 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00009278 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00009279 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00009280 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9281 pnew->length = n;
9282 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00009283 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00009284 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009285}
9286
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009287PyDoc_STRVAR(unicode_doc,
Georg Brandl17cb8a82008-05-30 08:20:09 +00009288"str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009289\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009290Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009291encoding defaults to the current default string encoding.\n\
9292errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009293
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009294static PyObject *unicode_iter(PyObject *seq);
9295
Guido van Rossumd57fd912000-03-10 22:53:23 +00009296PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009297 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossum84fc66d2007-05-03 17:18:26 +00009298 "str", /* tp_name */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009299 sizeof(PyUnicodeObject), /* tp_size */
9300 0, /* tp_itemsize */
9301 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00009302 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009303 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009304 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009305 0, /* tp_setattr */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009306 0, /* tp_compare */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009307 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009308 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009309 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009310 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009311 (hashfunc) unicode_hash, /* tp_hash*/
9312 0, /* tp_call*/
9313 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009314 PyObject_GenericGetAttr, /* tp_getattro */
9315 0, /* tp_setattro */
Alexandre Vassalotti70a23712007-10-14 02:05:51 +00009316 0, /* tp_as_buffer */
Thomas Wouters27d517b2007-02-25 20:39:11 +00009317 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
9318 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009319 unicode_doc, /* tp_doc */
9320 0, /* tp_traverse */
9321 0, /* tp_clear */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009322 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009323 0, /* tp_weaklistoffset */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009324 unicode_iter, /* tp_iter */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009325 0, /* tp_iternext */
9326 unicode_methods, /* tp_methods */
9327 0, /* tp_members */
9328 0, /* tp_getset */
Guido van Rossum3172c5d2007-10-16 18:12:55 +00009329 &PyBaseObject_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009330 0, /* tp_dict */
9331 0, /* tp_descr_get */
9332 0, /* tp_descr_set */
9333 0, /* tp_dictoffset */
9334 0, /* tp_init */
9335 0, /* tp_alloc */
9336 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00009337 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009338};
9339
9340/* Initialize the Unicode implementation */
9341
Thomas Wouters78890102000-07-22 19:25:51 +00009342void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009343{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009344 int i;
9345
Thomas Wouters477c8d52006-05-27 19:21:47 +00009346 /* XXX - move this array to unicodectype.c ? */
9347 Py_UNICODE linebreak[] = {
9348 0x000A, /* LINE FEED */
9349 0x000D, /* CARRIAGE RETURN */
9350 0x001C, /* FILE SEPARATOR */
9351 0x001D, /* GROUP SEPARATOR */
9352 0x001E, /* RECORD SEPARATOR */
9353 0x0085, /* NEXT LINE */
9354 0x2028, /* LINE SEPARATOR */
9355 0x2029, /* PARAGRAPH SEPARATOR */
9356 };
9357
Fred Drakee4315f52000-05-09 19:53:39 +00009358 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009359 free_list = NULL;
9360 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009361 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009362 if (!unicode_empty)
9363 return;
9364
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009365 for (i = 0; i < 256; i++)
9366 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009367 if (PyType_Ready(&PyUnicode_Type) < 0)
9368 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009369
9370 /* initialize the linebreak bloom filter */
9371 bloom_linebreak = make_bloom_mask(
9372 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9373 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009374
9375 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009376}
9377
9378/* Finalize the Unicode implementation */
9379
Christian Heimesa156e092008-02-16 07:38:31 +00009380int
9381PyUnicode_ClearFreeList(void)
9382{
9383 int freelist_size = numfree;
9384 PyUnicodeObject *u;
9385
9386 for (u = free_list; u != NULL;) {
9387 PyUnicodeObject *v = u;
9388 u = *(PyUnicodeObject **)u;
9389 if (v->str)
Christian Heimesb186d002008-03-18 15:15:01 +00009390 PyObject_DEL(v->str);
Christian Heimesa156e092008-02-16 07:38:31 +00009391 Py_XDECREF(v->defenc);
9392 PyObject_Del(v);
9393 numfree--;
9394 }
9395 free_list = NULL;
9396 assert(numfree == 0);
9397 return freelist_size;
9398}
9399
Guido van Rossumd57fd912000-03-10 22:53:23 +00009400void
Thomas Wouters78890102000-07-22 19:25:51 +00009401_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009402{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009403 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009404
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009405 Py_XDECREF(unicode_empty);
9406 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009407
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009408 for (i = 0; i < 256; i++) {
9409 if (unicode_latin1[i]) {
9410 Py_DECREF(unicode_latin1[i]);
9411 unicode_latin1[i] = NULL;
9412 }
9413 }
Christian Heimesa156e092008-02-16 07:38:31 +00009414 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009415}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009416
Walter Dörwald16807132007-05-25 13:52:07 +00009417void
9418PyUnicode_InternInPlace(PyObject **p)
9419{
9420 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9421 PyObject *t;
9422 if (s == NULL || !PyUnicode_Check(s))
9423 Py_FatalError(
9424 "PyUnicode_InternInPlace: unicode strings only please!");
9425 /* If it's a subclass, we don't really know what putting
9426 it in the interned dict might do. */
9427 if (!PyUnicode_CheckExact(s))
9428 return;
9429 if (PyUnicode_CHECK_INTERNED(s))
9430 return;
9431 if (interned == NULL) {
9432 interned = PyDict_New();
9433 if (interned == NULL) {
9434 PyErr_Clear(); /* Don't leave an exception */
9435 return;
9436 }
9437 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009438 /* It might be that the GetItem call fails even
9439 though the key is present in the dictionary,
9440 namely when this happens during a stack overflow. */
9441 Py_ALLOW_RECURSION
Walter Dörwald16807132007-05-25 13:52:07 +00009442 t = PyDict_GetItem(interned, (PyObject *)s);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009443 Py_END_ALLOW_RECURSION
9444
Walter Dörwald16807132007-05-25 13:52:07 +00009445 if (t) {
9446 Py_INCREF(t);
9447 Py_DECREF(*p);
9448 *p = t;
9449 return;
9450 }
9451
Martin v. Löwis5b222132007-06-10 09:51:05 +00009452 PyThreadState_GET()->recursion_critical = 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009453 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9454 PyErr_Clear();
Martin v. Löwis5b222132007-06-10 09:51:05 +00009455 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009456 return;
9457 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009458 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009459 /* The two references in interned are not counted by refcnt.
9460 The deallocator will take care of this */
Christian Heimes90aa7642007-12-19 02:45:37 +00009461 Py_REFCNT(s) -= 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009462 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
9463}
9464
9465void
9466PyUnicode_InternImmortal(PyObject **p)
9467{
9468 PyUnicode_InternInPlace(p);
9469 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9470 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9471 Py_INCREF(*p);
9472 }
9473}
9474
9475PyObject *
9476PyUnicode_InternFromString(const char *cp)
9477{
9478 PyObject *s = PyUnicode_FromString(cp);
9479 if (s == NULL)
9480 return NULL;
9481 PyUnicode_InternInPlace(&s);
9482 return s;
9483}
9484
9485void _Py_ReleaseInternedUnicodeStrings(void)
9486{
9487 PyObject *keys;
9488 PyUnicodeObject *s;
9489 Py_ssize_t i, n;
9490 Py_ssize_t immortal_size = 0, mortal_size = 0;
9491
9492 if (interned == NULL || !PyDict_Check(interned))
9493 return;
9494 keys = PyDict_Keys(interned);
9495 if (keys == NULL || !PyList_Check(keys)) {
9496 PyErr_Clear();
9497 return;
9498 }
9499
9500 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9501 detector, interned unicode strings are not forcibly deallocated;
9502 rather, we give them their stolen references back, and then clear
9503 and DECREF the interned dict. */
9504
9505 n = PyList_GET_SIZE(keys);
9506 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
9507 n);
9508 for (i = 0; i < n; i++) {
9509 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9510 switch (s->state) {
9511 case SSTATE_NOT_INTERNED:
9512 /* XXX Shouldn't happen */
9513 break;
9514 case SSTATE_INTERNED_IMMORTAL:
Christian Heimes90aa7642007-12-19 02:45:37 +00009515 Py_REFCNT(s) += 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009516 immortal_size += s->length;
9517 break;
9518 case SSTATE_INTERNED_MORTAL:
Christian Heimes90aa7642007-12-19 02:45:37 +00009519 Py_REFCNT(s) += 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009520 mortal_size += s->length;
9521 break;
9522 default:
9523 Py_FatalError("Inconsistent interned string state.");
9524 }
9525 s->state = SSTATE_NOT_INTERNED;
9526 }
9527 fprintf(stderr, "total size of all interned strings: "
9528 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9529 "mortal/immortal\n", mortal_size, immortal_size);
9530 Py_DECREF(keys);
9531 PyDict_Clear(interned);
9532 Py_DECREF(interned);
9533 interned = NULL;
9534}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009535
9536
9537/********************* Unicode Iterator **************************/
9538
9539typedef struct {
9540 PyObject_HEAD
Guido van Rossum49d6b072006-08-17 21:11:47 +00009541 Py_ssize_t it_index;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009542 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
9543} unicodeiterobject;
9544
9545static void
9546unicodeiter_dealloc(unicodeiterobject *it)
9547{
9548 _PyObject_GC_UNTRACK(it);
9549 Py_XDECREF(it->it_seq);
9550 PyObject_GC_Del(it);
9551}
9552
9553static int
9554unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9555{
9556 Py_VISIT(it->it_seq);
9557 return 0;
9558}
9559
9560static PyObject *
9561unicodeiter_next(unicodeiterobject *it)
9562{
9563 PyUnicodeObject *seq;
9564 PyObject *item;
9565
9566 assert(it != NULL);
9567 seq = it->it_seq;
9568 if (seq == NULL)
9569 return NULL;
9570 assert(PyUnicode_Check(seq));
9571
9572 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009573 item = PyUnicode_FromUnicode(
9574 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009575 if (item != NULL)
9576 ++it->it_index;
9577 return item;
9578 }
9579
9580 Py_DECREF(seq);
9581 it->it_seq = NULL;
9582 return NULL;
9583}
9584
9585static PyObject *
9586unicodeiter_len(unicodeiterobject *it)
9587{
9588 Py_ssize_t len = 0;
9589 if (it->it_seq)
9590 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
Christian Heimes217cfd12007-12-02 14:31:20 +00009591 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009592}
9593
9594PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9595
9596static PyMethodDef unicodeiter_methods[] = {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009597 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
9598 length_hint_doc},
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009599 {NULL, NULL} /* sentinel */
9600};
9601
9602PyTypeObject PyUnicodeIter_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009603 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Christian Heimesf83be4e2007-11-28 09:44:38 +00009604 "str_iterator", /* tp_name */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009605 sizeof(unicodeiterobject), /* tp_basicsize */
9606 0, /* tp_itemsize */
9607 /* methods */
Guido van Rossum49d6b072006-08-17 21:11:47 +00009608 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009609 0, /* tp_print */
9610 0, /* tp_getattr */
9611 0, /* tp_setattr */
9612 0, /* tp_compare */
9613 0, /* tp_repr */
9614 0, /* tp_as_number */
9615 0, /* tp_as_sequence */
9616 0, /* tp_as_mapping */
9617 0, /* tp_hash */
9618 0, /* tp_call */
9619 0, /* tp_str */
9620 PyObject_GenericGetAttr, /* tp_getattro */
9621 0, /* tp_setattro */
9622 0, /* tp_as_buffer */
9623 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9624 0, /* tp_doc */
9625 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9626 0, /* tp_clear */
9627 0, /* tp_richcompare */
9628 0, /* tp_weaklistoffset */
9629 PyObject_SelfIter, /* tp_iter */
9630 (iternextfunc)unicodeiter_next, /* tp_iternext */
9631 unicodeiter_methods, /* tp_methods */
9632 0,
9633};
9634
9635static PyObject *
9636unicode_iter(PyObject *seq)
9637{
9638 unicodeiterobject *it;
9639
9640 if (!PyUnicode_Check(seq)) {
9641 PyErr_BadInternalCall();
9642 return NULL;
9643 }
9644 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9645 if (it == NULL)
9646 return NULL;
9647 it->it_index = 0;
9648 Py_INCREF(seq);
9649 it->it_seq = (PyUnicodeObject *)seq;
9650 _PyObject_GC_TRACK(it);
9651 return (PyObject *)it;
9652}
9653
Martin v. Löwis5b222132007-06-10 09:51:05 +00009654size_t
9655Py_UNICODE_strlen(const Py_UNICODE *u)
9656{
9657 int res = 0;
9658 while(*u++)
9659 res++;
9660 return res;
9661}
9662
9663Py_UNICODE*
9664Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9665{
9666 Py_UNICODE *u = s1;
9667 while ((*u++ = *s2++));
9668 return s1;
9669}
9670
9671Py_UNICODE*
9672Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9673{
9674 Py_UNICODE *u = s1;
9675 while ((*u++ = *s2++))
9676 if (n-- == 0)
9677 break;
9678 return s1;
9679}
9680
9681int
9682Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9683{
9684 while (*s1 && *s2 && *s1 == *s2)
9685 s1++, s2++;
9686 if (*s1 && *s2)
9687 return (*s1 < *s2) ? -1 : +1;
9688 if (*s1)
9689 return 1;
9690 if (*s2)
9691 return -1;
9692 return 0;
9693}
9694
9695Py_UNICODE*
9696Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9697{
9698 const Py_UNICODE *p;
9699 for (p = s; *p; p++)
9700 if (*p == c)
9701 return (Py_UNICODE*)p;
9702 return NULL;
9703}
9704
9705
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009706#ifdef __cplusplus
9707}
9708#endif
9709
9710
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009711/*
9712Local variables:
9713c-basic-offset: 4
9714indent-tabs-mode: nil
9715End:
9716*/