blob: 28b8c66295aae53f3a8346aba7b0c5bab898aeb5 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000049#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000050#include <windows.h>
51#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000052
/* Limit for the Unicode object free list: at most this many deallocated
   exact-type Unicode objects are parked for reuse. */

#define PyUnicode_MAXFREELIST       1024

/* Limit for the Unicode object free list stay alive optimization.

   The implementation keeps the allocated character buffer intact for
   all objects on the free list whose length (in Py_UNICODE units, not
   bytes) is less than this limit.  This reduces malloc() overhead for
   small Unicode objects.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT * sizeof(Py_UNICODE) +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature!  If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Endianness switches; defaults to little endian */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Walter Dörwald16807132007-05-25 13:52:07 +000096/* This dictionary holds all interned unicode strings. Note that references
97 to strings in this dictionary are *not* counted in the string's ob_refcnt.
98 When the interned string reaches a refcnt of 0 the string deallocation
99 function will delete the reference from this dictionary.
100
101 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000102 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000103*/
104static PyObject *interned;
105
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000107static PyUnicodeObject *free_list;
108static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000109
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000110/* The empty Unicode object is shared to improve performance. */
111static PyUnicodeObject *unicode_empty;
112
113/* Single character Unicode strings in the Latin-1 range are being
114 shared as well. */
115static PyUnicodeObject *unicode_latin1[256];
116
Fred Drakee4315f52000-05-09 19:53:39 +0000117/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000118 parameter; it is fixed to "utf-8". Always use the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000119 PyUnicode_GetDefaultEncoding() API to access this global.
120
Alexandre Vassalotti3d2fd7f2007-10-16 00:26:33 +0000121 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000122 hard coded default!
123*/
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000124static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000125
/* Fast detection of the most frequent whitespace characters.

   Lookup table indexed by an ASCII code point (0..127): an entry is 1 when
   the character counts as whitespace and 0 otherwise.  Each row below
   covers eight consecutive code points. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
     *   0x0009: HORIZONTAL TABULATION
     *   0x000A: LINE FEED
     *   0x000B: VERTICAL TABULATION
     *   0x000C: FORM FEED
     *   0x000D: CARRIAGE RETURN
     */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
     *   0x001C: FILE SEPARATOR
     *   0x001D: GROUP SEPARATOR
     *   0x001E: RECORD SEPARATOR
     *   0x001F: UNIT SEPARATOR
     */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27
     *   0x0020: SPACE
     */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
156
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000157static PyObject *unicode_encode_call_errorhandler(const char *errors,
158 PyObject **errorHandler,const char *encoding, const char *reason,
159 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
160 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
161
/* Same for linebreaks.

   Lookup table indexed by an ASCII code point (0..127): an entry is 1 when
   the character is treated as a line break and 0 otherwise.  The table is
   only ever read, hence const.  Each row covers eight code points. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
     *   0x000A: LINE FEED
     *   0x000D: CARRIAGE RETURN
     */
    0, 0, 1, 0, 0, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
     *   0x001C: FILE SEPARATOR
     *   0x001D: GROUP SEPARATOR
     *   0x001E: RECORD SEPARATOR
     */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
187
188
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000189Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000190PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000191{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000192#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000193 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000194#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000195 /* This is actually an illegal character, so it should
196 not be passed to unichr. */
197 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000198#endif
199}
200
Thomas Wouters477c8d52006-05-27 19:21:47 +0000201/* --- Bloom Filters ----------------------------------------------------- */
202
203/* stuff to implement simple "bloom filters" for Unicode characters.
204 to keep things simple, we use a single bitmask, using the least 5
205 bits from each unicode characters as the bit index. */
206
207/* the linebreak mask is set up by Unicode_Init below */
208
Antoine Pitrouf068f942010-01-13 14:19:12 +0000209#if LONG_BIT >= 128
210#define BLOOM_WIDTH 128
211#elif LONG_BIT >= 64
212#define BLOOM_WIDTH 64
213#elif LONG_BIT >= 32
214#define BLOOM_WIDTH 32
215#else
216#error "LONG_BIT is smaller than 32"
217#endif
218
Thomas Wouters477c8d52006-05-27 19:21:47 +0000219#define BLOOM_MASK unsigned long
220
221static BLOOM_MASK bloom_linebreak;
222
Antoine Pitrouf068f942010-01-13 14:19:12 +0000223#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
224#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000225
Benjamin Peterson29060642009-01-31 22:14:21 +0000226#define BLOOM_LINEBREAK(ch) \
227 ((ch) < 128U ? ascii_linebreak[(ch)] : \
228 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000229
230Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
231{
232 /* calculate simple bloom-style bitmask for a given unicode string */
233
Antoine Pitrouf068f942010-01-13 14:19:12 +0000234 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000235 Py_ssize_t i;
236
237 mask = 0;
238 for (i = 0; i < len; i++)
Antoine Pitrouf2c54842010-01-13 08:07:53 +0000239 BLOOM_ADD(mask, ptr[i]);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000240
241 return mask;
242}
243
244Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
245{
246 Py_ssize_t i;
247
248 for (i = 0; i < setlen; i++)
249 if (set[i] == chr)
250 return 1;
251
252 return 0;
253}
254
/* Cheap bloom pre-check followed by the exact unicode_member() scan.
   The expansion is fully parenthesized so that the macro can be negated
   or combined with other operators without changing its meaning.  chr is
   evaluated twice, so it must not have side effects. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                            \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
257
Guido van Rossumd57fd912000-03-10 22:53:23 +0000258/* --- Unicode Object ----------------------------------------------------- */
259
260static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000261int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +0000262 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263{
264 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000265
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000266 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000267 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000268 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000269
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000270 /* Resizing shared object (unicode_empty or single character
271 objects) in-place is not allowed. Use PyUnicode_Resize()
272 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000273
Benjamin Peterson14339b62009-01-31 16:36:08 +0000274 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000275 (unicode->length == 1 &&
276 unicode->str[0] < 256U &&
277 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000278 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000279 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 return -1;
281 }
282
Thomas Wouters477c8d52006-05-27 19:21:47 +0000283 /* We allocate one more byte to make sure the string is Ux0000 terminated.
284 The overallocation is also used by fastsearch, which assumes that it's
285 safe to look at str[length] (without making any assumptions about what
286 it contains). */
287
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000289 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000290 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000292 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000293 PyErr_NoMemory();
294 return -1;
295 }
296 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000297 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000298
Benjamin Peterson29060642009-01-31 22:14:21 +0000299 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000300 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000301 if (unicode->defenc) {
302 Py_DECREF(unicode->defenc);
303 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000304 }
305 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000306
Guido van Rossumd57fd912000-03-10 22:53:23 +0000307 return 0;
308}
309
310/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000311 Ux0000 terminated; some code (e.g. new_identifier)
312 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000313
314 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000315 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000316
317*/
318
319static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000320PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000321{
322 register PyUnicodeObject *unicode;
323
Thomas Wouters477c8d52006-05-27 19:21:47 +0000324 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000325 if (length == 0 && unicode_empty != NULL) {
326 Py_INCREF(unicode_empty);
327 return unicode_empty;
328 }
329
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000330 /* Ensure we won't overflow the size. */
331 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
332 return (PyUnicodeObject *)PyErr_NoMemory();
333 }
334
Guido van Rossumd57fd912000-03-10 22:53:23 +0000335 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000336 if (free_list) {
337 unicode = free_list;
338 free_list = *(PyUnicodeObject **)unicode;
339 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000340 if (unicode->str) {
341 /* Keep-Alive optimization: we only upsize the buffer,
342 never downsize it. */
343 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000344 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000345 PyObject_DEL(unicode->str);
346 unicode->str = NULL;
347 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000348 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000349 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000350 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
351 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000352 }
353 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000354 }
355 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000356 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000357 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000358 if (unicode == NULL)
359 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000360 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
361 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000362 }
363
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000364 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000365 PyErr_NoMemory();
366 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000367 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000368 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000369 * the caller fails before initializing str -- unicode_resize()
370 * reads str[0], and the Keep-Alive optimization can keep memory
371 * allocated for str alive across a call to unicode_dealloc(unicode).
372 * We don't want unicode_resize to read uninitialized memory in
373 * that case.
374 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000375 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000376 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000377 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000378 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000379 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000380 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000381 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000382
Benjamin Peterson29060642009-01-31 22:14:21 +0000383 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000384 /* XXX UNREF/NEWREF interface should be more symmetrical */
385 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000386 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000387 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000388 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389}
390
391static
Guido van Rossum9475a232001-10-05 20:51:39 +0000392void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000393{
Walter Dörwald16807132007-05-25 13:52:07 +0000394 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000395 case SSTATE_NOT_INTERNED:
396 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000397
Benjamin Peterson29060642009-01-31 22:14:21 +0000398 case SSTATE_INTERNED_MORTAL:
399 /* revive dead object temporarily for DelItem */
400 Py_REFCNT(unicode) = 3;
401 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
402 Py_FatalError(
403 "deletion of interned string failed");
404 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000405
Benjamin Peterson29060642009-01-31 22:14:21 +0000406 case SSTATE_INTERNED_IMMORTAL:
407 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000408
Benjamin Peterson29060642009-01-31 22:14:21 +0000409 default:
410 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000411 }
412
Guido van Rossum604ddf82001-12-06 20:03:56 +0000413 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000414 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000415 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000416 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
417 PyObject_DEL(unicode->str);
418 unicode->str = NULL;
419 unicode->length = 0;
420 }
421 if (unicode->defenc) {
422 Py_DECREF(unicode->defenc);
423 unicode->defenc = NULL;
424 }
425 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000426 *(PyUnicodeObject **)unicode = free_list;
427 free_list = unicode;
428 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000429 }
430 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000431 PyObject_DEL(unicode->str);
432 Py_XDECREF(unicode->defenc);
433 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000434 }
435}
436
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000437static
438int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000439{
440 register PyUnicodeObject *v;
441
442 /* Argument checks */
443 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000444 PyErr_BadInternalCall();
445 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000446 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000447 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000448 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000449 PyErr_BadInternalCall();
450 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000451 }
452
453 /* Resizing unicode_empty and single character objects is not
454 possible since these are being shared. We simply return a fresh
455 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000456 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000457 (v == unicode_empty || v->length == 1)) {
458 PyUnicodeObject *w = _PyUnicode_New(length);
459 if (w == NULL)
460 return -1;
461 Py_UNICODE_COPY(w->str, v->str,
462 length < v->length ? length : v->length);
463 Py_DECREF(*unicode);
464 *unicode = w;
465 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000466 }
467
468 /* Note that we don't have to modify *unicode for unshared Unicode
469 objects, since we can modify them in-place. */
470 return unicode_resize(v, length);
471}
472
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000473int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
474{
475 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
476}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000477
Guido van Rossumd57fd912000-03-10 22:53:23 +0000478PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson29060642009-01-31 22:14:21 +0000479 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000480{
481 PyUnicodeObject *unicode;
482
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000483 /* If the Unicode data is known at construction time, we can apply
484 some optimizations which share commonly used objects. */
485 if (u != NULL) {
486
Benjamin Peterson29060642009-01-31 22:14:21 +0000487 /* Optimization for empty strings */
488 if (size == 0 && unicode_empty != NULL) {
489 Py_INCREF(unicode_empty);
490 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000491 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000492
493 /* Single character Unicode objects in the Latin-1 range are
494 shared when using this constructor */
495 if (size == 1 && *u < 256) {
496 unicode = unicode_latin1[*u];
497 if (!unicode) {
498 unicode = _PyUnicode_New(1);
499 if (!unicode)
500 return NULL;
501 unicode->str[0] = *u;
502 unicode_latin1[*u] = unicode;
503 }
504 Py_INCREF(unicode);
505 return (PyObject *)unicode;
506 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000507 }
Tim Petersced69f82003-09-16 20:30:58 +0000508
Guido van Rossumd57fd912000-03-10 22:53:23 +0000509 unicode = _PyUnicode_New(size);
510 if (!unicode)
511 return NULL;
512
513 /* Copy the Unicode data into the new object */
514 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000515 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000516
517 return (PyObject *)unicode;
518}
519
Walter Dörwaldd2034312007-05-18 16:29:38 +0000520PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000521{
522 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000523
Benjamin Peterson14339b62009-01-31 16:36:08 +0000524 if (size < 0) {
525 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000526 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000527 return NULL;
528 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000529
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000530 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000531 some optimizations which share commonly used objects.
532 Also, this means the input must be UTF-8, so fall back to the
533 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000534 if (u != NULL) {
535
Benjamin Peterson29060642009-01-31 22:14:21 +0000536 /* Optimization for empty strings */
537 if (size == 0 && unicode_empty != NULL) {
538 Py_INCREF(unicode_empty);
539 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000540 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000541
542 /* Single characters are shared when using this constructor.
543 Restrict to ASCII, since the input must be UTF-8. */
544 if (size == 1 && Py_CHARMASK(*u) < 128) {
545 unicode = unicode_latin1[Py_CHARMASK(*u)];
546 if (!unicode) {
547 unicode = _PyUnicode_New(1);
548 if (!unicode)
549 return NULL;
550 unicode->str[0] = Py_CHARMASK(*u);
551 unicode_latin1[Py_CHARMASK(*u)] = unicode;
552 }
553 Py_INCREF(unicode);
554 return (PyObject *)unicode;
555 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000556
557 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000558 }
559
Walter Dörwald55507312007-05-18 13:12:10 +0000560 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000561 if (!unicode)
562 return NULL;
563
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000564 return (PyObject *)unicode;
565}
566
Walter Dörwaldd2034312007-05-18 16:29:38 +0000567PyObject *PyUnicode_FromString(const char *u)
568{
569 size_t size = strlen(u);
570 if (size > PY_SSIZE_T_MAX) {
571 PyErr_SetString(PyExc_OverflowError, "input too long");
572 return NULL;
573 }
574
575 return PyUnicode_FromStringAndSize(u, size);
576}
577
Guido van Rossumd57fd912000-03-10 22:53:23 +0000578#ifdef HAVE_WCHAR_H
579
Mark Dickinson081dfee2009-03-18 14:47:41 +0000580#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
581# define CONVERT_WCHAR_TO_SURROGATES
582#endif
583
584#ifdef CONVERT_WCHAR_TO_SURROGATES
585
586/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
587 to convert from UTF32 to UTF16. */
588
589PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
590 Py_ssize_t size)
591{
592 PyUnicodeObject *unicode;
593 register Py_ssize_t i;
594 Py_ssize_t alloc;
595 const wchar_t *orig_w;
596
597 if (w == NULL) {
598 if (size == 0)
599 return PyUnicode_FromStringAndSize(NULL, 0);
600 PyErr_BadInternalCall();
601 return NULL;
602 }
603
604 if (size == -1) {
605 size = wcslen(w);
606 }
607
608 alloc = size;
609 orig_w = w;
610 for (i = size; i > 0; i--) {
611 if (*w > 0xFFFF)
612 alloc++;
613 w++;
614 }
615 w = orig_w;
616 unicode = _PyUnicode_New(alloc);
617 if (!unicode)
618 return NULL;
619
620 /* Copy the wchar_t data into the new object */
621 {
622 register Py_UNICODE *u;
623 u = PyUnicode_AS_UNICODE(unicode);
624 for (i = size; i > 0; i--) {
625 if (*w > 0xFFFF) {
626 wchar_t ordinal = *w++;
627 ordinal -= 0x10000;
628 *u++ = 0xD800 | (ordinal >> 10);
629 *u++ = 0xDC00 | (ordinal & 0x3FF);
630 }
631 else
632 *u++ = *w++;
633 }
634 }
635 return (PyObject *)unicode;
636}
637
638#else
639
Guido van Rossumd57fd912000-03-10 22:53:23 +0000640PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson29060642009-01-31 22:14:21 +0000641 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000642{
643 PyUnicodeObject *unicode;
644
645 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000646 if (size == 0)
647 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000648 PyErr_BadInternalCall();
649 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000650 }
651
Martin v. Löwis790465f2008-04-05 20:41:37 +0000652 if (size == -1) {
653 size = wcslen(w);
654 }
655
Guido van Rossumd57fd912000-03-10 22:53:23 +0000656 unicode = _PyUnicode_New(size);
657 if (!unicode)
658 return NULL;
659
660 /* Copy the wchar_t data into the new object */
661#ifdef HAVE_USABLE_WCHAR_T
662 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000663#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000664 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000665 register Py_UNICODE *u;
666 register Py_ssize_t i;
667 u = PyUnicode_AS_UNICODE(unicode);
668 for (i = size; i > 0; i--)
669 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000670 }
671#endif
672
673 return (PyObject *)unicode;
674}
675
Mark Dickinson081dfee2009-03-18 14:47:41 +0000676#endif /* CONVERT_WCHAR_TO_SURROGATES */
677
678#undef CONVERT_WCHAR_TO_SURROGATES
679
Walter Dörwald346737f2007-05-31 10:44:43 +0000680static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000681makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
682 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +0000683{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000684 *fmt++ = '%';
685 if (width) {
686 if (zeropad)
687 *fmt++ = '0';
688 fmt += sprintf(fmt, "%d", width);
689 }
690 if (precision)
691 fmt += sprintf(fmt, ".%d", precision);
692 if (longflag)
693 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000694 else if (longlongflag) {
695 /* longlongflag should only ever be nonzero on machines with
696 HAVE_LONG_LONG defined */
697#ifdef HAVE_LONG_LONG
698 char *f = PY_FORMAT_LONG_LONG;
699 while (*f)
700 *fmt++ = *f++;
701#else
702 /* we shouldn't ever get here */
703 assert(0);
704 *fmt++ = 'l';
705#endif
706 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000707 else if (size_tflag) {
708 char *f = PY_FORMAT_SIZE_T;
709 while (*f)
710 *fmt++ = *f++;
711 }
712 *fmt++ = c;
713 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000714}
715
/* Append the NUL-terminated byte string `string` to the output position `s`.

   Expects two variables in the expanding scope: `copy` (a const char *
   scratch pointer) and `s` (the output cursor, advanced past the copied
   characters).  Each byte is widened through unsigned char so that bytes
   >= 0x80 are not sign-extended on platforms where plain char is signed.
   The do/while(0) wrapper makes the macro behave as a single statement. */
#define appendstring(string) \
    do { \
        for (copy = (string); *copy; ) \
            *s++ = (unsigned char)*copy++; \
    } while (0)

/* size of fixed-size buffer for formatting single arguments */
#define ITEM_BUFFER_LEN 21
/* maximum number of characters required for output of %ld.  21 characters
   allows for 64-bit integers (in decimal) and an optional sign. */
#define MAX_LONG_CHARS 21
/* maximum number of characters required for output of %lld.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
727
Walter Dörwaldd2034312007-05-18 16:29:38 +0000728PyObject *
729PyUnicode_FromFormatV(const char *format, va_list vargs)
730{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000731 va_list count;
732 Py_ssize_t callcount = 0;
733 PyObject **callresults = NULL;
734 PyObject **callresult = NULL;
735 Py_ssize_t n = 0;
736 int width = 0;
737 int precision = 0;
738 int zeropad;
739 const char* f;
740 Py_UNICODE *s;
741 PyObject *string;
742 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000743 char buffer[ITEM_BUFFER_LEN+1];
Benjamin Peterson14339b62009-01-31 16:36:08 +0000744 /* use abuffer instead of buffer, if we need more space
745 * (which can happen if there's a format specifier with width). */
746 char *abuffer = NULL;
747 char *realbuffer;
748 Py_ssize_t abuffersize = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000749 char fmt[61]; /* should be enough for %0width.precisionlld */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000750 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000751
752#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson14339b62009-01-31 16:36:08 +0000753 Py_MEMCPY(count, vargs, sizeof(va_list));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000754#else
755#ifdef __va_copy
Benjamin Peterson14339b62009-01-31 16:36:08 +0000756 __va_copy(count, vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000757#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000758 count = vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000759#endif
760#endif
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000761 /* step 1: count the number of %S/%R/%A/%s format specifications
762 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
763 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
764 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000765 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000766 if (*f == '%') {
767 if (*(f+1)=='%')
768 continue;
769 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')
770 ++callcount;
771 while (ISDIGIT((unsigned)*f))
772 width = (width*10) + *f++ - '0';
773 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
774 ;
775 if (*f == 's')
776 ++callcount;
777 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000778 }
779 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000780 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000781 if (callcount) {
782 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
783 if (!callresults) {
784 PyErr_NoMemory();
785 return NULL;
786 }
787 callresult = callresults;
788 }
789 /* step 3: figure out how large a buffer we need */
790 for (f = format; *f; f++) {
791 if (*f == '%') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000792#ifdef HAVE_LONG_LONG
793 int longlongflag = 0;
794#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000795 const char* p = f;
796 width = 0;
797 while (ISDIGIT((unsigned)*f))
798 width = (width*10) + *f++ - '0';
799 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
800 ;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000801
Benjamin Peterson14339b62009-01-31 16:36:08 +0000802 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
803 * they don't affect the amount of space we reserve.
804 */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000805 if (*f == 'l') {
806 if (f[1] == 'd' || f[1] == 'u') {
807 ++f;
808 }
809#ifdef HAVE_LONG_LONG
810 else if (f[1] == 'l' &&
811 (f[2] == 'd' || f[2] == 'u')) {
812 longlongflag = 1;
813 f += 2;
814 }
815#endif
816 }
817 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000818 ++f;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000819 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000820
Benjamin Peterson14339b62009-01-31 16:36:08 +0000821 switch (*f) {
822 case 'c':
823 (void)va_arg(count, int);
824 /* fall through... */
825 case '%':
826 n++;
827 break;
828 case 'd': case 'u': case 'i': case 'x':
829 (void) va_arg(count, int);
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000830#ifdef HAVE_LONG_LONG
831 if (longlongflag) {
832 if (width < MAX_LONG_LONG_CHARS)
833 width = MAX_LONG_LONG_CHARS;
834 }
835 else
836#endif
837 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
838 including sign. Decimal takes the most space. This
839 isn't enough for octal. If a width is specified we
840 need more (which we allocate later). */
841 if (width < MAX_LONG_CHARS)
842 width = MAX_LONG_CHARS;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000843 n += width;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000844 /* XXX should allow for large precision here too. */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000845 if (abuffersize < width)
846 abuffersize = width;
847 break;
848 case 's':
849 {
850 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000851 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000852 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
853 if (!str)
854 goto fail;
855 n += PyUnicode_GET_SIZE(str);
856 /* Remember the str and switch to the next slot */
857 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000858 break;
859 }
860 case 'U':
861 {
862 PyObject *obj = va_arg(count, PyObject *);
863 assert(obj && PyUnicode_Check(obj));
864 n += PyUnicode_GET_SIZE(obj);
865 break;
866 }
867 case 'V':
868 {
869 PyObject *obj = va_arg(count, PyObject *);
870 const char *str = va_arg(count, const char *);
871 assert(obj || str);
872 assert(!obj || PyUnicode_Check(obj));
873 if (obj)
874 n += PyUnicode_GET_SIZE(obj);
875 else
876 n += strlen(str);
877 break;
878 }
879 case 'S':
880 {
881 PyObject *obj = va_arg(count, PyObject *);
882 PyObject *str;
883 assert(obj);
884 str = PyObject_Str(obj);
885 if (!str)
886 goto fail;
887 n += PyUnicode_GET_SIZE(str);
888 /* Remember the str and switch to the next slot */
889 *callresult++ = str;
890 break;
891 }
892 case 'R':
893 {
894 PyObject *obj = va_arg(count, PyObject *);
895 PyObject *repr;
896 assert(obj);
897 repr = PyObject_Repr(obj);
898 if (!repr)
899 goto fail;
900 n += PyUnicode_GET_SIZE(repr);
901 /* Remember the repr and switch to the next slot */
902 *callresult++ = repr;
903 break;
904 }
905 case 'A':
906 {
907 PyObject *obj = va_arg(count, PyObject *);
908 PyObject *ascii;
909 assert(obj);
910 ascii = PyObject_ASCII(obj);
911 if (!ascii)
912 goto fail;
913 n += PyUnicode_GET_SIZE(ascii);
914 /* Remember the repr and switch to the next slot */
915 *callresult++ = ascii;
916 break;
917 }
918 case 'p':
919 (void) va_arg(count, int);
920 /* maximum 64-bit pointer representation:
921 * 0xffffffffffffffff
922 * so 19 characters is enough.
923 * XXX I count 18 -- what's the extra for?
924 */
925 n += 19;
926 break;
927 default:
928 /* if we stumble upon an unknown
929 formatting code, copy the rest of
930 the format string to the output
931 string. (we cannot just skip the
932 code, since there's no way to know
933 what's in the argument list) */
934 n += strlen(p);
935 goto expand;
936 }
937 } else
938 n++;
939 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000940 expand:
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000941 if (abuffersize > ITEM_BUFFER_LEN) {
942 /* add 1 for sprintf's trailing null byte */
943 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +0000944 if (!abuffer) {
945 PyErr_NoMemory();
946 goto fail;
947 }
948 realbuffer = abuffer;
949 }
950 else
951 realbuffer = buffer;
952 /* step 4: fill the buffer */
953 /* Since we've analyzed how much space we need for the worst case,
954 we don't have to resize the string.
955 There can be no errors beyond this point. */
956 string = PyUnicode_FromUnicode(NULL, n);
957 if (!string)
958 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000959
Benjamin Peterson14339b62009-01-31 16:36:08 +0000960 s = PyUnicode_AS_UNICODE(string);
961 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000962
Benjamin Peterson14339b62009-01-31 16:36:08 +0000963 for (f = format; *f; f++) {
964 if (*f == '%') {
965 const char* p = f++;
966 int longflag = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000967 int longlongflag = 0;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000968 int size_tflag = 0;
969 zeropad = (*f == '0');
970 /* parse the width.precision part */
971 width = 0;
972 while (ISDIGIT((unsigned)*f))
973 width = (width*10) + *f++ - '0';
974 precision = 0;
975 if (*f == '.') {
976 f++;
977 while (ISDIGIT((unsigned)*f))
978 precision = (precision*10) + *f++ - '0';
979 }
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000980 /* Handle %ld, %lu, %lld and %llu. */
981 if (*f == 'l') {
982 if (f[1] == 'd' || f[1] == 'u') {
983 longflag = 1;
984 ++f;
985 }
986#ifdef HAVE_LONG_LONG
987 else if (f[1] == 'l' &&
988 (f[2] == 'd' || f[2] == 'u')) {
989 longlongflag = 1;
990 f += 2;
991 }
992#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000993 }
994 /* handle the size_t flag. */
995 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
996 size_tflag = 1;
997 ++f;
998 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000999
Benjamin Peterson14339b62009-01-31 16:36:08 +00001000 switch (*f) {
1001 case 'c':
1002 *s++ = va_arg(vargs, int);
1003 break;
1004 case 'd':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001005 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1006 width, precision, 'd');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001007 if (longflag)
1008 sprintf(realbuffer, fmt, va_arg(vargs, long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001009#ifdef HAVE_LONG_LONG
1010 else if (longlongflag)
1011 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1012#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001013 else if (size_tflag)
1014 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1015 else
1016 sprintf(realbuffer, fmt, va_arg(vargs, int));
1017 appendstring(realbuffer);
1018 break;
1019 case 'u':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001020 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1021 width, precision, 'u');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001022 if (longflag)
1023 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001024#ifdef HAVE_LONG_LONG
1025 else if (longlongflag)
1026 sprintf(realbuffer, fmt, va_arg(vargs,
1027 unsigned PY_LONG_LONG));
1028#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001029 else if (size_tflag)
1030 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1031 else
1032 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1033 appendstring(realbuffer);
1034 break;
1035 case 'i':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001036 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001037 sprintf(realbuffer, fmt, va_arg(vargs, int));
1038 appendstring(realbuffer);
1039 break;
1040 case 'x':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001041 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001042 sprintf(realbuffer, fmt, va_arg(vargs, int));
1043 appendstring(realbuffer);
1044 break;
1045 case 's':
1046 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001047 /* unused, since we already have the result */
1048 (void) va_arg(vargs, char *);
1049 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1050 PyUnicode_GET_SIZE(*callresult));
1051 s += PyUnicode_GET_SIZE(*callresult);
1052 /* We're done with the unicode()/repr() => forget it */
1053 Py_DECREF(*callresult);
1054 /* switch to next unicode()/repr() result */
1055 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001056 break;
1057 }
1058 case 'U':
1059 {
1060 PyObject *obj = va_arg(vargs, PyObject *);
1061 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1062 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1063 s += size;
1064 break;
1065 }
1066 case 'V':
1067 {
1068 PyObject *obj = va_arg(vargs, PyObject *);
1069 const char *str = va_arg(vargs, const char *);
1070 if (obj) {
1071 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1072 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1073 s += size;
1074 } else {
1075 appendstring(str);
1076 }
1077 break;
1078 }
1079 case 'S':
1080 case 'R':
1081 {
1082 Py_UNICODE *ucopy;
1083 Py_ssize_t usize;
1084 Py_ssize_t upos;
1085 /* unused, since we already have the result */
1086 (void) va_arg(vargs, PyObject *);
1087 ucopy = PyUnicode_AS_UNICODE(*callresult);
1088 usize = PyUnicode_GET_SIZE(*callresult);
1089 for (upos = 0; upos<usize;)
1090 *s++ = ucopy[upos++];
1091 /* We're done with the unicode()/repr() => forget it */
1092 Py_DECREF(*callresult);
1093 /* switch to next unicode()/repr() result */
1094 ++callresult;
1095 break;
1096 }
1097 case 'p':
1098 sprintf(buffer, "%p", va_arg(vargs, void*));
1099 /* %p is ill-defined: ensure leading 0x. */
1100 if (buffer[1] == 'X')
1101 buffer[1] = 'x';
1102 else if (buffer[1] != 'x') {
1103 memmove(buffer+2, buffer, strlen(buffer)+1);
1104 buffer[0] = '0';
1105 buffer[1] = 'x';
1106 }
1107 appendstring(buffer);
1108 break;
1109 case '%':
1110 *s++ = '%';
1111 break;
1112 default:
1113 appendstring(p);
1114 goto end;
1115 }
1116 } else
1117 *s++ = *f;
1118 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001119
Benjamin Peterson29060642009-01-31 22:14:21 +00001120 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001121 if (callresults)
1122 PyObject_Free(callresults);
1123 if (abuffer)
1124 PyObject_Free(abuffer);
1125 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1126 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001127 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001128 if (callresults) {
1129 PyObject **callresult2 = callresults;
1130 while (callresult2 < callresult) {
1131 Py_DECREF(*callresult2);
1132 ++callresult2;
1133 }
1134 PyObject_Free(callresults);
1135 }
1136 if (abuffer)
1137 PyObject_Free(abuffer);
1138 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001139}
1140
1141#undef appendstring
1142
1143PyObject *
1144PyUnicode_FromFormat(const char *format, ...)
1145{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001146 PyObject* ret;
1147 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001148
1149#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001150 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001151#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001152 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001153#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001154 ret = PyUnicode_FromFormatV(format, vargs);
1155 va_end(vargs);
1156 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001157}
1158
Martin v. Löwis18e16552006-02-15 17:27:45 +00001159Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001160 wchar_t *w,
1161 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001162{
1163 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001164 PyErr_BadInternalCall();
1165 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001166 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001167
1168 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001169 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00001170 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001171
Guido van Rossumd57fd912000-03-10 22:53:23 +00001172#ifdef HAVE_USABLE_WCHAR_T
1173 memcpy(w, unicode->str, size * sizeof(wchar_t));
1174#else
1175 {
Benjamin Peterson29060642009-01-31 22:14:21 +00001176 register Py_UNICODE *u;
1177 register Py_ssize_t i;
1178 u = PyUnicode_AS_UNICODE(unicode);
1179 for (i = size; i > 0; i--)
1180 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001181 }
1182#endif
1183
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001184 if (size > PyUnicode_GET_SIZE(unicode))
1185 return PyUnicode_GET_SIZE(unicode);
1186 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001187 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001188}
1189
1190#endif
1191
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001192PyObject *PyUnicode_FromOrdinal(int ordinal)
1193{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001194 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001195
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001196 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001197 PyErr_SetString(PyExc_ValueError,
1198 "chr() arg not in range(0x110000)");
1199 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001200 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001201
1202#ifndef Py_UNICODE_WIDE
1203 if (ordinal > 0xffff) {
1204 ordinal -= 0x10000;
1205 s[0] = 0xD800 | (ordinal >> 10);
1206 s[1] = 0xDC00 | (ordinal & 0x3FF);
1207 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001208 }
1209#endif
1210
Hye-Shik Chang40574832004-04-06 07:24:51 +00001211 s[0] = (Py_UNICODE)ordinal;
1212 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001213}
1214
Guido van Rossumd57fd912000-03-10 22:53:23 +00001215PyObject *PyUnicode_FromObject(register PyObject *obj)
1216{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001217 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001218 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001219 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001220 Py_INCREF(obj);
1221 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001222 }
1223 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001224 /* For a Unicode subtype that's not a Unicode object,
1225 return a true Unicode object with the same data. */
1226 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1227 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001228 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001229 PyErr_Format(PyExc_TypeError,
1230 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001231 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001232 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001233}
1234
1235PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00001236 const char *encoding,
1237 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001238{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001239 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001240 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001241 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001242
Guido van Rossumd57fd912000-03-10 22:53:23 +00001243 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001244 PyErr_BadInternalCall();
1245 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001246 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001247
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001248 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001249 PyErr_SetString(PyExc_TypeError,
1250 "decoding str is not supported");
1251 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001252 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001253
1254 /* Coerce object */
Christian Heimes72b710a2008-05-26 13:28:38 +00001255 if (PyBytes_Check(obj)) {
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001256 s = PyBytes_AS_STRING(obj);
1257 len = PyBytes_GET_SIZE(obj);
1258 }
1259 else if (PyByteArray_Check(obj)) {
1260 s = PyByteArray_AS_STRING(obj);
1261 len = PyByteArray_GET_SIZE(obj);
1262 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001263 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001264 /* Overwrite the error message with something more useful in
1265 case of a TypeError. */
1266 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001267 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001268 "coercing to str: need string or buffer, "
1269 "%.80s found",
1270 Py_TYPE(obj)->tp_name);
1271 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001272 }
Tim Petersced69f82003-09-16 20:30:58 +00001273
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001274 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001275 if (len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001276 Py_INCREF(unicode_empty);
1277 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001278 }
Tim Petersced69f82003-09-16 20:30:58 +00001279 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001280 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001281
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001282 return v;
1283
Benjamin Peterson29060642009-01-31 22:14:21 +00001284 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001285 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001286}
1287
1288PyObject *PyUnicode_Decode(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001289 Py_ssize_t size,
1290 const char *encoding,
1291 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001292{
1293 PyObject *buffer = NULL, *unicode;
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001294 Py_buffer info;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001295 char lower[20]; /* Enough for any encoding name we recognize */
1296 char *l;
1297 const char *e;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001298
1299 if (encoding == NULL)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001300 encoding = PyUnicode_GetDefaultEncoding();
1301
1302 /* Convert encoding to lower case and replace '_' with '-' in order to
1303 catch e.g. UTF_8 */
1304 e = encoding;
1305 l = lower;
1306 while (*e && l < &lower[(sizeof lower) - 2]) {
1307 if (ISUPPER(*e)) {
1308 *l++ = TOLOWER(*e++);
1309 }
1310 else if (*e == '_') {
1311 *l++ = '-';
1312 e++;
1313 }
1314 else {
1315 *l++ = *e++;
1316 }
1317 }
1318 *l = '\0';
Fred Drakee4315f52000-05-09 19:53:39 +00001319
1320 /* Shortcuts for common default encodings */
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001321 if (strcmp(lower, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001322 return PyUnicode_DecodeUTF8(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001323 else if ((strcmp(lower, "latin-1") == 0) ||
1324 (strcmp(lower, "iso-8859-1") == 0))
Fred Drakee4315f52000-05-09 19:53:39 +00001325 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001326#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001327 else if (strcmp(lower, "mbcs") == 0)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001328 return PyUnicode_DecodeMBCS(s, size, errors);
1329#endif
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001330 else if (strcmp(lower, "ascii") == 0)
Fred Drakee4315f52000-05-09 19:53:39 +00001331 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001332 else if (strcmp(lower, "utf-16") == 0)
1333 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1334 else if (strcmp(lower, "utf-32") == 0)
1335 return PyUnicode_DecodeUTF32(s, size, errors, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001336
1337 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001338 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001339 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001340 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001341 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001342 if (buffer == NULL)
1343 goto onError;
1344 unicode = PyCodec_Decode(buffer, encoding, errors);
1345 if (unicode == NULL)
1346 goto onError;
1347 if (!PyUnicode_Check(unicode)) {
1348 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001349 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001350 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001351 Py_DECREF(unicode);
1352 goto onError;
1353 }
1354 Py_DECREF(buffer);
1355 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001356
Benjamin Peterson29060642009-01-31 22:14:21 +00001357 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001358 Py_XDECREF(buffer);
1359 return NULL;
1360}
1361
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001362PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1363 const char *encoding,
1364 const char *errors)
1365{
1366 PyObject *v;
1367
1368 if (!PyUnicode_Check(unicode)) {
1369 PyErr_BadArgument();
1370 goto onError;
1371 }
1372
1373 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001374 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001375
1376 /* Decode via the codec registry */
1377 v = PyCodec_Decode(unicode, encoding, errors);
1378 if (v == NULL)
1379 goto onError;
1380 return v;
1381
Benjamin Peterson29060642009-01-31 22:14:21 +00001382 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001383 return NULL;
1384}
1385
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001386PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1387 const char *encoding,
1388 const char *errors)
1389{
1390 PyObject *v;
1391
1392 if (!PyUnicode_Check(unicode)) {
1393 PyErr_BadArgument();
1394 goto onError;
1395 }
1396
1397 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001398 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001399
1400 /* Decode via the codec registry */
1401 v = PyCodec_Decode(unicode, encoding, errors);
1402 if (v == NULL)
1403 goto onError;
1404 if (!PyUnicode_Check(v)) {
1405 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001406 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001407 Py_TYPE(v)->tp_name);
1408 Py_DECREF(v);
1409 goto onError;
1410 }
1411 return v;
1412
Benjamin Peterson29060642009-01-31 22:14:21 +00001413 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001414 return NULL;
1415}
1416
Guido van Rossumd57fd912000-03-10 22:53:23 +00001417PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001418 Py_ssize_t size,
1419 const char *encoding,
1420 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001421{
1422 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001423
Guido van Rossumd57fd912000-03-10 22:53:23 +00001424 unicode = PyUnicode_FromUnicode(s, size);
1425 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001426 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001427 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1428 Py_DECREF(unicode);
1429 return v;
1430}
1431
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001432PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1433 const char *encoding,
1434 const char *errors)
1435{
1436 PyObject *v;
1437
1438 if (!PyUnicode_Check(unicode)) {
1439 PyErr_BadArgument();
1440 goto onError;
1441 }
1442
1443 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001444 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001445
1446 /* Encode via the codec registry */
1447 v = PyCodec_Encode(unicode, encoding, errors);
1448 if (v == NULL)
1449 goto onError;
1450 return v;
1451
Benjamin Peterson29060642009-01-31 22:14:21 +00001452 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001453 return NULL;
1454}
1455
Guido van Rossumd57fd912000-03-10 22:53:23 +00001456PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1457 const char *encoding,
1458 const char *errors)
1459{
1460 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001461
Guido van Rossumd57fd912000-03-10 22:53:23 +00001462 if (!PyUnicode_Check(unicode)) {
1463 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001464 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001465 }
Fred Drakee4315f52000-05-09 19:53:39 +00001466
Tim Petersced69f82003-09-16 20:30:58 +00001467 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001468 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001469
1470 /* Shortcuts for common default encodings */
1471 if (errors == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001472 if (strcmp(encoding, "utf-8") == 0)
1473 return PyUnicode_AsUTF8String(unicode);
1474 else if (strcmp(encoding, "latin-1") == 0)
1475 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001476#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Benjamin Peterson29060642009-01-31 22:14:21 +00001477 else if (strcmp(encoding, "mbcs") == 0)
1478 return PyUnicode_AsMBCSString(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001479#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00001480 else if (strcmp(encoding, "ascii") == 0)
1481 return PyUnicode_AsASCIIString(unicode);
Christian Heimes6a27efa2008-10-30 21:48:26 +00001482 /* During bootstrap, we may need to find the encodings
1483 package, to load the file system encoding, and require the
1484 file system encoding in order to load the encodings
1485 package.
1486
1487 Break out of this dependency by assuming that the path to
1488 the encodings module is ASCII-only. XXX could try wcstombs
1489 instead, if the file system encoding is the locale's
1490 encoding. */
1491 else if (Py_FileSystemDefaultEncoding &&
1492 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1493 !PyThreadState_GET()->interp->codecs_initialized)
Benjamin Peterson29060642009-01-31 22:14:21 +00001494 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001495 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001496
1497 /* Encode via the codec registry */
1498 v = PyCodec_Encode(unicode, encoding, errors);
1499 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001500 return NULL;
1501
1502 /* The normal path */
1503 if (PyBytes_Check(v))
1504 return v;
1505
1506 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001507 if (PyByteArray_Check(v)) {
1508 char msg[100];
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001509 PyObject *b;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001510 PyOS_snprintf(msg, sizeof(msg),
1511 "encoder %s returned buffer instead of bytes",
1512 encoding);
1513 if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001514 Py_DECREF(v);
1515 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001516 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001517
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001518 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1519 Py_DECREF(v);
1520 return b;
1521 }
1522
1523 PyErr_Format(PyExc_TypeError,
1524 "encoder did not return a bytes object (type=%.400s)",
1525 Py_TYPE(v)->tp_name);
1526 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001527 return NULL;
1528}
1529
1530PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1531 const char *encoding,
1532 const char *errors)
1533{
1534 PyObject *v;
1535
1536 if (!PyUnicode_Check(unicode)) {
1537 PyErr_BadArgument();
1538 goto onError;
1539 }
1540
1541 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001542 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001543
1544 /* Encode via the codec registry */
1545 v = PyCodec_Encode(unicode, encoding, errors);
1546 if (v == NULL)
1547 goto onError;
1548 if (!PyUnicode_Check(v)) {
1549 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001550 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001551 Py_TYPE(v)->tp_name);
1552 Py_DECREF(v);
1553 goto onError;
1554 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001555 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001556
Benjamin Peterson29060642009-01-31 22:14:21 +00001557 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001558 return NULL;
1559}
1560
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001561PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001562 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001563{
1564 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001565 if (v)
1566 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001567 if (errors != NULL)
1568 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001569 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001570 PyUnicode_GET_SIZE(unicode),
1571 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001572 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001573 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001574 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001575 return v;
1576}
1577
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001578PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001579PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001580 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001581 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1582}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001583
Christian Heimes5894ba72007-11-04 11:43:14 +00001584PyObject*
1585PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1586{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001587 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1588 can be undefined. If it is case, decode using UTF-8. The following assumes
1589 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1590 bootstrapping process where the codecs aren't ready yet.
1591 */
1592 if (Py_FileSystemDefaultEncoding) {
1593#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001594 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001595 return PyUnicode_DecodeMBCS(s, size, "replace");
1596 }
1597#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001598 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001599 return PyUnicode_DecodeUTF8(s, size, "replace");
1600 }
1601#endif
1602 return PyUnicode_Decode(s, size,
1603 Py_FileSystemDefaultEncoding,
1604 "replace");
1605 }
1606 else {
1607 return PyUnicode_DecodeUTF8(s, size, "replace");
1608 }
1609}
1610
Martin v. Löwis011e8422009-05-05 04:43:17 +00001611/* Convert the argument to a bytes object, according to the file
Gregory P. Smithcc47d8c2010-02-27 08:33:11 +00001612 system encoding. The addr param must be a PyObject**.
1613 This is designed to be used with "O&" in PyArg_Parse APIs. */
Martin v. Löwis011e8422009-05-05 04:43:17 +00001614
1615int
1616PyUnicode_FSConverter(PyObject* arg, void* addr)
1617{
1618 PyObject *output = NULL;
1619 Py_ssize_t size;
1620 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001621 if (arg == NULL) {
1622 Py_DECREF(*(PyObject**)addr);
1623 return 1;
1624 }
Martin v. Löwis011e8422009-05-05 04:43:17 +00001625 if (PyBytes_Check(arg) || PyByteArray_Check(arg)) {
1626 output = arg;
1627 Py_INCREF(output);
1628 }
1629 else {
1630 arg = PyUnicode_FromObject(arg);
1631 if (!arg)
1632 return 0;
1633 output = PyUnicode_AsEncodedObject(arg,
1634 Py_FileSystemDefaultEncoding,
Martin v. Löwis43c57782009-05-10 08:15:24 +00001635 "surrogateescape");
Martin v. Löwis011e8422009-05-05 04:43:17 +00001636 Py_DECREF(arg);
1637 if (!output)
1638 return 0;
1639 if (!PyBytes_Check(output)) {
1640 Py_DECREF(output);
1641 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1642 return 0;
1643 }
1644 }
1645 if (PyBytes_Check(output)) {
1646 size = PyBytes_GET_SIZE(output);
1647 data = PyBytes_AS_STRING(output);
1648 }
1649 else {
1650 size = PyByteArray_GET_SIZE(output);
1651 data = PyByteArray_AS_STRING(output);
1652 }
1653 if (size != strlen(data)) {
1654 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1655 Py_DECREF(output);
1656 return 0;
1657 }
1658 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001659 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001660}
1661
1662
Martin v. Löwis5b222132007-06-10 09:51:05 +00001663char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001664_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001665{
Christian Heimesf3863112007-11-22 07:46:41 +00001666 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001667 if (!PyUnicode_Check(unicode)) {
1668 PyErr_BadArgument();
1669 return NULL;
1670 }
Christian Heimesf3863112007-11-22 07:46:41 +00001671 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1672 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001673 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001674 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001675 *psize = PyBytes_GET_SIZE(bytes);
1676 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001677}
1678
1679char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001680_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001681{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001682 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001683}
1684
Guido van Rossumd57fd912000-03-10 22:53:23 +00001685Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1686{
1687 if (!PyUnicode_Check(unicode)) {
1688 PyErr_BadArgument();
1689 goto onError;
1690 }
1691 return PyUnicode_AS_UNICODE(unicode);
1692
Benjamin Peterson29060642009-01-31 22:14:21 +00001693 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001694 return NULL;
1695}
1696
Martin v. Löwis18e16552006-02-15 17:27:45 +00001697Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001698{
1699 if (!PyUnicode_Check(unicode)) {
1700 PyErr_BadArgument();
1701 goto onError;
1702 }
1703 return PyUnicode_GET_SIZE(unicode);
1704
Benjamin Peterson29060642009-01-31 22:14:21 +00001705 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001706 return -1;
1707}
1708
Thomas Wouters78890102000-07-22 19:25:51 +00001709const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001710{
1711 return unicode_default_encoding;
1712}
1713
1714int PyUnicode_SetDefaultEncoding(const char *encoding)
1715{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001716 if (strcmp(encoding, unicode_default_encoding) != 0) {
1717 PyErr_Format(PyExc_ValueError,
1718 "Can only set default encoding to %s",
1719 unicode_default_encoding);
1720 return -1;
1721 }
Fred Drakee4315f52000-05-09 19:53:39 +00001722 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001723}
1724
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001725/* error handling callback helper:
1726 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001727 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001728 and adjust various state variables.
1729 return 0 on success, -1 on error
1730*/
1731
1732static
1733int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00001734 const char *encoding, const char *reason,
1735 const char **input, const char **inend, Py_ssize_t *startinpos,
1736 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1737 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001738{
Benjamin Peterson142957c2008-07-04 19:55:29 +00001739 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001740
1741 PyObject *restuple = NULL;
1742 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001743 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001744 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001745 Py_ssize_t requiredsize;
1746 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001747 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001748 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001749 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001750 int res = -1;
1751
1752 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001753 *errorHandler = PyCodec_LookupError(errors);
1754 if (*errorHandler == NULL)
1755 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001756 }
1757
1758 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001759 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00001760 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
1761 if (*exceptionObject == NULL)
1762 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001763 }
1764 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00001765 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1766 goto onError;
1767 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1768 goto onError;
1769 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1770 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001771 }
1772
1773 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1774 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001775 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001776 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00001777 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00001778 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001779 }
1780 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00001781 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001782
1783 /* Copy back the bytes variables, which might have been modified by the
1784 callback */
1785 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1786 if (!inputobj)
1787 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00001788 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001789 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00001790 }
Christian Heimes72b710a2008-05-26 13:28:38 +00001791 *input = PyBytes_AS_STRING(inputobj);
1792 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001793 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001794 /* we can DECREF safely, as the exception has another reference,
1795 so the object won't go away. */
1796 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001797
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001798 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001799 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001800 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001801 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1802 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001803 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001804
1805 /* need more space? (at least enough for what we
1806 have+the replacement+the rest of the string (starting
1807 at the new input position), so we won't have to check space
1808 when there are no errors in the rest of the string) */
1809 repptr = PyUnicode_AS_UNICODE(repunicode);
1810 repsize = PyUnicode_GET_SIZE(repunicode);
1811 requiredsize = *outpos + repsize + insize-newpos;
1812 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001813 if (requiredsize<2*outsize)
1814 requiredsize = 2*outsize;
1815 if (_PyUnicode_Resize(output, requiredsize) < 0)
1816 goto onError;
1817 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001818 }
1819 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001820 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001821 Py_UNICODE_COPY(*outptr, repptr, repsize);
1822 *outptr += repsize;
1823 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001824
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001825 /* we made it! */
1826 res = 0;
1827
Benjamin Peterson29060642009-01-31 22:14:21 +00001828 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001829 Py_XDECREF(restuple);
1830 return res;
1831}
1832
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001833/* --- UTF-7 Codec -------------------------------------------------------- */
1834
Antoine Pitrou244651a2009-05-04 18:56:13 +00001835/* See RFC2152 for details. We encode conservatively and decode liberally. */
1836
/* Base-64 helper macros for the UTF-7 codec.  Each one may evaluate its
   argument several times, so pass side-effect-free expressions only. */

/* IS_BASE64: non-zero iff c is one of the 64 base-64 alphabet characters
   (A-Z, a-z, 0-9, '+', '/'). */

#define IS_BASE64(c)                        \
    (!(((c) < 'A' || (c) > 'Z') &&          \
       ((c) < 'a' || (c) > 'z') &&          \
       ((c) < '0' || (c) > '9') &&          \
       (c) != '+' && (c) != '/'))

/* FROM_BASE64: the 6-bit value of c, which must already be known to be a
   base-64 character: A-Z -> 0..25, a-z -> 26..51, 0-9 -> 52..61,
   '+' -> 62 and anything else (i.e. '/') -> 63. */

#define FROM_BASE64(c)                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 : 63)

/* TO_BASE64: the base-64 character for the bottom 6 bits of n. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: true if this byte, met outside a shift sequence in a
 * UTF-7 string, decodes as itself.  Decoding is permissive: every ASCII
 * byte qualifies except '+', which opens a base64 section. */

#define DECODE_DIRECT(c)                    \
    (!((c) > 127 || (c) == '+'))
1867
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read (through ENCODE_DIRECT), hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        -- character code; values outside 1..127 are never direct
 *             (the range test also guards the table lookup).
 * directO  -- non-zero: Set O characters are emitted as themselves.
 * directWS -- non-zero: whitespace characters are emitted as themselves.
 *
 * The flag arguments are parenthesized so that expressions with lower
 * precedence than && (e.g. `a || b`) keep their intended grouping.  The
 * macro may evaluate c several times. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001913
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001914PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001915 Py_ssize_t size,
1916 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001917{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001918 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1919}
1920
Antoine Pitrou244651a2009-05-04 18:56:13 +00001921/* The decoder. The only state we preserve is our read position,
1922 * i.e. how many characters we have consumed. So if we end in the
1923 * middle of a shift sequence we have to back off the read position
1924 * and the output to the beginning of the sequence, otherwise we lose
1925 * all the shift state (seen bits, number of bits seen, high
1926 * surrogate). */
1927
/* Parameters:
 *   s, size  -- input bytes and their count.
 *   errors   -- error handler name, passed to
 *               unicode_decode_call_errorhandler() on malformed input.
 *   consumed -- if non-NULL, receives the number of input bytes decoded;
 *               when the input ends inside a shift sequence the output is
 *               rolled back to the start of that sequence and *consumed is
 *               the position of its opening '+'.  If NULL, an input that
 *               ends inside an inconsistent shift sequence is reported as
 *               "unterminated shift sequence".
 * Returns the decoded object, or NULL on error (the partially built
 * object is released on that path). */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001928PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001929 Py_ssize_t size,
1930 const char *errors,
1931 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001932{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001933 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001934 Py_ssize_t startinpos;
1935 Py_ssize_t endinpos;
1936 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001937 const char *e;
1938 PyUnicodeObject *unicode;
1939 Py_UNICODE *p;
1940 const char *errmsg = "";
1941 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001942 Py_UNICODE *shiftOutStart;
1943 unsigned int base64bits = 0;
1944 unsigned long base64buffer = 0;
1945 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001946 PyObject *errorHandler = NULL;
1947 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001948
/* Start with an output object of `size` elements; the error handler
   helper may resize it, and it is trimmed to the final length at the end. */
1949 unicode = _PyUnicode_New(size);
1950 if (!unicode)
1951 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001952 if (size == 0) {
1953 if (consumed)
1954 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001955 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001956 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001957
1958 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001959 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001960 e = s + size;
1961
1962 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001963 Py_UNICODE ch;
/* `restart` is also the target of a goto from the end-of-string error
   handling below, which re-enters the loop body directly. */
Benjamin Peterson29060642009-01-31 22:14:21 +00001964 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00001965 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001966
Antoine Pitrou244651a2009-05-04 18:56:13 +00001967 if (inShift) { /* in a base-64 section */
1968 if (IS_BASE64(ch)) { /* consume a base-64 character */
/* Each base-64 character contributes 6 bits; a 16-bit unit is extracted
   from the top of the accumulator as soon as 16 bits are available. */
1969 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1970 base64bits += 6;
1971 s++;
1972 if (base64bits >= 16) {
1973 /* we have enough bits for a UTF-16 value */
1974 Py_UNICODE outCh = (Py_UNICODE)
1975 (base64buffer >> (base64bits-16));
1976 base64bits -= 16;
1977 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
1978 if (surrogate) {
1979 /* expecting a second surrogate */
1980 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
/* Wide builds combine the pair into one code point; narrow builds store
   the two surrogates as separate units. */
1981#ifdef Py_UNICODE_WIDE
1982 *p++ = (((surrogate & 0x3FF)<<10)
1983 | (outCh & 0x3FF)) + 0x10000;
1984#else
1985 *p++ = surrogate;
1986 *p++ = outCh;
1987#endif
1988 surrogate = 0;
1989 }
1990 else {
1991 surrogate = 0;
1992 errmsg = "second surrogate missing";
1993 goto utf7Error;
1994 }
1995 }
1996 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
1997 /* first surrogate */
1998 surrogate = outCh;
1999 }
2000 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2001 errmsg = "unexpected second surrogate";
2002 goto utf7Error;
2003 }
2004 else {
2005 *p++ = outCh;
2006 }
2007 }
2008 }
2009 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002010 inShift = 0;
2011 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002012 if (surrogate) {
2013 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00002014 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002015 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002016 if (base64bits > 0) { /* left-over bits */
2017 if (base64bits >= 6) {
2018 /* We've seen at least one base-64 character */
2019 errmsg = "partial character in shift sequence";
2020 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002021 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002022 else {
2023 /* Some bits remain; they should be zero */
2024 if (base64buffer != 0) {
2025 errmsg = "non-zero padding bits in shift sequence";
2026 goto utf7Error;
2027 }
2028 }
2029 }
2030 if (ch != '-') {
2031 /* '-' is absorbed; other terminating
2032 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002033 *p++ = ch;
2034 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002035 }
2036 }
2037 else if ( ch == '+' ) {
/* Remember where the shift sequence starts in both input (startinpos)
   and output (shiftOutStart) so it can be reported or rolled back. */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002038 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002039 s++; /* consume '+' */
2040 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002041 s++;
2042 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00002043 }
2044 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002045 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002046 shiftOutStart = p;
2047 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002048 }
2049 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002050 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002051 *p++ = ch;
2052 s++;
2053 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002054 else {
2055 startinpos = s-starts;
2056 s++;
2057 errmsg = "unexpected special character";
2058 goto utf7Error;
2059 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002060 continue;
/* Shared error exit for the loop body: only reached via goto.  The helper
   updates starts/e/s and the output cursor p; decoding then continues with
   the next loop iteration. */
Antoine Pitrou244651a2009-05-04 18:56:13 +00002061utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002062 outpos = p-PyUnicode_AS_UNICODE(unicode);
2063 endinpos = s-starts;
2064 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00002065 errors, &errorHandler,
2066 "utf7", errmsg,
2067 &starts, &e, &startinpos, &endinpos, &exc, &s,
2068 &unicode, &outpos, &p))
2069 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002070 }
2071
Antoine Pitrou244651a2009-05-04 18:56:13 +00002072 /* end of string */
2073
2074 if (inShift && !consumed) { /* in shift sequence, no more to follow */
2075 /* if we're in an inconsistent state, that's an error */
2076 if (surrogate ||
2077 (base64bits >= 6) ||
2078 (base64bits > 0 && base64buffer != 0)) {
2079 outpos = p-PyUnicode_AS_UNICODE(unicode);
2080 endinpos = size;
2081 if (unicode_decode_call_errorhandler(
2082 errors, &errorHandler,
2083 "utf7", "unterminated shift sequence",
2084 &starts, &e, &startinpos, &endinpos, &exc, &s,
2085 &unicode, &outpos, &p))
2086 goto onError;
/* NOTE(review): inShift, surrogate and the base64 accumulator are not
   reset before re-entering the loop here -- confirm that resuming in
   shift mode with the old state is intended. */
2087 if (s < e)
2088 goto restart;
2089 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002090 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002091
2092 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002093 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002094 if (inShift) {
2095 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002096 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002097 }
2098 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002099 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002100 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002101 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002102
/* Trim the output object to the number of elements actually written. */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002103 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002104 goto onError;
2105
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002106 Py_XDECREF(errorHandler);
2107 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002108 return (PyObject *)unicode;
2109
Benjamin Peterson29060642009-01-31 22:14:21 +00002110 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002111 Py_XDECREF(errorHandler);
2112 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002113 Py_DECREF(unicode);
2114 return NULL;
2115}
2116
2117
2118PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002119 Py_ssize_t size,
Antoine Pitrou244651a2009-05-04 18:56:13 +00002120 int base64SetO,
2121 int base64WhiteSpace,
Benjamin Peterson29060642009-01-31 22:14:21 +00002122 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002123{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002124 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002125 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002126 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002127 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002128 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002129 unsigned int base64bits = 0;
2130 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002131 char * out;
2132 char * start;
2133
2134 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002135 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002136
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002137 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002138 return PyErr_NoMemory();
2139
Antoine Pitrou244651a2009-05-04 18:56:13 +00002140 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002141 if (v == NULL)
2142 return NULL;
2143
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002144 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002145 for (;i < size; ++i) {
2146 Py_UNICODE ch = s[i];
2147
Antoine Pitrou244651a2009-05-04 18:56:13 +00002148 if (inShift) {
2149 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2150 /* shifting out */
2151 if (base64bits) { /* output remaining bits */
2152 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2153 base64buffer = 0;
2154 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002155 }
2156 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002157 /* Characters not in the BASE64 set implicitly unshift the sequence
2158 so no '-' is required, except if the character is itself a '-' */
2159 if (IS_BASE64(ch) || ch == '-') {
2160 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002161 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002162 *out++ = (char) ch;
2163 }
2164 else {
2165 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002166 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002167 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002168 else { /* not in a shift sequence */
2169 if (ch == '+') {
2170 *out++ = '+';
2171 *out++ = '-';
2172 }
2173 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2174 *out++ = (char) ch;
2175 }
2176 else {
2177 *out++ = '+';
2178 inShift = 1;
2179 goto encode_char;
2180 }
2181 }
2182 continue;
2183encode_char:
2184#ifdef Py_UNICODE_WIDE
2185 if (ch >= 0x10000) {
2186 /* code first surrogate */
2187 base64bits += 16;
2188 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2189 while (base64bits >= 6) {
2190 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2191 base64bits -= 6;
2192 }
2193 /* prepare second surrogate */
2194 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2195 }
2196#endif
2197 base64bits += 16;
2198 base64buffer = (base64buffer << 16) | ch;
2199 while (base64bits >= 6) {
2200 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2201 base64bits -= 6;
2202 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002203 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002204 if (base64bits)
2205 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2206 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002207 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002208 if (_PyBytes_Resize(&v, out - start) < 0)
2209 return NULL;
2210 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002211}
2212
Antoine Pitrou244651a2009-05-04 18:56:13 +00002213#undef IS_BASE64
2214#undef FROM_BASE64
2215#undef TO_BASE64
2216#undef DECODE_DIRECT
2217#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002218
Guido van Rossumd57fd912000-03-10 22:53:23 +00002219/* --- UTF-8 Codec -------------------------------------------------------- */
2220
/* Lookup table indexed by the first byte of a UTF-8 sequence; the entry
   is the total length of the sequence in bytes.  A zero entry marks a
   byte that is not a legal prefix.  See RFC 2279 for details. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x00 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x10 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x20 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x30 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x40 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x50 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x60 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x70 */
    /* 0x80 - 0xBF: not valid as a prefix byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x80 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x90 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0xA0 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0xB0 */
    /* 0xC0 - 0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,   /* 0xC0 */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,   /* 0xD0 */
    /* 0xE0 - 0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,   /* 0xE0 */
    /* 0xF0 - 0xFD: four-, five- and six-byte sequences; 0xFE/0xFF illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0    /* 0xF0 */
};
2242
Guido van Rossumd57fd912000-03-10 22:53:23 +00002243PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002244 Py_ssize_t size,
2245 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002246{
Walter Dörwald69652032004-09-07 20:24:22 +00002247 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2248}
2249
Antoine Pitrouab868312009-01-10 15:40:25 +00002250/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2251#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2252
2253/* Mask to quickly check whether a C 'long' contains a
2254 non-ASCII, UTF8-encoded char. */
2255#if (SIZEOF_LONG == 8)
2256# define ASCII_CHAR_MASK 0x8080808080808080L
2257#elif (SIZEOF_LONG == 4)
2258# define ASCII_CHAR_MASK 0x80808080L
2259#else
2260# error C 'long' size should be either 4 or 8!
2261#endif
2262
Walter Dörwald69652032004-09-07 20:24:22 +00002263PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002264 Py_ssize_t size,
2265 const char *errors,
2266 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002267{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002268 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002269 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002270 Py_ssize_t startinpos;
2271 Py_ssize_t endinpos;
2272 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002273 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002274 PyUnicodeObject *unicode;
2275 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002276 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002277 PyObject *errorHandler = NULL;
2278 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002279
2280 /* Note: size will always be longer than the resulting Unicode
2281 character count */
2282 unicode = _PyUnicode_New(size);
2283 if (!unicode)
2284 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002285 if (size == 0) {
2286 if (consumed)
2287 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002288 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002289 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002290
2291 /* Unpack UTF-8 encoded data */
2292 p = unicode->str;
2293 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002294 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002295
2296 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002297 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002298
2299 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002300 /* Fast path for runs of ASCII characters. Given that common UTF-8
2301 input will consist of an overwhelming majority of ASCII
2302 characters, we try to optimize for this case by checking
2303 as many characters as a C 'long' can contain.
2304 First, check if we can do an aligned read, as most CPUs have
2305 a penalty for unaligned reads.
2306 */
2307 if (!((size_t) s & LONG_PTR_MASK)) {
2308 /* Help register allocation */
2309 register const char *_s = s;
2310 register Py_UNICODE *_p = p;
2311 while (_s < aligned_end) {
2312 /* Read a whole long at a time (either 4 or 8 bytes),
2313 and do a fast unrolled copy if it only contains ASCII
2314 characters. */
2315 unsigned long data = *(unsigned long *) _s;
2316 if (data & ASCII_CHAR_MASK)
2317 break;
2318 _p[0] = (unsigned char) _s[0];
2319 _p[1] = (unsigned char) _s[1];
2320 _p[2] = (unsigned char) _s[2];
2321 _p[3] = (unsigned char) _s[3];
2322#if (SIZEOF_LONG == 8)
2323 _p[4] = (unsigned char) _s[4];
2324 _p[5] = (unsigned char) _s[5];
2325 _p[6] = (unsigned char) _s[6];
2326 _p[7] = (unsigned char) _s[7];
2327#endif
2328 _s += SIZEOF_LONG;
2329 _p += SIZEOF_LONG;
2330 }
2331 s = _s;
2332 p = _p;
2333 if (s == e)
2334 break;
2335 ch = (unsigned char)*s;
2336 }
2337 }
2338
2339 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002340 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002341 s++;
2342 continue;
2343 }
2344
2345 n = utf8_code_length[ch];
2346
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002347 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002348 if (consumed)
2349 break;
2350 else {
2351 errmsg = "unexpected end of data";
2352 startinpos = s-starts;
2353 endinpos = size;
2354 goto utf8Error;
2355 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002356 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002357
2358 switch (n) {
2359
2360 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002361 errmsg = "unexpected code byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002362 startinpos = s-starts;
2363 endinpos = startinpos+1;
2364 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002365
2366 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002367 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002368 startinpos = s-starts;
2369 endinpos = startinpos+1;
2370 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002371
2372 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002373 if ((s[1] & 0xc0) != 0x80) {
2374 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002375 startinpos = s-starts;
2376 endinpos = startinpos+2;
2377 goto utf8Error;
2378 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002379 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002380 if (ch < 0x80) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002381 startinpos = s-starts;
2382 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002383 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002384 goto utf8Error;
2385 }
2386 else
2387 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002388 break;
2389
2390 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00002391 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002392 (s[2] & 0xc0) != 0x80) {
2393 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002394 startinpos = s-starts;
2395 endinpos = startinpos+3;
2396 goto utf8Error;
2397 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002398 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002399 if (ch < 0x0800 || (ch >= 0xd800 && ch <= 0xDFFF)) {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002400 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002401 startinpos = s-starts;
2402 endinpos = startinpos+3;
2403 goto utf8Error;
2404 }
2405 else
2406 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002407 break;
2408
2409 case 4:
2410 if ((s[1] & 0xc0) != 0x80 ||
2411 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002412 (s[3] & 0xc0) != 0x80) {
2413 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002414 startinpos = s-starts;
2415 endinpos = startinpos+4;
2416 goto utf8Error;
2417 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002418 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Benjamin Peterson29060642009-01-31 22:14:21 +00002419 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002420 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002421 if ((ch < 0x10000) /* minimum value allowed for 4
Benjamin Peterson29060642009-01-31 22:14:21 +00002422 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002423 || (ch > 0x10ffff)) /* maximum value allowed for
Benjamin Peterson29060642009-01-31 22:14:21 +00002424 UTF-16 */
2425 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002426 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002427 startinpos = s-starts;
2428 endinpos = startinpos+4;
2429 goto utf8Error;
2430 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00002431#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002432 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002433#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002434 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002435
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002436 /* translate from 10000..10FFFF to 0..FFFF */
2437 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002438
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002439 /* high surrogate = top 10 bits added to D800 */
2440 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002441
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002442 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002443 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002444#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002445 break;
2446
2447 default:
2448 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002449 errmsg = "unsupported Unicode code range";
Benjamin Peterson29060642009-01-31 22:14:21 +00002450 startinpos = s-starts;
2451 endinpos = startinpos+n;
2452 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002453 }
2454 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002455 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002456
Benjamin Peterson29060642009-01-31 22:14:21 +00002457 utf8Error:
2458 outpos = p-PyUnicode_AS_UNICODE(unicode);
2459 if (unicode_decode_call_errorhandler(
2460 errors, &errorHandler,
2461 "utf8", errmsg,
2462 &starts, &e, &startinpos, &endinpos, &exc, &s,
2463 &unicode, &outpos, &p))
2464 goto onError;
2465 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002466 }
Walter Dörwald69652032004-09-07 20:24:22 +00002467 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002468 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002469
2470 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002471 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002472 goto onError;
2473
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002474 Py_XDECREF(errorHandler);
2475 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002476 return (PyObject *)unicode;
2477
Benjamin Peterson29060642009-01-31 22:14:21 +00002478 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002479 Py_XDECREF(errorHandler);
2480 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002481 Py_DECREF(unicode);
2482 return NULL;
2483}
2484
Antoine Pitrouab868312009-01-10 15:40:25 +00002485#undef ASCII_CHAR_MASK
2486
2487
Tim Peters602f7402002-04-27 18:03:26 +00002488/* Allocation strategy: if the string is short, convert into a stack buffer
2489 and allocate exactly as much space needed at the end. Else allocate the
2490 maximum possible needed (4 result bytes per Unicode character), and return
2491 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002492*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002493PyObject *
2494PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002495 Py_ssize_t size,
2496 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002497{
Tim Peters602f7402002-04-27 18:03:26 +00002498#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002499
Guido van Rossum98297ee2007-11-06 21:34:58 +00002500 Py_ssize_t i; /* index into s of next input byte */
2501 PyObject *result; /* result string object */
2502 char *p; /* next free byte in output buffer */
2503 Py_ssize_t nallocated; /* number of result bytes allocated */
2504 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002505 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002506 PyObject *errorHandler = NULL;
2507 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002508
Tim Peters602f7402002-04-27 18:03:26 +00002509 assert(s != NULL);
2510 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002511
Tim Peters602f7402002-04-27 18:03:26 +00002512 if (size <= MAX_SHORT_UNICHARS) {
2513 /* Write into the stack buffer; nallocated can't overflow.
2514 * At the end, we'll allocate exactly as much heap space as it
2515 * turns out we need.
2516 */
2517 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002518 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002519 p = stackbuf;
2520 }
2521 else {
2522 /* Overallocate on the heap, and give the excess back at the end. */
2523 nallocated = size * 4;
2524 if (nallocated / 4 != size) /* overflow! */
2525 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002526 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002527 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002528 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002529 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002530 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002531
Tim Peters602f7402002-04-27 18:03:26 +00002532 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002533 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002534
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002535 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002536 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002537 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002538
Guido van Rossumd57fd912000-03-10 22:53:23 +00002539 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002540 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002541 *p++ = (char)(0xc0 | (ch >> 6));
2542 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002543 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002544 else {
Tim Peters602f7402002-04-27 18:03:26 +00002545 /* Encode UCS2 Unicode ordinals */
2546 if (ch < 0x10000) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002547#ifndef Py_UNICODE_WIDE
Tim Peters602f7402002-04-27 18:03:26 +00002548 /* Special case: check for high surrogate */
2549 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2550 Py_UCS4 ch2 = s[i];
2551 /* Check for low surrogate and combine the two to
2552 form a UCS4 value */
2553 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002554 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002555 i++;
2556 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002557 }
Tim Peters602f7402002-04-27 18:03:26 +00002558 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002559 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002560#endif
2561 if (ch >= 0xd800 && ch <= 0xdfff) {
2562 Py_ssize_t newpos;
2563 PyObject *rep;
2564 char *prep;
2565 int k;
2566 rep = unicode_encode_call_errorhandler
2567 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2568 s, size, &exc, i-1, i, &newpos);
2569 if (!rep)
2570 goto error;
2571 /* Implementation limitations: only support error handler that return
2572 bytes, and only support up to four replacement bytes. */
2573 if (!PyBytes_Check(rep)) {
2574 PyErr_SetString(PyExc_TypeError, "error handler should have returned bytes");
2575 Py_DECREF(rep);
2576 goto error;
2577 }
2578 if (PyBytes_Size(rep) > 4) {
2579 PyErr_SetString(PyExc_TypeError, "error handler returned too many bytes");
2580 Py_DECREF(rep);
2581 goto error;
2582 }
2583 prep = PyBytes_AsString(rep);
2584 for(k = PyBytes_Size(rep); k > 0; k--)
2585 *p++ = *prep++;
2586 Py_DECREF(rep);
2587 continue;
2588
2589 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002590 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002591 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2592 *p++ = (char)(0x80 | (ch & 0x3f));
2593 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002594 }
Benjamin Petersonadf6a6c2009-11-10 21:23:15 +00002595#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002596 encodeUCS4:
Benjamin Petersonadf6a6c2009-11-10 21:23:15 +00002597#endif
Tim Peters602f7402002-04-27 18:03:26 +00002598 /* Encode UCS4 Unicode ordinals */
2599 *p++ = (char)(0xf0 | (ch >> 18));
2600 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2601 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2602 *p++ = (char)(0x80 | (ch & 0x3f));
2603 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002604 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002605
Guido van Rossum98297ee2007-11-06 21:34:58 +00002606 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002607 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002608 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002609 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002610 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002611 }
2612 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002613 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00002614 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002615 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002616 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002617 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002618 Py_XDECREF(errorHandler);
2619 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002620 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002621 error:
2622 Py_XDECREF(errorHandler);
2623 Py_XDECREF(exc);
2624 Py_XDECREF(result);
2625 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002626
Tim Peters602f7402002-04-27 18:03:26 +00002627#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002628}
2629
Guido van Rossumd57fd912000-03-10 22:53:23 +00002630PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2631{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002632 if (!PyUnicode_Check(unicode)) {
2633 PyErr_BadArgument();
2634 return NULL;
2635 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002636 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002637 PyUnicode_GET_SIZE(unicode),
2638 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002639}
2640
Walter Dörwald41980ca2007-08-16 21:55:45 +00002641/* --- UTF-32 Codec ------------------------------------------------------- */
2642
2643PyObject *
2644PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002645 Py_ssize_t size,
2646 const char *errors,
2647 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002648{
2649 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2650}
2651
2652PyObject *
2653PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002654 Py_ssize_t size,
2655 const char *errors,
2656 int *byteorder,
2657 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002658{
2659 const char *starts = s;
2660 Py_ssize_t startinpos;
2661 Py_ssize_t endinpos;
2662 Py_ssize_t outpos;
2663 PyUnicodeObject *unicode;
2664 Py_UNICODE *p;
2665#ifndef Py_UNICODE_WIDE
2666 int i, pairs;
2667#else
2668 const int pairs = 0;
2669#endif
2670 const unsigned char *q, *e;
2671 int bo = 0; /* assume native ordering by default */
2672 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002673 /* Offsets from q for retrieving bytes in the right order. */
2674#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2675 int iorder[] = {0, 1, 2, 3};
2676#else
2677 int iorder[] = {3, 2, 1, 0};
2678#endif
2679 PyObject *errorHandler = NULL;
2680 PyObject *exc = NULL;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002681 /* On narrow builds we split characters outside the BMP into two
2682 codepoints => count how much extra space we need. */
2683#ifndef Py_UNICODE_WIDE
2684 for (i = pairs = 0; i < size/4; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002685 if (((Py_UCS4 *)s)[i] >= 0x10000)
2686 pairs++;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002687#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00002688
2689 /* This might be one to much, because of a BOM */
2690 unicode = _PyUnicode_New((size+3)/4+pairs);
2691 if (!unicode)
2692 return NULL;
2693 if (size == 0)
2694 return (PyObject *)unicode;
2695
2696 /* Unpack UTF-32 encoded data */
2697 p = unicode->str;
2698 q = (unsigned char *)s;
2699 e = q + size;
2700
2701 if (byteorder)
2702 bo = *byteorder;
2703
2704 /* Check for BOM marks (U+FEFF) in the input and adjust current
2705 byte order setting accordingly. In native mode, the leading BOM
2706 mark is skipped, in all other modes, it is copied to the output
2707 stream as-is (giving a ZWNBSP character). */
2708 if (bo == 0) {
2709 if (size >= 4) {
2710 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00002711 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002712#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002713 if (bom == 0x0000FEFF) {
2714 q += 4;
2715 bo = -1;
2716 }
2717 else if (bom == 0xFFFE0000) {
2718 q += 4;
2719 bo = 1;
2720 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002721#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002722 if (bom == 0x0000FEFF) {
2723 q += 4;
2724 bo = 1;
2725 }
2726 else if (bom == 0xFFFE0000) {
2727 q += 4;
2728 bo = -1;
2729 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002730#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002731 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002732 }
2733
2734 if (bo == -1) {
2735 /* force LE */
2736 iorder[0] = 0;
2737 iorder[1] = 1;
2738 iorder[2] = 2;
2739 iorder[3] = 3;
2740 }
2741 else if (bo == 1) {
2742 /* force BE */
2743 iorder[0] = 3;
2744 iorder[1] = 2;
2745 iorder[2] = 1;
2746 iorder[3] = 0;
2747 }
2748
2749 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002750 Py_UCS4 ch;
2751 /* remaining bytes at the end? (size should be divisible by 4) */
2752 if (e-q<4) {
2753 if (consumed)
2754 break;
2755 errmsg = "truncated data";
2756 startinpos = ((const char *)q)-starts;
2757 endinpos = ((const char *)e)-starts;
2758 goto utf32Error;
2759 /* The remaining input chars are ignored if the callback
2760 chooses to skip the input */
2761 }
2762 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2763 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002764
Benjamin Peterson29060642009-01-31 22:14:21 +00002765 if (ch >= 0x110000)
2766 {
2767 errmsg = "codepoint not in range(0x110000)";
2768 startinpos = ((const char *)q)-starts;
2769 endinpos = startinpos+4;
2770 goto utf32Error;
2771 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002772#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002773 if (ch >= 0x10000)
2774 {
2775 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2776 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2777 }
2778 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00002779#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002780 *p++ = ch;
2781 q += 4;
2782 continue;
2783 utf32Error:
2784 outpos = p-PyUnicode_AS_UNICODE(unicode);
2785 if (unicode_decode_call_errorhandler(
2786 errors, &errorHandler,
2787 "utf32", errmsg,
2788 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2789 &unicode, &outpos, &p))
2790 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002791 }
2792
2793 if (byteorder)
2794 *byteorder = bo;
2795
2796 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002797 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002798
2799 /* Adjust length */
2800 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2801 goto onError;
2802
2803 Py_XDECREF(errorHandler);
2804 Py_XDECREF(exc);
2805 return (PyObject *)unicode;
2806
Benjamin Peterson29060642009-01-31 22:14:21 +00002807 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00002808 Py_DECREF(unicode);
2809 Py_XDECREF(errorHandler);
2810 Py_XDECREF(exc);
2811 return NULL;
2812}
2813
2814PyObject *
2815PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002816 Py_ssize_t size,
2817 const char *errors,
2818 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002819{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002820 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002821 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002822 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002823#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002824 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002825#else
2826 const int pairs = 0;
2827#endif
2828 /* Offsets from p for storing byte pairs in the right order. */
2829#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2830 int iorder[] = {0, 1, 2, 3};
2831#else
2832 int iorder[] = {3, 2, 1, 0};
2833#endif
2834
Benjamin Peterson29060642009-01-31 22:14:21 +00002835#define STORECHAR(CH) \
2836 do { \
2837 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2838 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2839 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2840 p[iorder[0]] = (CH) & 0xff; \
2841 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00002842 } while(0)
2843
2844 /* In narrow builds we can output surrogate pairs as one codepoint,
2845 so we need less space. */
2846#ifndef Py_UNICODE_WIDE
2847 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002848 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2849 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2850 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002851#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002852 nsize = (size - pairs + (byteorder == 0));
2853 bytesize = nsize * 4;
2854 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00002855 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002856 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002857 if (v == NULL)
2858 return NULL;
2859
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002860 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002861 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002862 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002863 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002864 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002865
2866 if (byteorder == -1) {
2867 /* force LE */
2868 iorder[0] = 0;
2869 iorder[1] = 1;
2870 iorder[2] = 2;
2871 iorder[3] = 3;
2872 }
2873 else if (byteorder == 1) {
2874 /* force BE */
2875 iorder[0] = 3;
2876 iorder[1] = 2;
2877 iorder[2] = 1;
2878 iorder[3] = 0;
2879 }
2880
2881 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002882 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002883#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002884 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2885 Py_UCS4 ch2 = *s;
2886 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2887 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2888 s++;
2889 size--;
2890 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002891 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002892#endif
2893 STORECHAR(ch);
2894 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002895
2896 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002897 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002898#undef STORECHAR
2899}
2900
2901PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2902{
2903 if (!PyUnicode_Check(unicode)) {
2904 PyErr_BadArgument();
2905 return NULL;
2906 }
2907 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002908 PyUnicode_GET_SIZE(unicode),
2909 NULL,
2910 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002911}
2912
Guido van Rossumd57fd912000-03-10 22:53:23 +00002913/* --- UTF-16 Codec ------------------------------------------------------- */
2914
Tim Peters772747b2001-08-09 22:21:55 +00002915PyObject *
2916PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002917 Py_ssize_t size,
2918 const char *errors,
2919 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002920{
Walter Dörwald69652032004-09-07 20:24:22 +00002921 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2922}
2923
Antoine Pitrouab868312009-01-10 15:40:25 +00002924/* Two masks for fast checking of whether a C 'long' may contain
2925 UTF16-encoded surrogate characters. This is an efficient heuristic,
2926 assuming that non-surrogate characters with a code point >= 0x8000 are
2927 rare in most input.
2928 FAST_CHAR_MASK is used when the input is in native byte ordering,
2929 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00002930*/
Antoine Pitrouab868312009-01-10 15:40:25 +00002931#if (SIZEOF_LONG == 8)
2932# define FAST_CHAR_MASK 0x8000800080008000L
2933# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
2934#elif (SIZEOF_LONG == 4)
2935# define FAST_CHAR_MASK 0x80008000L
2936# define SWAPPED_FAST_CHAR_MASK 0x00800080L
2937#else
2938# error C 'long' size should be either 4 or 8!
2939#endif
2940
Walter Dörwald69652032004-09-07 20:24:22 +00002941PyObject *
2942PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002943 Py_ssize_t size,
2944 const char *errors,
2945 int *byteorder,
2946 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002947{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002948 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002949 Py_ssize_t startinpos;
2950 Py_ssize_t endinpos;
2951 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002952 PyUnicodeObject *unicode;
2953 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00002954 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00002955 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00002956 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002957 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002958 /* Offsets from q for retrieving byte pairs in the right order. */
2959#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2960 int ihi = 1, ilo = 0;
2961#else
2962 int ihi = 0, ilo = 1;
2963#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002964 PyObject *errorHandler = NULL;
2965 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002966
2967 /* Note: size will always be longer than the resulting Unicode
2968 character count */
2969 unicode = _PyUnicode_New(size);
2970 if (!unicode)
2971 return NULL;
2972 if (size == 0)
2973 return (PyObject *)unicode;
2974
2975 /* Unpack UTF-16 encoded data */
2976 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002977 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00002978 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002979
2980 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002981 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002982
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002983 /* Check for BOM marks (U+FEFF) in the input and adjust current
2984 byte order setting accordingly. In native mode, the leading BOM
2985 mark is skipped, in all other modes, it is copied to the output
2986 stream as-is (giving a ZWNBSP character). */
2987 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002988 if (size >= 2) {
2989 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002990#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002991 if (bom == 0xFEFF) {
2992 q += 2;
2993 bo = -1;
2994 }
2995 else if (bom == 0xFFFE) {
2996 q += 2;
2997 bo = 1;
2998 }
Tim Petersced69f82003-09-16 20:30:58 +00002999#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003000 if (bom == 0xFEFF) {
3001 q += 2;
3002 bo = 1;
3003 }
3004 else if (bom == 0xFFFE) {
3005 q += 2;
3006 bo = -1;
3007 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003008#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003009 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003010 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003011
Tim Peters772747b2001-08-09 22:21:55 +00003012 if (bo == -1) {
3013 /* force LE */
3014 ihi = 1;
3015 ilo = 0;
3016 }
3017 else if (bo == 1) {
3018 /* force BE */
3019 ihi = 0;
3020 ilo = 1;
3021 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003022#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3023 native_ordering = ilo < ihi;
3024#else
3025 native_ordering = ilo > ihi;
3026#endif
Tim Peters772747b2001-08-09 22:21:55 +00003027
Antoine Pitrouab868312009-01-10 15:40:25 +00003028 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00003029 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003030 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00003031 /* First check for possible aligned read of a C 'long'. Unaligned
3032 reads are more expensive, better to defer to another iteration. */
3033 if (!((size_t) q & LONG_PTR_MASK)) {
3034 /* Fast path for runs of non-surrogate chars. */
3035 register const unsigned char *_q = q;
3036 Py_UNICODE *_p = p;
3037 if (native_ordering) {
3038 /* Native ordering is simple: as long as the input cannot
3039 possibly contain a surrogate char, do an unrolled copy
3040 of several 16-bit code points to the target object.
3041 The non-surrogate check is done on several input bytes
3042 at a time (as many as a C 'long' can contain). */
3043 while (_q < aligned_end) {
3044 unsigned long data = * (unsigned long *) _q;
3045 if (data & FAST_CHAR_MASK)
3046 break;
3047 _p[0] = ((unsigned short *) _q)[0];
3048 _p[1] = ((unsigned short *) _q)[1];
3049#if (SIZEOF_LONG == 8)
3050 _p[2] = ((unsigned short *) _q)[2];
3051 _p[3] = ((unsigned short *) _q)[3];
3052#endif
3053 _q += SIZEOF_LONG;
3054 _p += SIZEOF_LONG / 2;
3055 }
3056 }
3057 else {
3058 /* Byteswapped ordering is similar, but we must decompose
3059 the copy bytewise, and take care of zero'ing out the
3060 upper bytes if the target object is in 32-bit units
3061 (that is, in UCS-4 builds). */
3062 while (_q < aligned_end) {
3063 unsigned long data = * (unsigned long *) _q;
3064 if (data & SWAPPED_FAST_CHAR_MASK)
3065 break;
3066 /* Zero upper bytes in UCS-4 builds */
3067#if (Py_UNICODE_SIZE > 2)
3068 _p[0] = 0;
3069 _p[1] = 0;
3070#if (SIZEOF_LONG == 8)
3071 _p[2] = 0;
3072 _p[3] = 0;
3073#endif
3074#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003075 /* Issue #4916; UCS-4 builds on big endian machines must
3076 fill the two last bytes of each 4-byte unit. */
3077#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3078# define OFF 2
3079#else
3080# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00003081#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003082 ((unsigned char *) _p)[OFF + 1] = _q[0];
3083 ((unsigned char *) _p)[OFF + 0] = _q[1];
3084 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3085 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3086#if (SIZEOF_LONG == 8)
3087 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3088 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3089 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3090 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3091#endif
3092#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003093 _q += SIZEOF_LONG;
3094 _p += SIZEOF_LONG / 2;
3095 }
3096 }
3097 p = _p;
3098 q = _q;
3099 if (q >= e)
3100 break;
3101 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003102 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003103
Benjamin Peterson14339b62009-01-31 16:36:08 +00003104 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003105
3106 if (ch < 0xD800 || ch > 0xDFFF) {
3107 *p++ = ch;
3108 continue;
3109 }
3110
3111 /* UTF-16 code pair: */
3112 if (q > e) {
3113 errmsg = "unexpected end of data";
3114 startinpos = (((const char *)q) - 2) - starts;
3115 endinpos = ((const char *)e) + 1 - starts;
3116 goto utf16Error;
3117 }
3118 if (0xD800 <= ch && ch <= 0xDBFF) {
3119 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3120 q += 2;
3121 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003122#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003123 *p++ = ch;
3124 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003125#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003126 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003127#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003128 continue;
3129 }
3130 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003131 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003132 startinpos = (((const char *)q)-4)-starts;
3133 endinpos = startinpos+2;
3134 goto utf16Error;
3135 }
3136
Benjamin Peterson14339b62009-01-31 16:36:08 +00003137 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003138 errmsg = "illegal encoding";
3139 startinpos = (((const char *)q)-2)-starts;
3140 endinpos = startinpos+2;
3141 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003142
Benjamin Peterson29060642009-01-31 22:14:21 +00003143 utf16Error:
3144 outpos = p - PyUnicode_AS_UNICODE(unicode);
3145 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003146 errors,
3147 &errorHandler,
3148 "utf16", errmsg,
3149 &starts,
3150 (const char **)&e,
3151 &startinpos,
3152 &endinpos,
3153 &exc,
3154 (const char **)&q,
3155 &unicode,
3156 &outpos,
3157 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003158 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003159 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003160 /* remaining byte at the end? (size should be even) */
3161 if (e == q) {
3162 if (!consumed) {
3163 errmsg = "truncated data";
3164 startinpos = ((const char *)q) - starts;
3165 endinpos = ((const char *)e) + 1 - starts;
3166 outpos = p - PyUnicode_AS_UNICODE(unicode);
3167 if (unicode_decode_call_errorhandler(
3168 errors,
3169 &errorHandler,
3170 "utf16", errmsg,
3171 &starts,
3172 (const char **)&e,
3173 &startinpos,
3174 &endinpos,
3175 &exc,
3176 (const char **)&q,
3177 &unicode,
3178 &outpos,
3179 &p))
3180 goto onError;
3181 /* The remaining input chars are ignored if the callback
3182 chooses to skip the input */
3183 }
3184 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003185
3186 if (byteorder)
3187 *byteorder = bo;
3188
Walter Dörwald69652032004-09-07 20:24:22 +00003189 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003190 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003191
Guido van Rossumd57fd912000-03-10 22:53:23 +00003192 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003193 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003194 goto onError;
3195
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003196 Py_XDECREF(errorHandler);
3197 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003198 return (PyObject *)unicode;
3199
Benjamin Peterson29060642009-01-31 22:14:21 +00003200 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003201 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003202 Py_XDECREF(errorHandler);
3203 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003204 return NULL;
3205}
3206
Antoine Pitrouab868312009-01-10 15:40:25 +00003207#undef FAST_CHAR_MASK
3208#undef SWAPPED_FAST_CHAR_MASK
3209
Tim Peters772747b2001-08-09 22:21:55 +00003210PyObject *
3211PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003212 Py_ssize_t size,
3213 const char *errors,
3214 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003215{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003216 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003217 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003218 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003219#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003220 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003221#else
3222 const int pairs = 0;
3223#endif
Tim Peters772747b2001-08-09 22:21:55 +00003224 /* Offsets from p for storing byte pairs in the right order. */
3225#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3226 int ihi = 1, ilo = 0;
3227#else
3228 int ihi = 0, ilo = 1;
3229#endif
3230
Benjamin Peterson29060642009-01-31 22:14:21 +00003231#define STORECHAR(CH) \
3232 do { \
3233 p[ihi] = ((CH) >> 8) & 0xff; \
3234 p[ilo] = (CH) & 0xff; \
3235 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003236 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003237
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003238#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003239 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003240 if (s[i] >= 0x10000)
3241 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003242#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003243 /* 2 * (size + pairs + (byteorder == 0)) */
3244 if (size > PY_SSIZE_T_MAX ||
3245 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003246 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003247 nsize = size + pairs + (byteorder == 0);
3248 bytesize = nsize * 2;
3249 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003250 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003251 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003252 if (v == NULL)
3253 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003254
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003255 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003256 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003257 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003258 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003259 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003260
3261 if (byteorder == -1) {
3262 /* force LE */
3263 ihi = 1;
3264 ilo = 0;
3265 }
3266 else if (byteorder == 1) {
3267 /* force BE */
3268 ihi = 0;
3269 ilo = 1;
3270 }
3271
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003272 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003273 Py_UNICODE ch = *s++;
3274 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003275#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003276 if (ch >= 0x10000) {
3277 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3278 ch = 0xD800 | ((ch-0x10000) >> 10);
3279 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003280#endif
Tim Peters772747b2001-08-09 22:21:55 +00003281 STORECHAR(ch);
3282 if (ch2)
3283 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003284 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003285
3286 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003287 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003288#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003289}
3290
3291PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3292{
3293 if (!PyUnicode_Check(unicode)) {
3294 PyErr_BadArgument();
3295 return NULL;
3296 }
3297 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003298 PyUnicode_GET_SIZE(unicode),
3299 NULL,
3300 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003301}
3302
3303/* --- Unicode Escape Codec ----------------------------------------------- */
3304
Fredrik Lundh06d12682001-01-24 07:59:11 +00003305static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003306
Guido van Rossumd57fd912000-03-10 22:53:23 +00003307PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003308 Py_ssize_t size,
3309 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003310{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003311 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003312 Py_ssize_t startinpos;
3313 Py_ssize_t endinpos;
3314 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003315 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003316 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003317 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003318 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003319 char* message;
3320 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003321 PyObject *errorHandler = NULL;
3322 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003323
Guido van Rossumd57fd912000-03-10 22:53:23 +00003324 /* Escaped strings will always be longer than the resulting
3325 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003326 length after conversion to the true value.
3327 (but if the error callback returns a long replacement string
3328 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003329 v = _PyUnicode_New(size);
3330 if (v == NULL)
3331 goto onError;
3332 if (size == 0)
3333 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003334
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003335 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003336 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003337
Guido van Rossumd57fd912000-03-10 22:53:23 +00003338 while (s < end) {
3339 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003340 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003341 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003342
3343 /* Non-escape characters are interpreted as Unicode ordinals */
3344 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003345 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003346 continue;
3347 }
3348
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003349 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003350 /* \ - Escapes */
3351 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003352 c = *s++;
3353 if (s > end)
3354 c = '\0'; /* Invalid after \ */
3355 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003356
Benjamin Peterson29060642009-01-31 22:14:21 +00003357 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003358 case '\n': break;
3359 case '\\': *p++ = '\\'; break;
3360 case '\'': *p++ = '\''; break;
3361 case '\"': *p++ = '\"'; break;
3362 case 'b': *p++ = '\b'; break;
3363 case 'f': *p++ = '\014'; break; /* FF */
3364 case 't': *p++ = '\t'; break;
3365 case 'n': *p++ = '\n'; break;
3366 case 'r': *p++ = '\r'; break;
3367 case 'v': *p++ = '\013'; break; /* VT */
3368 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3369
Benjamin Peterson29060642009-01-31 22:14:21 +00003370 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003371 case '0': case '1': case '2': case '3':
3372 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003373 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003374 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003375 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003376 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003377 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003378 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003379 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003380 break;
3381
Benjamin Peterson29060642009-01-31 22:14:21 +00003382 /* hex escapes */
3383 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003384 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003385 digits = 2;
3386 message = "truncated \\xXX escape";
3387 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003388
Benjamin Peterson29060642009-01-31 22:14:21 +00003389 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003390 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003391 digits = 4;
3392 message = "truncated \\uXXXX escape";
3393 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003394
Benjamin Peterson29060642009-01-31 22:14:21 +00003395 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003396 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003397 digits = 8;
3398 message = "truncated \\UXXXXXXXX escape";
3399 hexescape:
3400 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003401 outpos = p-PyUnicode_AS_UNICODE(v);
3402 if (s+digits>end) {
3403 endinpos = size;
3404 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003405 errors, &errorHandler,
3406 "unicodeescape", "end of string in escape sequence",
3407 &starts, &end, &startinpos, &endinpos, &exc, &s,
3408 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003409 goto onError;
3410 goto nextByte;
3411 }
3412 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003413 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003414 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003415 endinpos = (s+i+1)-starts;
3416 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003417 errors, &errorHandler,
3418 "unicodeescape", message,
3419 &starts, &end, &startinpos, &endinpos, &exc, &s,
3420 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003421 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003422 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003423 }
3424 chr = (chr<<4) & ~0xF;
3425 if (c >= '0' && c <= '9')
3426 chr += c - '0';
3427 else if (c >= 'a' && c <= 'f')
3428 chr += 10 + c - 'a';
3429 else
3430 chr += 10 + c - 'A';
3431 }
3432 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003433 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003434 /* _decoding_error will have already written into the
3435 target buffer. */
3436 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003437 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003438 /* when we get here, chr is a 32-bit unicode character */
3439 if (chr <= 0xffff)
3440 /* UCS-2 character */
3441 *p++ = (Py_UNICODE) chr;
3442 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003443 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003444 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003445#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003446 *p++ = chr;
3447#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003448 chr -= 0x10000L;
3449 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003450 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003451#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003452 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003453 endinpos = s-starts;
3454 outpos = p-PyUnicode_AS_UNICODE(v);
3455 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003456 errors, &errorHandler,
3457 "unicodeescape", "illegal Unicode character",
3458 &starts, &end, &startinpos, &endinpos, &exc, &s,
3459 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003460 goto onError;
3461 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003462 break;
3463
Benjamin Peterson29060642009-01-31 22:14:21 +00003464 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003465 case 'N':
3466 message = "malformed \\N character escape";
3467 if (ucnhash_CAPI == NULL) {
3468 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003469 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003470 if (ucnhash_CAPI == NULL)
3471 goto ucnhashError;
3472 }
3473 if (*s == '{') {
3474 const char *start = s+1;
3475 /* look for the closing brace */
3476 while (*s != '}' && s < end)
3477 s++;
3478 if (s > start && s < end && *s == '}') {
3479 /* found a name. look it up in the unicode database */
3480 message = "unknown Unicode character name";
3481 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003482 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003483 goto store;
3484 }
3485 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003486 endinpos = s-starts;
3487 outpos = p-PyUnicode_AS_UNICODE(v);
3488 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003489 errors, &errorHandler,
3490 "unicodeescape", message,
3491 &starts, &end, &startinpos, &endinpos, &exc, &s,
3492 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003493 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003494 break;
3495
3496 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003497 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003498 message = "\\ at end of string";
3499 s--;
3500 endinpos = s-starts;
3501 outpos = p-PyUnicode_AS_UNICODE(v);
3502 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003503 errors, &errorHandler,
3504 "unicodeescape", message,
3505 &starts, &end, &startinpos, &endinpos, &exc, &s,
3506 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003507 goto onError;
3508 }
3509 else {
3510 *p++ = '\\';
3511 *p++ = (unsigned char)s[-1];
3512 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003513 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003514 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003515 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003516 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003517 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003518 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003519 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003520 Py_XDECREF(errorHandler);
3521 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003522 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003523
Benjamin Peterson29060642009-01-31 22:14:21 +00003524 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003525 PyErr_SetString(
3526 PyExc_UnicodeError,
3527 "\\N escapes not supported (can't load unicodedata module)"
3528 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003529 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003530 Py_XDECREF(errorHandler);
3531 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003532 return NULL;
3533
Benjamin Peterson29060642009-01-31 22:14:21 +00003534 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003535 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003536 Py_XDECREF(errorHandler);
3537 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003538 return NULL;
3539}
3540
3541/* Return a Unicode-Escape string version of the Unicode object.
3542
3543 If quotes is true, the string is enclosed in u"" or u'' quotes as
3544 appropriate.
3545
3546*/
3547
Thomas Wouters477c8d52006-05-27 19:21:47 +00003548Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003549 Py_ssize_t size,
3550 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00003551{
3552 /* like wcschr, but doesn't stop at NULL characters */
3553
3554 while (size-- > 0) {
3555 if (*s == ch)
3556 return s;
3557 s++;
3558 }
3559
3560 return NULL;
3561}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003562
Walter Dörwald79e913e2007-05-12 11:08:06 +00003563static const char *hexdigits = "0123456789abcdef";
3564
3565PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003566 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003567{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003568 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003569 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003570
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003571#ifdef Py_UNICODE_WIDE
3572 const Py_ssize_t expandsize = 10;
3573#else
3574 const Py_ssize_t expandsize = 6;
3575#endif
3576
Thomas Wouters89f507f2006-12-13 04:49:30 +00003577 /* XXX(nnorwitz): rather than over-allocating, it would be
3578 better to choose a different scheme. Perhaps scan the
3579 first N-chars of the string and allocate based on that size.
3580 */
3581 /* Initial allocation is based on the longest-possible unichr
3582 escape.
3583
3584 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3585 unichr, so in this case it's the longest unichr escape. In
3586 narrow (UTF-16) builds this is five chars per source unichr
3587 since there are two unichrs in the surrogate pair, so in narrow
3588 (UTF-16) builds it's not the longest unichr escape.
3589
3590 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3591 so in the narrow (UTF-16) build case it's the longest unichr
3592 escape.
3593 */
3594
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003595 if (size == 0)
3596 return PyBytes_FromStringAndSize(NULL, 0);
3597
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003598 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003599 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003600
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003601 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00003602 2
3603 + expandsize*size
3604 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003605 if (repr == NULL)
3606 return NULL;
3607
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003608 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003609
Guido van Rossumd57fd912000-03-10 22:53:23 +00003610 while (size-- > 0) {
3611 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003612
Walter Dörwald79e913e2007-05-12 11:08:06 +00003613 /* Escape backslashes */
3614 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003615 *p++ = '\\';
3616 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003617 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003618 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003619
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003620#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003621 /* Map 21-bit characters to '\U00xxxxxx' */
3622 else if (ch >= 0x10000) {
3623 *p++ = '\\';
3624 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003625 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3626 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3627 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3628 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3629 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3630 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3631 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3632 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00003633 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003634 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003635#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003636 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3637 else if (ch >= 0xD800 && ch < 0xDC00) {
3638 Py_UNICODE ch2;
3639 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003640
Benjamin Peterson29060642009-01-31 22:14:21 +00003641 ch2 = *s++;
3642 size--;
3643 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3644 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3645 *p++ = '\\';
3646 *p++ = 'U';
3647 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3648 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3649 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3650 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3651 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3652 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3653 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3654 *p++ = hexdigits[ucs & 0x0000000F];
3655 continue;
3656 }
3657 /* Fall through: isolated surrogates are copied as-is */
3658 s--;
3659 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003660 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003661#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003662
Guido van Rossumd57fd912000-03-10 22:53:23 +00003663 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003664 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003665 *p++ = '\\';
3666 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003667 *p++ = hexdigits[(ch >> 12) & 0x000F];
3668 *p++ = hexdigits[(ch >> 8) & 0x000F];
3669 *p++ = hexdigits[(ch >> 4) & 0x000F];
3670 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003671 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003672
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003673 /* Map special whitespace to '\t', \n', '\r' */
3674 else if (ch == '\t') {
3675 *p++ = '\\';
3676 *p++ = 't';
3677 }
3678 else if (ch == '\n') {
3679 *p++ = '\\';
3680 *p++ = 'n';
3681 }
3682 else if (ch == '\r') {
3683 *p++ = '\\';
3684 *p++ = 'r';
3685 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003686
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003687 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003688 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003689 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003690 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003691 *p++ = hexdigits[(ch >> 4) & 0x000F];
3692 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003693 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003694
Guido van Rossumd57fd912000-03-10 22:53:23 +00003695 /* Copy everything else as-is */
3696 else
3697 *p++ = (char) ch;
3698 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003699
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003700 assert(p - PyBytes_AS_STRING(repr) > 0);
3701 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
3702 return NULL;
3703 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003704}
3705
Alexandre Vassalotti2056bed2008-12-27 19:46:35 +00003706PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003707{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003708 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003709 if (!PyUnicode_Check(unicode)) {
3710 PyErr_BadArgument();
3711 return NULL;
3712 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003713 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3714 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003715 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003716}
3717
3718/* --- Raw Unicode Escape Codec ------------------------------------------- */
3719
3720PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003721 Py_ssize_t size,
3722 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003723{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003724 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003725 Py_ssize_t startinpos;
3726 Py_ssize_t endinpos;
3727 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003728 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003729 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003730 const char *end;
3731 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003732 PyObject *errorHandler = NULL;
3733 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003734
Guido van Rossumd57fd912000-03-10 22:53:23 +00003735 /* Escaped strings will always be longer than the resulting
3736 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003737 length after conversion to the true value. (But decoding error
3738 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003739 v = _PyUnicode_New(size);
3740 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003741 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003742 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003743 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003744 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003745 end = s + size;
3746 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003747 unsigned char c;
3748 Py_UCS4 x;
3749 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003750 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003751
Benjamin Peterson29060642009-01-31 22:14:21 +00003752 /* Non-escape characters are interpreted as Unicode ordinals */
3753 if (*s != '\\') {
3754 *p++ = (unsigned char)*s++;
3755 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003756 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003757 startinpos = s-starts;
3758
3759 /* \u-escapes are only interpreted iff the number of leading
3760 backslashes if odd */
3761 bs = s;
3762 for (;s < end;) {
3763 if (*s != '\\')
3764 break;
3765 *p++ = (unsigned char)*s++;
3766 }
3767 if (((s - bs) & 1) == 0 ||
3768 s >= end ||
3769 (*s != 'u' && *s != 'U')) {
3770 continue;
3771 }
3772 p--;
3773 count = *s=='u' ? 4 : 8;
3774 s++;
3775
3776 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3777 outpos = p-PyUnicode_AS_UNICODE(v);
3778 for (x = 0, i = 0; i < count; ++i, ++s) {
3779 c = (unsigned char)*s;
3780 if (!ISXDIGIT(c)) {
3781 endinpos = s-starts;
3782 if (unicode_decode_call_errorhandler(
3783 errors, &errorHandler,
3784 "rawunicodeescape", "truncated \\uXXXX",
3785 &starts, &end, &startinpos, &endinpos, &exc, &s,
3786 &v, &outpos, &p))
3787 goto onError;
3788 goto nextByte;
3789 }
3790 x = (x<<4) & ~0xF;
3791 if (c >= '0' && c <= '9')
3792 x += c - '0';
3793 else if (c >= 'a' && c <= 'f')
3794 x += 10 + c - 'a';
3795 else
3796 x += 10 + c - 'A';
3797 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00003798 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00003799 /* UCS-2 character */
3800 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003801 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003802 /* UCS-4 character. Either store directly, or as
3803 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00003804#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003805 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003806#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003807 x -= 0x10000L;
3808 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3809 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00003810#endif
3811 } else {
3812 endinpos = s-starts;
3813 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003814 if (unicode_decode_call_errorhandler(
3815 errors, &errorHandler,
3816 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00003817 &starts, &end, &startinpos, &endinpos, &exc, &s,
3818 &v, &outpos, &p))
3819 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003820 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003821 nextByte:
3822 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003823 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003824 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003825 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003826 Py_XDECREF(errorHandler);
3827 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003828 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003829
Benjamin Peterson29060642009-01-31 22:14:21 +00003830 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003831 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003832 Py_XDECREF(errorHandler);
3833 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003834 return NULL;
3835}
3836
3837PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003838 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003839{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003840 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003841 char *p;
3842 char *q;
3843
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003844#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003845 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003846#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003847 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003848#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003849
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003850 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003851 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00003852
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003853 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003854 if (repr == NULL)
3855 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003856 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003857 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003858
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003859 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003860 while (size-- > 0) {
3861 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003862#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003863 /* Map 32-bit characters to '\Uxxxxxxxx' */
3864 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003865 *p++ = '\\';
3866 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003867 *p++ = hexdigits[(ch >> 28) & 0xf];
3868 *p++ = hexdigits[(ch >> 24) & 0xf];
3869 *p++ = hexdigits[(ch >> 20) & 0xf];
3870 *p++ = hexdigits[(ch >> 16) & 0xf];
3871 *p++ = hexdigits[(ch >> 12) & 0xf];
3872 *p++ = hexdigits[(ch >> 8) & 0xf];
3873 *p++ = hexdigits[(ch >> 4) & 0xf];
3874 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003875 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003876 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00003877#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003878 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3879 if (ch >= 0xD800 && ch < 0xDC00) {
3880 Py_UNICODE ch2;
3881 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003882
Benjamin Peterson29060642009-01-31 22:14:21 +00003883 ch2 = *s++;
3884 size--;
3885 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3886 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3887 *p++ = '\\';
3888 *p++ = 'U';
3889 *p++ = hexdigits[(ucs >> 28) & 0xf];
3890 *p++ = hexdigits[(ucs >> 24) & 0xf];
3891 *p++ = hexdigits[(ucs >> 20) & 0xf];
3892 *p++ = hexdigits[(ucs >> 16) & 0xf];
3893 *p++ = hexdigits[(ucs >> 12) & 0xf];
3894 *p++ = hexdigits[(ucs >> 8) & 0xf];
3895 *p++ = hexdigits[(ucs >> 4) & 0xf];
3896 *p++ = hexdigits[ucs & 0xf];
3897 continue;
3898 }
3899 /* Fall through: isolated surrogates are copied as-is */
3900 s--;
3901 size++;
3902 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003903#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003904 /* Map 16-bit characters to '\uxxxx' */
3905 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003906 *p++ = '\\';
3907 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003908 *p++ = hexdigits[(ch >> 12) & 0xf];
3909 *p++ = hexdigits[(ch >> 8) & 0xf];
3910 *p++ = hexdigits[(ch >> 4) & 0xf];
3911 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003912 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003913 /* Copy everything else as-is */
3914 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003915 *p++ = (char) ch;
3916 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003917 size = p - q;
3918
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003919 assert(size > 0);
3920 if (_PyBytes_Resize(&repr, size) < 0)
3921 return NULL;
3922 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003923}
3924
3925PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3926{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003927 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003928 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003929 PyErr_BadArgument();
3930 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003931 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003932 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3933 PyUnicode_GET_SIZE(unicode));
3934
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003935 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003936}
3937
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003938/* --- Unicode Internal Codec ------------------------------------------- */
3939
3940PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003941 Py_ssize_t size,
3942 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003943{
3944 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003945 Py_ssize_t startinpos;
3946 Py_ssize_t endinpos;
3947 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003948 PyUnicodeObject *v;
3949 Py_UNICODE *p;
3950 const char *end;
3951 const char *reason;
3952 PyObject *errorHandler = NULL;
3953 PyObject *exc = NULL;
3954
Neal Norwitzd43069c2006-01-08 01:12:10 +00003955#ifdef Py_UNICODE_WIDE
3956 Py_UNICODE unimax = PyUnicode_GetMax();
3957#endif
3958
Thomas Wouters89f507f2006-12-13 04:49:30 +00003959 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003960 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3961 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003962 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003963 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003964 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003965 p = PyUnicode_AS_UNICODE(v);
3966 end = s + size;
3967
3968 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00003969 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003970 /* We have to sanity check the raw data, otherwise doom looms for
3971 some malformed UCS-4 data. */
3972 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00003973#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003974 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00003975#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003976 end-s < Py_UNICODE_SIZE
3977 )
Benjamin Peterson29060642009-01-31 22:14:21 +00003978 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003979 startinpos = s - starts;
3980 if (end-s < Py_UNICODE_SIZE) {
3981 endinpos = end-starts;
3982 reason = "truncated input";
3983 }
3984 else {
3985 endinpos = s - starts + Py_UNICODE_SIZE;
3986 reason = "illegal code point (> 0x10FFFF)";
3987 }
3988 outpos = p - PyUnicode_AS_UNICODE(v);
3989 if (unicode_decode_call_errorhandler(
3990 errors, &errorHandler,
3991 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003992 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00003993 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003994 goto onError;
3995 }
3996 }
3997 else {
3998 p++;
3999 s += Py_UNICODE_SIZE;
4000 }
4001 }
4002
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004003 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004004 goto onError;
4005 Py_XDECREF(errorHandler);
4006 Py_XDECREF(exc);
4007 return (PyObject *)v;
4008
Benjamin Peterson29060642009-01-31 22:14:21 +00004009 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004010 Py_XDECREF(v);
4011 Py_XDECREF(errorHandler);
4012 Py_XDECREF(exc);
4013 return NULL;
4014}
4015
Guido van Rossumd57fd912000-03-10 22:53:23 +00004016/* --- Latin-1 Codec ------------------------------------------------------ */
4017
4018PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004019 Py_ssize_t size,
4020 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004021{
4022 PyUnicodeObject *v;
4023 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004024 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00004025
Guido van Rossumd57fd912000-03-10 22:53:23 +00004026 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004027 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004028 Py_UNICODE r = *(unsigned char*)s;
4029 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004030 }
4031
Guido van Rossumd57fd912000-03-10 22:53:23 +00004032 v = _PyUnicode_New(size);
4033 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004034 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004035 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004036 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004037 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00004038 e = s + size;
4039 /* Unrolling the copy makes it much faster by reducing the looping
4040 overhead. This is similar to what many memcpy() implementations do. */
4041 unrolled_end = e - 4;
4042 while (s < unrolled_end) {
4043 p[0] = (unsigned char) s[0];
4044 p[1] = (unsigned char) s[1];
4045 p[2] = (unsigned char) s[2];
4046 p[3] = (unsigned char) s[3];
4047 s += 4;
4048 p += 4;
4049 }
4050 while (s < e)
4051 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004052 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004053
Benjamin Peterson29060642009-01-31 22:14:21 +00004054 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004055 Py_XDECREF(v);
4056 return NULL;
4057}
4058
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004059/* create or adjust a UnicodeEncodeError */
4060static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004061 const char *encoding,
4062 const Py_UNICODE *unicode, Py_ssize_t size,
4063 Py_ssize_t startpos, Py_ssize_t endpos,
4064 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004065{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004066 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004067 *exceptionObject = PyUnicodeEncodeError_Create(
4068 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004069 }
4070 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004071 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4072 goto onError;
4073 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4074 goto onError;
4075 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4076 goto onError;
4077 return;
4078 onError:
4079 Py_DECREF(*exceptionObject);
4080 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004081 }
4082}
4083
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004084/* raises a UnicodeEncodeError */
4085static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004086 const char *encoding,
4087 const Py_UNICODE *unicode, Py_ssize_t size,
4088 Py_ssize_t startpos, Py_ssize_t endpos,
4089 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004090{
4091 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004092 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004093 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004094 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004095}
4096
4097/* error handling callback helper:
4098 build arguments, call the callback and check the arguments,
4099 put the result into newpos and return the replacement string, which
4100 has to be freed by the caller */
4101static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00004102 PyObject **errorHandler,
4103 const char *encoding, const char *reason,
4104 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4105 Py_ssize_t startpos, Py_ssize_t endpos,
4106 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004107{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004108 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004109
4110 PyObject *restuple;
4111 PyObject *resunicode;
4112
4113 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004114 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004115 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004116 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004117 }
4118
4119 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004120 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004121 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004122 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004123
4124 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004125 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004126 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004127 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004128 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004129 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004130 Py_DECREF(restuple);
4131 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004132 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004133 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004134 &resunicode, newpos)) {
4135 Py_DECREF(restuple);
4136 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004137 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004138 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4139 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4140 Py_DECREF(restuple);
4141 return NULL;
4142 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004143 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004144 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004145 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004146 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4147 Py_DECREF(restuple);
4148 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004149 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004150 Py_INCREF(resunicode);
4151 Py_DECREF(restuple);
4152 return resunicode;
4153}
4154
4155static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004156 Py_ssize_t size,
4157 const char *errors,
4158 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004159{
4160 /* output object */
4161 PyObject *res;
4162 /* pointers to the beginning and end+1 of input */
4163 const Py_UNICODE *startp = p;
4164 const Py_UNICODE *endp = p + size;
4165 /* pointer to the beginning of the unencodable characters */
4166 /* const Py_UNICODE *badp = NULL; */
4167 /* pointer into the output */
4168 char *str;
4169 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004170 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004171 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4172 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004173 PyObject *errorHandler = NULL;
4174 PyObject *exc = NULL;
4175 /* the following variable is used for caching string comparisons
4176 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4177 int known_errorHandler = -1;
4178
4179 /* allocate enough for a simple encoding without
4180 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004181 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004182 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004183 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004184 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004185 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004186 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004187 ressize = size;
4188
4189 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004190 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004191
Benjamin Peterson29060642009-01-31 22:14:21 +00004192 /* can we encode this? */
4193 if (c<limit) {
4194 /* no overflow check, because we know that the space is enough */
4195 *str++ = (char)c;
4196 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004197 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004198 else {
4199 Py_ssize_t unicodepos = p-startp;
4200 Py_ssize_t requiredsize;
4201 PyObject *repunicode;
4202 Py_ssize_t repsize;
4203 Py_ssize_t newpos;
4204 Py_ssize_t respos;
4205 Py_UNICODE *uni2;
4206 /* startpos for collecting unencodable chars */
4207 const Py_UNICODE *collstart = p;
4208 const Py_UNICODE *collend = p;
4209 /* find all unecodable characters */
4210 while ((collend < endp) && ((*collend)>=limit))
4211 ++collend;
4212 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4213 if (known_errorHandler==-1) {
4214 if ((errors==NULL) || (!strcmp(errors, "strict")))
4215 known_errorHandler = 1;
4216 else if (!strcmp(errors, "replace"))
4217 known_errorHandler = 2;
4218 else if (!strcmp(errors, "ignore"))
4219 known_errorHandler = 3;
4220 else if (!strcmp(errors, "xmlcharrefreplace"))
4221 known_errorHandler = 4;
4222 else
4223 known_errorHandler = 0;
4224 }
4225 switch (known_errorHandler) {
4226 case 1: /* strict */
4227 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4228 goto onError;
4229 case 2: /* replace */
4230 while (collstart++<collend)
4231 *str++ = '?'; /* fall through */
4232 case 3: /* ignore */
4233 p = collend;
4234 break;
4235 case 4: /* xmlcharrefreplace */
4236 respos = str - PyBytes_AS_STRING(res);
4237 /* determine replacement size (temporarily (mis)uses p) */
4238 for (p = collstart, repsize = 0; p < collend; ++p) {
4239 if (*p<10)
4240 repsize += 2+1+1;
4241 else if (*p<100)
4242 repsize += 2+2+1;
4243 else if (*p<1000)
4244 repsize += 2+3+1;
4245 else if (*p<10000)
4246 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004247#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004248 else
4249 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004250#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004251 else if (*p<100000)
4252 repsize += 2+5+1;
4253 else if (*p<1000000)
4254 repsize += 2+6+1;
4255 else
4256 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004257#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004258 }
4259 requiredsize = respos+repsize+(endp-collend);
4260 if (requiredsize > ressize) {
4261 if (requiredsize<2*ressize)
4262 requiredsize = 2*ressize;
4263 if (_PyBytes_Resize(&res, requiredsize))
4264 goto onError;
4265 str = PyBytes_AS_STRING(res) + respos;
4266 ressize = requiredsize;
4267 }
4268 /* generate replacement (temporarily (mis)uses p) */
4269 for (p = collstart; p < collend; ++p) {
4270 str += sprintf(str, "&#%d;", (int)*p);
4271 }
4272 p = collend;
4273 break;
4274 default:
4275 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4276 encoding, reason, startp, size, &exc,
4277 collstart-startp, collend-startp, &newpos);
4278 if (repunicode == NULL)
4279 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004280 if (PyBytes_Check(repunicode)) {
4281 /* Directly copy bytes result to output. */
4282 repsize = PyBytes_Size(repunicode);
4283 if (repsize > 1) {
4284 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004285 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004286 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4287 Py_DECREF(repunicode);
4288 goto onError;
4289 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004290 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004291 ressize += repsize-1;
4292 }
4293 memcpy(str, PyBytes_AsString(repunicode), repsize);
4294 str += repsize;
4295 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004296 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004297 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004298 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004299 /* need more space? (at least enough for what we
4300 have+the replacement+the rest of the string, so
4301 we won't have to check space for encodable characters) */
4302 respos = str - PyBytes_AS_STRING(res);
4303 repsize = PyUnicode_GET_SIZE(repunicode);
4304 requiredsize = respos+repsize+(endp-collend);
4305 if (requiredsize > ressize) {
4306 if (requiredsize<2*ressize)
4307 requiredsize = 2*ressize;
4308 if (_PyBytes_Resize(&res, requiredsize)) {
4309 Py_DECREF(repunicode);
4310 goto onError;
4311 }
4312 str = PyBytes_AS_STRING(res) + respos;
4313 ressize = requiredsize;
4314 }
4315 /* check if there is anything unencodable in the replacement
4316 and copy it to the output */
4317 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4318 c = *uni2;
4319 if (c >= limit) {
4320 raise_encode_exception(&exc, encoding, startp, size,
4321 unicodepos, unicodepos+1, reason);
4322 Py_DECREF(repunicode);
4323 goto onError;
4324 }
4325 *str = (char)c;
4326 }
4327 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004328 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004329 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004330 }
4331 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004332 /* Resize if we allocated to much */
4333 size = str - PyBytes_AS_STRING(res);
4334 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004335 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004336 if (_PyBytes_Resize(&res, size) < 0)
4337 goto onError;
4338 }
4339
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004340 Py_XDECREF(errorHandler);
4341 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004342 return res;
4343
4344 onError:
4345 Py_XDECREF(res);
4346 Py_XDECREF(errorHandler);
4347 Py_XDECREF(exc);
4348 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004349}
4350
Guido van Rossumd57fd912000-03-10 22:53:23 +00004351PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004352 Py_ssize_t size,
4353 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004354{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004355 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004356}
4357
4358PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4359{
4360 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004361 PyErr_BadArgument();
4362 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004363 }
4364 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004365 PyUnicode_GET_SIZE(unicode),
4366 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004367}
4368
4369/* --- 7-bit ASCII Codec -------------------------------------------------- */
4370
Guido van Rossumd57fd912000-03-10 22:53:23 +00004371PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004372 Py_ssize_t size,
4373 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004374{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004375 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004376 PyUnicodeObject *v;
4377 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004378 Py_ssize_t startinpos;
4379 Py_ssize_t endinpos;
4380 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004381 const char *e;
4382 PyObject *errorHandler = NULL;
4383 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004384
Guido van Rossumd57fd912000-03-10 22:53:23 +00004385 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004386 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004387 Py_UNICODE r = *(unsigned char*)s;
4388 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004389 }
Tim Petersced69f82003-09-16 20:30:58 +00004390
Guido van Rossumd57fd912000-03-10 22:53:23 +00004391 v = _PyUnicode_New(size);
4392 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004393 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004394 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004395 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004396 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004397 e = s + size;
4398 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004399 register unsigned char c = (unsigned char)*s;
4400 if (c < 128) {
4401 *p++ = c;
4402 ++s;
4403 }
4404 else {
4405 startinpos = s-starts;
4406 endinpos = startinpos + 1;
4407 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4408 if (unicode_decode_call_errorhandler(
4409 errors, &errorHandler,
4410 "ascii", "ordinal not in range(128)",
4411 &starts, &e, &startinpos, &endinpos, &exc, &s,
4412 &v, &outpos, &p))
4413 goto onError;
4414 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004415 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004416 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004417 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4418 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004419 Py_XDECREF(errorHandler);
4420 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004421 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004422
Benjamin Peterson29060642009-01-31 22:14:21 +00004423 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004424 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004425 Py_XDECREF(errorHandler);
4426 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004427 return NULL;
4428}
4429
Guido van Rossumd57fd912000-03-10 22:53:23 +00004430PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004431 Py_ssize_t size,
4432 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004433{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004434 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004435}
4436
4437PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4438{
4439 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004440 PyErr_BadArgument();
4441 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004442 }
4443 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004444 PyUnicode_GET_SIZE(unicode),
4445 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004446}
4447
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004448#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004449
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004450/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004451
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004452#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004453#define NEED_RETRY
4454#endif
4455
4456/* XXX This code is limited to "true" double-byte encodings, as
4457 a) it assumes an incomplete character consists of a single byte, and
4458 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004459 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004460
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead byte,
   0 otherwise.

   A byte that IsDBCSLeadByte() rejects is never a lead byte.  Otherwise the
   start of the preceding character is located with CharPrev() and the byte
   is accepted when any of these holds, tested in this order:
     - CharPrev() returned the same position (no preceding character),
     - the preceding character does not start with a lead byte,
     - the preceding character is exactly two bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *candidate = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*candidate))
        return 0;

    before = CharPrev(s, candidate);
    if (before == candidate)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return candidate - before == 2;
}
4471
4472/*
4473 * Decode MBCS string into unicode object. If 'final' is set, converts
4474 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4475 */
4476static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson29060642009-01-31 22:14:21 +00004477 const char *s, /* MBCS string */
4478 int size, /* sizeof MBCS string */
4479 int final)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004480{
4481 Py_UNICODE *p;
4482 Py_ssize_t n = 0;
4483 int usize = 0;
4484
4485 assert(size >= 0);
4486
4487 /* Skip trailing lead-byte unless 'final' is set */
4488 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004489 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004490
4491 /* First get the size of the result */
4492 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004493 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
4494 if (usize == 0) {
4495 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4496 return -1;
4497 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004498 }
4499
4500 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004501 /* Create unicode object */
4502 *v = _PyUnicode_New(usize);
4503 if (*v == NULL)
4504 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004505 }
4506 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004507 /* Extend unicode object */
4508 n = PyUnicode_GET_SIZE(*v);
4509 if (_PyUnicode_Resize(v, n + usize) < 0)
4510 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004511 }
4512
4513 /* Do the conversion */
4514 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004515 p = PyUnicode_AS_UNICODE(*v) + n;
4516 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
4517 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4518 return -1;
4519 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004520 }
4521
4522 return size;
4523}
4524
4525PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004526 Py_ssize_t size,
4527 const char *errors,
4528 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004529{
4530 PyUnicodeObject *v = NULL;
4531 int done;
4532
4533 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004534 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004535
4536#ifdef NEED_RETRY
4537 retry:
4538 if (size > INT_MAX)
Benjamin Peterson29060642009-01-31 22:14:21 +00004539 done = decode_mbcs(&v, s, INT_MAX, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004540 else
4541#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004542 done = decode_mbcs(&v, s, (int)size, !consumed);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004543
4544 if (done < 0) {
4545 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00004546 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004547 }
4548
4549 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004550 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004551
4552#ifdef NEED_RETRY
4553 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004554 s += done;
4555 size -= done;
4556 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004557 }
4558#endif
4559
4560 return (PyObject *)v;
4561}
4562
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004563PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004564 Py_ssize_t size,
4565 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004566{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004567 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4568}
4569
4570/*
4571 * Convert unicode into string object (MBCS).
4572 * Returns 0 if succeed, -1 otherwise.
4573 */
4574static int encode_mbcs(PyObject **repr,
Benjamin Peterson29060642009-01-31 22:14:21 +00004575 const Py_UNICODE *p, /* unicode */
4576 int size) /* size of unicode */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004577{
4578 int mbcssize = 0;
4579 Py_ssize_t n = 0;
4580
4581 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004582
4583 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004584 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004585 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4586 if (mbcssize == 0) {
4587 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4588 return -1;
4589 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004590 }
4591
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004592 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004593 /* Create string object */
4594 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
4595 if (*repr == NULL)
4596 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004597 }
4598 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004599 /* Extend string object */
4600 n = PyBytes_Size(*repr);
4601 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
4602 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004603 }
4604
4605 /* Do the conversion */
4606 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004607 char *s = PyBytes_AS_STRING(*repr) + n;
4608 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4609 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4610 return -1;
4611 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004612 }
4613
4614 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004615}
4616
4617PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004618 Py_ssize_t size,
4619 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004620{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004621 PyObject *repr = NULL;
4622 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004623
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004624#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00004625 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004626 if (size > INT_MAX)
Benjamin Peterson29060642009-01-31 22:14:21 +00004627 ret = encode_mbcs(&repr, p, INT_MAX);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004628 else
4629#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004630 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004631
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004632 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004633 Py_XDECREF(repr);
4634 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004635 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004636
4637#ifdef NEED_RETRY
4638 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004639 p += INT_MAX;
4640 size -= INT_MAX;
4641 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004642 }
4643#endif
4644
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004645 return repr;
4646}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004647
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004648PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4649{
4650 if (!PyUnicode_Check(unicode)) {
4651 PyErr_BadArgument();
4652 return NULL;
4653 }
4654 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004655 PyUnicode_GET_SIZE(unicode),
4656 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004657}
4658
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004659#undef NEED_RETRY
4660
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004661#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004662
Guido van Rossumd57fd912000-03-10 22:53:23 +00004663/* --- Character Mapping Codec -------------------------------------------- */
4664
Guido van Rossumd57fd912000-03-10 22:53:23 +00004665PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004666 Py_ssize_t size,
4667 PyObject *mapping,
4668 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004669{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004670 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004671 Py_ssize_t startinpos;
4672 Py_ssize_t endinpos;
4673 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004674 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004675 PyUnicodeObject *v;
4676 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004677 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004678 PyObject *errorHandler = NULL;
4679 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004680 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004681 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004682
Guido van Rossumd57fd912000-03-10 22:53:23 +00004683 /* Default to Latin-1 */
4684 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004685 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004686
4687 v = _PyUnicode_New(size);
4688 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004689 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004690 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004691 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004692 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004693 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004694 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004695 mapstring = PyUnicode_AS_UNICODE(mapping);
4696 maplen = PyUnicode_GET_SIZE(mapping);
4697 while (s < e) {
4698 unsigned char ch = *s;
4699 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004700
Benjamin Peterson29060642009-01-31 22:14:21 +00004701 if (ch < maplen)
4702 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004703
Benjamin Peterson29060642009-01-31 22:14:21 +00004704 if (x == 0xfffe) {
4705 /* undefined mapping */
4706 outpos = p-PyUnicode_AS_UNICODE(v);
4707 startinpos = s-starts;
4708 endinpos = startinpos+1;
4709 if (unicode_decode_call_errorhandler(
4710 errors, &errorHandler,
4711 "charmap", "character maps to <undefined>",
4712 &starts, &e, &startinpos, &endinpos, &exc, &s,
4713 &v, &outpos, &p)) {
4714 goto onError;
4715 }
4716 continue;
4717 }
4718 *p++ = x;
4719 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004720 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004721 }
4722 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004723 while (s < e) {
4724 unsigned char ch = *s;
4725 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004726
Benjamin Peterson29060642009-01-31 22:14:21 +00004727 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4728 w = PyLong_FromLong((long)ch);
4729 if (w == NULL)
4730 goto onError;
4731 x = PyObject_GetItem(mapping, w);
4732 Py_DECREF(w);
4733 if (x == NULL) {
4734 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4735 /* No mapping found means: mapping is undefined. */
4736 PyErr_Clear();
4737 x = Py_None;
4738 Py_INCREF(x);
4739 } else
4740 goto onError;
4741 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004742
Benjamin Peterson29060642009-01-31 22:14:21 +00004743 /* Apply mapping */
4744 if (PyLong_Check(x)) {
4745 long value = PyLong_AS_LONG(x);
4746 if (value < 0 || value > 65535) {
4747 PyErr_SetString(PyExc_TypeError,
4748 "character mapping must be in range(65536)");
4749 Py_DECREF(x);
4750 goto onError;
4751 }
4752 *p++ = (Py_UNICODE)value;
4753 }
4754 else if (x == Py_None) {
4755 /* undefined mapping */
4756 outpos = p-PyUnicode_AS_UNICODE(v);
4757 startinpos = s-starts;
4758 endinpos = startinpos+1;
4759 if (unicode_decode_call_errorhandler(
4760 errors, &errorHandler,
4761 "charmap", "character maps to <undefined>",
4762 &starts, &e, &startinpos, &endinpos, &exc, &s,
4763 &v, &outpos, &p)) {
4764 Py_DECREF(x);
4765 goto onError;
4766 }
4767 Py_DECREF(x);
4768 continue;
4769 }
4770 else if (PyUnicode_Check(x)) {
4771 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004772
Benjamin Peterson29060642009-01-31 22:14:21 +00004773 if (targetsize == 1)
4774 /* 1-1 mapping */
4775 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004776
Benjamin Peterson29060642009-01-31 22:14:21 +00004777 else if (targetsize > 1) {
4778 /* 1-n mapping */
4779 if (targetsize > extrachars) {
4780 /* resize first */
4781 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4782 Py_ssize_t needed = (targetsize - extrachars) + \
4783 (targetsize << 2);
4784 extrachars += needed;
4785 /* XXX overflow detection missing */
4786 if (_PyUnicode_Resize(&v,
4787 PyUnicode_GET_SIZE(v) + needed) < 0) {
4788 Py_DECREF(x);
4789 goto onError;
4790 }
4791 p = PyUnicode_AS_UNICODE(v) + oldpos;
4792 }
4793 Py_UNICODE_COPY(p,
4794 PyUnicode_AS_UNICODE(x),
4795 targetsize);
4796 p += targetsize;
4797 extrachars -= targetsize;
4798 }
4799 /* 1-0 mapping: skip the character */
4800 }
4801 else {
4802 /* wrong return value */
4803 PyErr_SetString(PyExc_TypeError,
4804 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00004805 Py_DECREF(x);
4806 goto onError;
4807 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004808 Py_DECREF(x);
4809 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004810 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004811 }
4812 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004813 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4814 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004815 Py_XDECREF(errorHandler);
4816 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004817 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004818
Benjamin Peterson29060642009-01-31 22:14:21 +00004819 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004820 Py_XDECREF(errorHandler);
4821 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004822 Py_XDECREF(v);
4823 return NULL;
4824}
4825
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004826/* Charmap encoding: the lookup table */
4827
4828struct encoding_map{
Benjamin Peterson29060642009-01-31 22:14:21 +00004829 PyObject_HEAD
4830 unsigned char level1[32];
4831 int count2, count3;
4832 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004833};
4834
4835static PyObject*
4836encoding_map_size(PyObject *obj, PyObject* args)
4837{
4838 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004839 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00004840 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004841}
4842
4843static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00004844 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00004845 PyDoc_STR("Return the size (in bytes) of this object") },
4846 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004847};
4848
4849static void
4850encoding_map_dealloc(PyObject* o)
4851{
Benjamin Peterson14339b62009-01-31 16:36:08 +00004852 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004853}
4854
4855static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00004856 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004857 "EncodingMap", /*tp_name*/
4858 sizeof(struct encoding_map), /*tp_basicsize*/
4859 0, /*tp_itemsize*/
4860 /* methods */
4861 encoding_map_dealloc, /*tp_dealloc*/
4862 0, /*tp_print*/
4863 0, /*tp_getattr*/
4864 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00004865 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00004866 0, /*tp_repr*/
4867 0, /*tp_as_number*/
4868 0, /*tp_as_sequence*/
4869 0, /*tp_as_mapping*/
4870 0, /*tp_hash*/
4871 0, /*tp_call*/
4872 0, /*tp_str*/
4873 0, /*tp_getattro*/
4874 0, /*tp_setattro*/
4875 0, /*tp_as_buffer*/
4876 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4877 0, /*tp_doc*/
4878 0, /*tp_traverse*/
4879 0, /*tp_clear*/
4880 0, /*tp_richcompare*/
4881 0, /*tp_weaklistoffset*/
4882 0, /*tp_iter*/
4883 0, /*tp_iternext*/
4884 encoding_map_methods, /*tp_methods*/
4885 0, /*tp_members*/
4886 0, /*tp_getset*/
4887 0, /*tp_base*/
4888 0, /*tp_dict*/
4889 0, /*tp_descr_get*/
4890 0, /*tp_descr_set*/
4891 0, /*tp_dictoffset*/
4892 0, /*tp_init*/
4893 0, /*tp_alloc*/
4894 0, /*tp_new*/
4895 0, /*tp_free*/
4896 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004897};
4898
4899PyObject*
4900PyUnicode_BuildEncodingMap(PyObject* string)
4901{
4902 Py_UNICODE *decode;
4903 PyObject *result;
4904 struct encoding_map *mresult;
4905 int i;
4906 int need_dict = 0;
4907 unsigned char level1[32];
4908 unsigned char level2[512];
4909 unsigned char *mlevel1, *mlevel2, *mlevel3;
4910 int count2 = 0, count3 = 0;
4911
4912 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4913 PyErr_BadArgument();
4914 return NULL;
4915 }
4916 decode = PyUnicode_AS_UNICODE(string);
4917 memset(level1, 0xFF, sizeof level1);
4918 memset(level2, 0xFF, sizeof level2);
4919
4920 /* If there isn't a one-to-one mapping of NULL to \0,
4921 or if there are non-BMP characters, we need to use
4922 a mapping dictionary. */
4923 if (decode[0] != 0)
4924 need_dict = 1;
4925 for (i = 1; i < 256; i++) {
4926 int l1, l2;
4927 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00004928#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004929 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00004930#endif
4931 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004932 need_dict = 1;
4933 break;
4934 }
4935 if (decode[i] == 0xFFFE)
4936 /* unmapped character */
4937 continue;
4938 l1 = decode[i] >> 11;
4939 l2 = decode[i] >> 7;
4940 if (level1[l1] == 0xFF)
4941 level1[l1] = count2++;
4942 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00004943 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004944 }
4945
4946 if (count2 >= 0xFF || count3 >= 0xFF)
4947 need_dict = 1;
4948
4949 if (need_dict) {
4950 PyObject *result = PyDict_New();
4951 PyObject *key, *value;
4952 if (!result)
4953 return NULL;
4954 for (i = 0; i < 256; i++) {
4955 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00004956 key = PyLong_FromLong(decode[i]);
4957 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004958 if (!key || !value)
4959 goto failed1;
4960 if (PyDict_SetItem(result, key, value) == -1)
4961 goto failed1;
4962 Py_DECREF(key);
4963 Py_DECREF(value);
4964 }
4965 return result;
4966 failed1:
4967 Py_XDECREF(key);
4968 Py_XDECREF(value);
4969 Py_DECREF(result);
4970 return NULL;
4971 }
4972
4973 /* Create a three-level trie */
4974 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4975 16*count2 + 128*count3 - 1);
4976 if (!result)
4977 return PyErr_NoMemory();
4978 PyObject_Init(result, &EncodingMapType);
4979 mresult = (struct encoding_map*)result;
4980 mresult->count2 = count2;
4981 mresult->count3 = count3;
4982 mlevel1 = mresult->level1;
4983 mlevel2 = mresult->level23;
4984 mlevel3 = mresult->level23 + 16*count2;
4985 memcpy(mlevel1, level1, 32);
4986 memset(mlevel2, 0xFF, 16*count2);
4987 memset(mlevel3, 0, 128*count3);
4988 count3 = 0;
4989 for (i = 1; i < 256; i++) {
4990 int o1, o2, o3, i2, i3;
4991 if (decode[i] == 0xFFFE)
4992 /* unmapped character */
4993 continue;
4994 o1 = decode[i]>>11;
4995 o2 = (decode[i]>>7) & 0xF;
4996 i2 = 16*mlevel1[o1] + o2;
4997 if (mlevel2[i2] == 0xFF)
4998 mlevel2[i2] = count3++;
4999 o3 = decode[i] & 0x7F;
5000 i3 = 128*mlevel2[i2] + o3;
5001 mlevel3[i3] = i;
5002 }
5003 return result;
5004}
5005
5006static int
5007encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
5008{
5009 struct encoding_map *map = (struct encoding_map*)mapping;
5010 int l1 = c>>11;
5011 int l2 = (c>>7) & 0xF;
5012 int l3 = c & 0x7F;
5013 int i;
5014
5015#ifdef Py_UNICODE_WIDE
5016 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005017 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005018 }
5019#endif
5020 if (c == 0)
5021 return 0;
5022 /* level 1*/
5023 i = map->level1[l1];
5024 if (i == 0xFF) {
5025 return -1;
5026 }
5027 /* level 2*/
5028 i = map->level23[16*i+l2];
5029 if (i == 0xFF) {
5030 return -1;
5031 }
5032 /* level 3 */
5033 i = map->level23[16*map->count2 + 128*i + l3];
5034 if (i == 0) {
5035 return -1;
5036 }
5037 return i;
5038}
5039
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005040/* Lookup the character ch in the mapping. If the character
5041 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00005042 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005043static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005044{
Christian Heimes217cfd12007-12-02 14:31:20 +00005045 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005046 PyObject *x;
5047
5048 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005049 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005050 x = PyObject_GetItem(mapping, w);
5051 Py_DECREF(w);
5052 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005053 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5054 /* No mapping found means: mapping is undefined. */
5055 PyErr_Clear();
5056 x = Py_None;
5057 Py_INCREF(x);
5058 return x;
5059 } else
5060 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005061 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00005062 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005063 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00005064 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005065 long value = PyLong_AS_LONG(x);
5066 if (value < 0 || value > 255) {
5067 PyErr_SetString(PyExc_TypeError,
5068 "character mapping must be in range(256)");
5069 Py_DECREF(x);
5070 return NULL;
5071 }
5072 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005073 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005074 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00005075 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005076 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005077 /* wrong return value */
5078 PyErr_Format(PyExc_TypeError,
5079 "character mapping must return integer, bytes or None, not %.400s",
5080 x->ob_type->tp_name);
5081 Py_DECREF(x);
5082 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005083 }
5084}
5085
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005086static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005087charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005088{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005089 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5090 /* exponentially overallocate to minimize reallocations */
5091 if (requiredsize < 2*outsize)
5092 requiredsize = 2*outsize;
5093 if (_PyBytes_Resize(outobj, requiredsize))
5094 return -1;
5095 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005096}
5097
/* Outcome of encoding a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* a Python exception was raised along the way */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005101/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005102 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005103 space is available. Return a new reference to the object that
5104 was put in the output buffer, or Py_None, if the mapping was undefined
5105 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005106 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005107static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005108charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00005109 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005110{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005111 PyObject *rep;
5112 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005113 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005114
Christian Heimes90aa7642007-12-19 02:45:37 +00005115 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005116 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005117 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005118 if (res == -1)
5119 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005120 if (outsize<requiredsize)
5121 if (charmapencode_resize(outobj, outpos, requiredsize))
5122 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005123 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005124 outstart[(*outpos)++] = (char)res;
5125 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005126 }
5127
5128 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005129 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005130 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005131 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005132 Py_DECREF(rep);
5133 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005134 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005135 if (PyLong_Check(rep)) {
5136 Py_ssize_t requiredsize = *outpos+1;
5137 if (outsize<requiredsize)
5138 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5139 Py_DECREF(rep);
5140 return enc_EXCEPTION;
5141 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005142 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005143 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005144 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005145 else {
5146 const char *repchars = PyBytes_AS_STRING(rep);
5147 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5148 Py_ssize_t requiredsize = *outpos+repsize;
5149 if (outsize<requiredsize)
5150 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5151 Py_DECREF(rep);
5152 return enc_EXCEPTION;
5153 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005154 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005155 memcpy(outstart + *outpos, repchars, repsize);
5156 *outpos += repsize;
5157 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005158 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005159 Py_DECREF(rep);
5160 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005161}
5162
5163/* handle an error in PyUnicode_EncodeCharmap
5164 Return 0 on success, -1 on error */
5165static
5166int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005167 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005168 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005169 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005170 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005171{
5172 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005173 Py_ssize_t repsize;
5174 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005175 Py_UNICODE *uni2;
5176 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005177 Py_ssize_t collstartpos = *inpos;
5178 Py_ssize_t collendpos = *inpos+1;
5179 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005180 char *encoding = "charmap";
5181 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005182 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005183
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005184 /* find all unencodable characters */
5185 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005186 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005187 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005188 int res = encoding_map_lookup(p[collendpos], mapping);
5189 if (res != -1)
5190 break;
5191 ++collendpos;
5192 continue;
5193 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005194
Benjamin Peterson29060642009-01-31 22:14:21 +00005195 rep = charmapencode_lookup(p[collendpos], mapping);
5196 if (rep==NULL)
5197 return -1;
5198 else if (rep!=Py_None) {
5199 Py_DECREF(rep);
5200 break;
5201 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005202 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005203 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005204 }
5205 /* cache callback name lookup
5206 * (if not done yet, i.e. it's the first error) */
5207 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005208 if ((errors==NULL) || (!strcmp(errors, "strict")))
5209 *known_errorHandler = 1;
5210 else if (!strcmp(errors, "replace"))
5211 *known_errorHandler = 2;
5212 else if (!strcmp(errors, "ignore"))
5213 *known_errorHandler = 3;
5214 else if (!strcmp(errors, "xmlcharrefreplace"))
5215 *known_errorHandler = 4;
5216 else
5217 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005218 }
5219 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005220 case 1: /* strict */
5221 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5222 return -1;
5223 case 2: /* replace */
5224 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005225 x = charmapencode_output('?', mapping, res, respos);
5226 if (x==enc_EXCEPTION) {
5227 return -1;
5228 }
5229 else if (x==enc_FAILED) {
5230 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5231 return -1;
5232 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005233 }
5234 /* fall through */
5235 case 3: /* ignore */
5236 *inpos = collendpos;
5237 break;
5238 case 4: /* xmlcharrefreplace */
5239 /* generate replacement (temporarily (mis)uses p) */
5240 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005241 char buffer[2+29+1+1];
5242 char *cp;
5243 sprintf(buffer, "&#%d;", (int)p[collpos]);
5244 for (cp = buffer; *cp; ++cp) {
5245 x = charmapencode_output(*cp, mapping, res, respos);
5246 if (x==enc_EXCEPTION)
5247 return -1;
5248 else if (x==enc_FAILED) {
5249 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5250 return -1;
5251 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005252 }
5253 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005254 *inpos = collendpos;
5255 break;
5256 default:
5257 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005258 encoding, reason, p, size, exceptionObject,
5259 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005260 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005261 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005262 if (PyBytes_Check(repunicode)) {
5263 /* Directly copy bytes result to output. */
5264 Py_ssize_t outsize = PyBytes_Size(*res);
5265 Py_ssize_t requiredsize;
5266 repsize = PyBytes_Size(repunicode);
5267 requiredsize = *respos + repsize;
5268 if (requiredsize > outsize)
5269 /* Make room for all additional bytes. */
5270 if (charmapencode_resize(res, respos, requiredsize)) {
5271 Py_DECREF(repunicode);
5272 return -1;
5273 }
5274 memcpy(PyBytes_AsString(*res) + *respos,
5275 PyBytes_AsString(repunicode), repsize);
5276 *respos += repsize;
5277 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005278 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005279 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005280 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005281 /* generate replacement */
5282 repsize = PyUnicode_GET_SIZE(repunicode);
5283 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005284 x = charmapencode_output(*uni2, mapping, res, respos);
5285 if (x==enc_EXCEPTION) {
5286 return -1;
5287 }
5288 else if (x==enc_FAILED) {
5289 Py_DECREF(repunicode);
5290 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5291 return -1;
5292 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005293 }
5294 *inpos = newpos;
5295 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005296 }
5297 return 0;
5298}
5299
Guido van Rossumd57fd912000-03-10 22:53:23 +00005300PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005301 Py_ssize_t size,
5302 PyObject *mapping,
5303 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005304{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005305 /* output object */
5306 PyObject *res = NULL;
5307 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005308 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005309 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005310 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005311 PyObject *errorHandler = NULL;
5312 PyObject *exc = NULL;
5313 /* the following variable is used for caching string comparisons
5314 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5315 * 3=ignore, 4=xmlcharrefreplace */
5316 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005317
5318 /* Default to Latin-1 */
5319 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005320 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005321
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005322 /* allocate enough for a simple encoding without
5323 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005324 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005325 if (res == NULL)
5326 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005327 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005328 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005329
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005330 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005331 /* try to encode it */
5332 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5333 if (x==enc_EXCEPTION) /* error */
5334 goto onError;
5335 if (x==enc_FAILED) { /* unencodable character */
5336 if (charmap_encoding_error(p, size, &inpos, mapping,
5337 &exc,
5338 &known_errorHandler, &errorHandler, errors,
5339 &res, &respos)) {
5340 goto onError;
5341 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005342 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005343 else
5344 /* done with this character => adjust input position */
5345 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005346 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005347
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005348 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005349 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005350 if (_PyBytes_Resize(&res, respos) < 0)
5351 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005352
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005353 Py_XDECREF(exc);
5354 Py_XDECREF(errorHandler);
5355 return res;
5356
Benjamin Peterson29060642009-01-31 22:14:21 +00005357 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005358 Py_XDECREF(res);
5359 Py_XDECREF(exc);
5360 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005361 return NULL;
5362}
5363
5364PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00005365 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005366{
5367 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005368 PyErr_BadArgument();
5369 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005370 }
5371 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005372 PyUnicode_GET_SIZE(unicode),
5373 mapping,
5374 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005375}
5376
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005377/* create or adjust a UnicodeTranslateError */
5378static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005379 const Py_UNICODE *unicode, Py_ssize_t size,
5380 Py_ssize_t startpos, Py_ssize_t endpos,
5381 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005382{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005383 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005384 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005385 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005386 }
5387 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005388 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5389 goto onError;
5390 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5391 goto onError;
5392 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5393 goto onError;
5394 return;
5395 onError:
5396 Py_DECREF(*exceptionObject);
5397 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005398 }
5399}
5400
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005401/* raises a UnicodeTranslateError */
5402static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005403 const Py_UNICODE *unicode, Py_ssize_t size,
5404 Py_ssize_t startpos, Py_ssize_t endpos,
5405 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005406{
5407 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005408 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005409 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005410 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005411}
5412
5413/* error handling callback helper:
5414 build arguments, call the callback and check the arguments,
5415 put the result into newpos and return the replacement string, which
5416 has to be freed by the caller */
5417static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00005418 PyObject **errorHandler,
5419 const char *reason,
5420 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5421 Py_ssize_t startpos, Py_ssize_t endpos,
5422 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005423{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005424 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005425
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005426 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005427 PyObject *restuple;
5428 PyObject *resunicode;
5429
5430 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005431 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005432 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005433 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005434 }
5435
5436 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005437 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005438 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005439 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005440
5441 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005442 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005443 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005444 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005445 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005446 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005447 Py_DECREF(restuple);
5448 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005449 }
5450 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00005451 &resunicode, &i_newpos)) {
5452 Py_DECREF(restuple);
5453 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005454 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005455 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005456 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005457 else
5458 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005459 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005460 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5461 Py_DECREF(restuple);
5462 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005463 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005464 Py_INCREF(resunicode);
5465 Py_DECREF(restuple);
5466 return resunicode;
5467}
5468
5469/* Lookup the character ch in the mapping and put the result in result,
5470 which must be decrefed by the caller.
5471 Return 0 on success, -1 on error */
5472static
5473int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5474{
Christian Heimes217cfd12007-12-02 14:31:20 +00005475 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005476 PyObject *x;
5477
5478 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005479 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005480 x = PyObject_GetItem(mapping, w);
5481 Py_DECREF(w);
5482 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005483 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5484 /* No mapping found means: use 1:1 mapping. */
5485 PyErr_Clear();
5486 *result = NULL;
5487 return 0;
5488 } else
5489 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005490 }
5491 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005492 *result = x;
5493 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005494 }
Christian Heimes217cfd12007-12-02 14:31:20 +00005495 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005496 long value = PyLong_AS_LONG(x);
5497 long max = PyUnicode_GetMax();
5498 if (value < 0 || value > max) {
5499 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00005500 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00005501 Py_DECREF(x);
5502 return -1;
5503 }
5504 *result = x;
5505 return 0;
5506 }
5507 else if (PyUnicode_Check(x)) {
5508 *result = x;
5509 return 0;
5510 }
5511 else {
5512 /* wrong return value */
5513 PyErr_SetString(PyExc_TypeError,
5514 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005515 Py_DECREF(x);
5516 return -1;
5517 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005518}
5519/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00005520 if not reallocate and adjust various state variables.
5521 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005522static
Walter Dörwald4894c302003-10-24 14:25:28 +00005523int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005524 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005525{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005526 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00005527 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005528 /* remember old output position */
5529 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
5530 /* exponentially overallocate to minimize reallocations */
5531 if (requiredsize < 2 * oldsize)
5532 requiredsize = 2 * oldsize;
5533 if (PyUnicode_Resize(outobj, requiredsize) < 0)
5534 return -1;
5535 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005536 }
5537 return 0;
5538}
5539/* lookup the character, put the result in the output string and adjust
5540 various state variables. Return a new reference to the object that
5541 was put in the output buffer in *result, or Py_None, if the mapping was
5542 undefined (in which case no character was written).
5543 The called must decref result.
5544 Return 0 on success, -1 on error. */
5545static
Walter Dörwald4894c302003-10-24 14:25:28 +00005546int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005547 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5548 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005549{
Walter Dörwald4894c302003-10-24 14:25:28 +00005550 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00005551 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005552 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005553 /* not found => default to 1:1 mapping */
5554 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005555 }
5556 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005557 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00005558 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005559 /* no overflow check, because we know that the space is enough */
5560 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005561 }
5562 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005563 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5564 if (repsize==1) {
5565 /* no overflow check, because we know that the space is enough */
5566 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5567 }
5568 else if (repsize!=0) {
5569 /* more than one character */
5570 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5571 (insize - (curinp-startinp)) +
5572 repsize - 1;
5573 if (charmaptranslate_makespace(outobj, outp, requiredsize))
5574 return -1;
5575 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5576 *outp += repsize;
5577 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005578 }
5579 else
Benjamin Peterson29060642009-01-31 22:14:21 +00005580 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005581 return 0;
5582}
5583
5584PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005585 Py_ssize_t size,
5586 PyObject *mapping,
5587 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005588{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005589 /* output object */
5590 PyObject *res = NULL;
5591 /* pointers to the beginning and end+1 of input */
5592 const Py_UNICODE *startp = p;
5593 const Py_UNICODE *endp = p + size;
5594 /* pointer into the output */
5595 Py_UNICODE *str;
5596 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005597 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005598 char *reason = "character maps to <undefined>";
5599 PyObject *errorHandler = NULL;
5600 PyObject *exc = NULL;
5601 /* the following variable is used for caching string comparisons
5602 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5603 * 3=ignore, 4=xmlcharrefreplace */
5604 int known_errorHandler = -1;
5605
Guido van Rossumd57fd912000-03-10 22:53:23 +00005606 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005607 PyErr_BadArgument();
5608 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005609 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005610
5611 /* allocate enough for a simple 1:1 translation without
5612 replacements, if we need more, we'll resize */
5613 res = PyUnicode_FromUnicode(NULL, size);
5614 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005615 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005616 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005617 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005618 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005619
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005620 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005621 /* try to encode it */
5622 PyObject *x = NULL;
5623 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5624 Py_XDECREF(x);
5625 goto onError;
5626 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005627 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00005628 if (x!=Py_None) /* it worked => adjust input pointer */
5629 ++p;
5630 else { /* untranslatable character */
5631 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5632 Py_ssize_t repsize;
5633 Py_ssize_t newpos;
5634 Py_UNICODE *uni2;
5635 /* startpos for collecting untranslatable chars */
5636 const Py_UNICODE *collstart = p;
5637 const Py_UNICODE *collend = p+1;
5638 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005639
Benjamin Peterson29060642009-01-31 22:14:21 +00005640 /* find all untranslatable characters */
5641 while (collend < endp) {
5642 if (charmaptranslate_lookup(*collend, mapping, &x))
5643 goto onError;
5644 Py_XDECREF(x);
5645 if (x!=Py_None)
5646 break;
5647 ++collend;
5648 }
5649 /* cache callback name lookup
5650 * (if not done yet, i.e. it's the first error) */
5651 if (known_errorHandler==-1) {
5652 if ((errors==NULL) || (!strcmp(errors, "strict")))
5653 known_errorHandler = 1;
5654 else if (!strcmp(errors, "replace"))
5655 known_errorHandler = 2;
5656 else if (!strcmp(errors, "ignore"))
5657 known_errorHandler = 3;
5658 else if (!strcmp(errors, "xmlcharrefreplace"))
5659 known_errorHandler = 4;
5660 else
5661 known_errorHandler = 0;
5662 }
5663 switch (known_errorHandler) {
5664 case 1: /* strict */
5665 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005666 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005667 case 2: /* replace */
5668 /* No need to check for space, this is a 1:1 replacement */
5669 for (coll = collstart; coll<collend; ++coll)
5670 *str++ = '?';
5671 /* fall through */
5672 case 3: /* ignore */
5673 p = collend;
5674 break;
5675 case 4: /* xmlcharrefreplace */
5676 /* generate replacement (temporarily (mis)uses p) */
5677 for (p = collstart; p < collend; ++p) {
5678 char buffer[2+29+1+1];
5679 char *cp;
5680 sprintf(buffer, "&#%d;", (int)*p);
5681 if (charmaptranslate_makespace(&res, &str,
5682 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5683 goto onError;
5684 for (cp = buffer; *cp; ++cp)
5685 *str++ = *cp;
5686 }
5687 p = collend;
5688 break;
5689 default:
5690 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5691 reason, startp, size, &exc,
5692 collstart-startp, collend-startp, &newpos);
5693 if (repunicode == NULL)
5694 goto onError;
5695 /* generate replacement */
5696 repsize = PyUnicode_GET_SIZE(repunicode);
5697 if (charmaptranslate_makespace(&res, &str,
5698 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5699 Py_DECREF(repunicode);
5700 goto onError;
5701 }
5702 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5703 *str++ = *uni2;
5704 p = startp + newpos;
5705 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005706 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005707 }
5708 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005709 /* Resize if we allocated to much */
5710 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005711 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005712 if (PyUnicode_Resize(&res, respos) < 0)
5713 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005714 }
5715 Py_XDECREF(exc);
5716 Py_XDECREF(errorHandler);
5717 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005718
Benjamin Peterson29060642009-01-31 22:14:21 +00005719 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005720 Py_XDECREF(res);
5721 Py_XDECREF(exc);
5722 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005723 return NULL;
5724}
5725
5726PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005727 PyObject *mapping,
5728 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005729{
5730 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005731
Guido van Rossumd57fd912000-03-10 22:53:23 +00005732 str = PyUnicode_FromObject(str);
5733 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005734 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005735 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00005736 PyUnicode_GET_SIZE(str),
5737 mapping,
5738 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005739 Py_DECREF(str);
5740 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005741
Benjamin Peterson29060642009-01-31 22:14:21 +00005742 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005743 Py_XDECREF(str);
5744 return NULL;
5745}
Tim Petersced69f82003-09-16 20:30:58 +00005746
Guido van Rossum9e896b32000-04-05 20:11:21 +00005747/* --- Decimal Encoder ---------------------------------------------------- */
5748
5749int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005750 Py_ssize_t length,
5751 char *output,
5752 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005753{
5754 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005755 PyObject *errorHandler = NULL;
5756 PyObject *exc = NULL;
5757 const char *encoding = "decimal";
5758 const char *reason = "invalid decimal Unicode string";
5759 /* the following variable is used for caching string comparisons
5760 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5761 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005762
5763 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005764 PyErr_BadArgument();
5765 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005766 }
5767
5768 p = s;
5769 end = s + length;
5770 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005771 register Py_UNICODE ch = *p;
5772 int decimal;
5773 PyObject *repunicode;
5774 Py_ssize_t repsize;
5775 Py_ssize_t newpos;
5776 Py_UNICODE *uni2;
5777 Py_UNICODE *collstart;
5778 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005779
Benjamin Peterson29060642009-01-31 22:14:21 +00005780 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005781 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00005782 ++p;
5783 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005784 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005785 decimal = Py_UNICODE_TODECIMAL(ch);
5786 if (decimal >= 0) {
5787 *output++ = '0' + decimal;
5788 ++p;
5789 continue;
5790 }
5791 if (0 < ch && ch < 256) {
5792 *output++ = (char)ch;
5793 ++p;
5794 continue;
5795 }
5796 /* All other characters are considered unencodable */
5797 collstart = p;
5798 collend = p+1;
5799 while (collend < end) {
5800 if ((0 < *collend && *collend < 256) ||
5801 !Py_UNICODE_ISSPACE(*collend) ||
5802 Py_UNICODE_TODECIMAL(*collend))
5803 break;
5804 }
5805 /* cache callback name lookup
5806 * (if not done yet, i.e. it's the first error) */
5807 if (known_errorHandler==-1) {
5808 if ((errors==NULL) || (!strcmp(errors, "strict")))
5809 known_errorHandler = 1;
5810 else if (!strcmp(errors, "replace"))
5811 known_errorHandler = 2;
5812 else if (!strcmp(errors, "ignore"))
5813 known_errorHandler = 3;
5814 else if (!strcmp(errors, "xmlcharrefreplace"))
5815 known_errorHandler = 4;
5816 else
5817 known_errorHandler = 0;
5818 }
5819 switch (known_errorHandler) {
5820 case 1: /* strict */
5821 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5822 goto onError;
5823 case 2: /* replace */
5824 for (p = collstart; p < collend; ++p)
5825 *output++ = '?';
5826 /* fall through */
5827 case 3: /* ignore */
5828 p = collend;
5829 break;
5830 case 4: /* xmlcharrefreplace */
5831 /* generate replacement (temporarily (mis)uses p) */
5832 for (p = collstart; p < collend; ++p)
5833 output += sprintf(output, "&#%d;", (int)*p);
5834 p = collend;
5835 break;
5836 default:
5837 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5838 encoding, reason, s, length, &exc,
5839 collstart-s, collend-s, &newpos);
5840 if (repunicode == NULL)
5841 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005842 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00005843 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005844 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
5845 Py_DECREF(repunicode);
5846 goto onError;
5847 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005848 /* generate replacement */
5849 repsize = PyUnicode_GET_SIZE(repunicode);
5850 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5851 Py_UNICODE ch = *uni2;
5852 if (Py_UNICODE_ISSPACE(ch))
5853 *output++ = ' ';
5854 else {
5855 decimal = Py_UNICODE_TODECIMAL(ch);
5856 if (decimal >= 0)
5857 *output++ = '0' + decimal;
5858 else if (0 < ch && ch < 256)
5859 *output++ = (char)ch;
5860 else {
5861 Py_DECREF(repunicode);
5862 raise_encode_exception(&exc, encoding,
5863 s, length, collstart-s, collend-s, reason);
5864 goto onError;
5865 }
5866 }
5867 }
5868 p = s + newpos;
5869 Py_DECREF(repunicode);
5870 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005871 }
5872 /* 0-terminate the output string */
5873 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005874 Py_XDECREF(exc);
5875 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005876 return 0;
5877
Benjamin Peterson29060642009-01-31 22:14:21 +00005878 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005879 Py_XDECREF(exc);
5880 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005881 return -1;
5882}
5883
Guido van Rossumd57fd912000-03-10 22:53:23 +00005884/* --- Helpers ------------------------------------------------------------ */
5885
Eric Smith8c663262007-08-25 02:26:07 +00005886#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005887#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00005888
Thomas Wouters477c8d52006-05-27 19:21:47 +00005889#include "stringlib/count.h"
5890#include "stringlib/find.h"
5891#include "stringlib/partition.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00005892#include "stringlib/split.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005893
Eric Smith5807c412008-05-11 21:00:57 +00005894#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00005895#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00005896#include "stringlib/localeutil.h"
5897
/* Helper macro to fix up start/end slice values in place against a
   sequence of length len:
     - an end greater than len is clipped to len;
     - a negative start or end has len added to it and, if it is still
       negative afterwards, is set to 0.
   start is not clipped against len, so start may be greater than end on
   exit; callers have to cope with that themselves.
   start and end must be modifiable lvalues and, like len, are evaluated
   more than once.  The body is wrapped in do { } while (0) so that the
   macro expands to exactly one statement and can be used safely in an
   unbraced if/else; the invocation must be followed by a semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00005912
Martin v. Löwis18e16552006-02-15 17:27:45 +00005913Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005914 PyObject *substr,
5915 Py_ssize_t start,
5916 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005917{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005918 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005919 PyUnicodeObject* str_obj;
5920 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005921
Thomas Wouters477c8d52006-05-27 19:21:47 +00005922 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5923 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00005924 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005925 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5926 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005927 Py_DECREF(str_obj);
5928 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005929 }
Tim Petersced69f82003-09-16 20:30:58 +00005930
Antoine Pitrouf2c54842010-01-13 08:07:53 +00005931 ADJUST_INDICES(start, end, str_obj->length);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005932 result = stringlib_count(
Antoine Pitrouf2c54842010-01-13 08:07:53 +00005933 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
5934 PY_SSIZE_T_MAX
Thomas Wouters477c8d52006-05-27 19:21:47 +00005935 );
5936
5937 Py_DECREF(sub_obj);
5938 Py_DECREF(str_obj);
5939
Guido van Rossumd57fd912000-03-10 22:53:23 +00005940 return result;
5941}
5942
Martin v. Löwis18e16552006-02-15 17:27:45 +00005943Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005944 PyObject *sub,
5945 Py_ssize_t start,
5946 Py_ssize_t end,
5947 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005948{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005949 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005950
Guido van Rossumd57fd912000-03-10 22:53:23 +00005951 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005952 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00005953 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005954 sub = PyUnicode_FromObject(sub);
5955 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005956 Py_DECREF(str);
5957 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005958 }
Tim Petersced69f82003-09-16 20:30:58 +00005959
Thomas Wouters477c8d52006-05-27 19:21:47 +00005960 if (direction > 0)
5961 result = stringlib_find_slice(
5962 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5963 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5964 start, end
5965 );
5966 else
5967 result = stringlib_rfind_slice(
5968 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5969 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5970 start, end
5971 );
5972
Guido van Rossumd57fd912000-03-10 22:53:23 +00005973 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005974 Py_DECREF(sub);
5975
Guido van Rossumd57fd912000-03-10 22:53:23 +00005976 return result;
5977}
5978
Tim Petersced69f82003-09-16 20:30:58 +00005979static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005980int tailmatch(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00005981 PyUnicodeObject *substring,
5982 Py_ssize_t start,
5983 Py_ssize_t end,
5984 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005985{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005986 if (substring->length == 0)
5987 return 1;
5988
Antoine Pitrouf2c54842010-01-13 08:07:53 +00005989 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005990 end -= substring->length;
5991 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00005992 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005993
5994 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005995 if (Py_UNICODE_MATCH(self, end, substring))
5996 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005997 } else {
5998 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00005999 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006000 }
6001
6002 return 0;
6003}
6004
Martin v. Löwis18e16552006-02-15 17:27:45 +00006005Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006006 PyObject *substr,
6007 Py_ssize_t start,
6008 Py_ssize_t end,
6009 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006010{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006011 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006012
Guido van Rossumd57fd912000-03-10 22:53:23 +00006013 str = PyUnicode_FromObject(str);
6014 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006015 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006016 substr = PyUnicode_FromObject(substr);
6017 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006018 Py_DECREF(str);
6019 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006020 }
Tim Petersced69f82003-09-16 20:30:58 +00006021
Guido van Rossumd57fd912000-03-10 22:53:23 +00006022 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006023 (PyUnicodeObject *)substr,
6024 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006025 Py_DECREF(str);
6026 Py_DECREF(substr);
6027 return result;
6028}
6029
Guido van Rossumd57fd912000-03-10 22:53:23 +00006030/* Apply fixfct filter to the Unicode object self and return a
6031 reference to the modified object */
6032
Tim Petersced69f82003-09-16 20:30:58 +00006033static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006034PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006035 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006036{
6037
6038 PyUnicodeObject *u;
6039
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006040 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006041 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006042 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006043
6044 Py_UNICODE_COPY(u->str, self->str, self->length);
6045
Tim Peters7a29bd52001-09-12 03:03:31 +00006046 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006047 /* fixfct should return TRUE if it modified the buffer. If
6048 FALSE, return a reference to the original buffer instead
6049 (to save space, not time) */
6050 Py_INCREF(self);
6051 Py_DECREF(u);
6052 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006053 }
6054 return (PyObject*) u;
6055}
6056
Tim Petersced69f82003-09-16 20:30:58 +00006057static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006058int fixupper(PyUnicodeObject *self)
6059{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006060 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006061 Py_UNICODE *s = self->str;
6062 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006063
Guido van Rossumd57fd912000-03-10 22:53:23 +00006064 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006065 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006066
Benjamin Peterson29060642009-01-31 22:14:21 +00006067 ch = Py_UNICODE_TOUPPER(*s);
6068 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006069 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006070 *s = ch;
6071 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006072 s++;
6073 }
6074
6075 return status;
6076}
6077
Tim Petersced69f82003-09-16 20:30:58 +00006078static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006079int fixlower(PyUnicodeObject *self)
6080{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006081 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006082 Py_UNICODE *s = self->str;
6083 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006084
Guido van Rossumd57fd912000-03-10 22:53:23 +00006085 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006086 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006087
Benjamin Peterson29060642009-01-31 22:14:21 +00006088 ch = Py_UNICODE_TOLOWER(*s);
6089 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006090 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006091 *s = ch;
6092 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006093 s++;
6094 }
6095
6096 return status;
6097}
6098
Tim Petersced69f82003-09-16 20:30:58 +00006099static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006100int fixswapcase(PyUnicodeObject *self)
6101{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006102 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006103 Py_UNICODE *s = self->str;
6104 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006105
Guido van Rossumd57fd912000-03-10 22:53:23 +00006106 while (len-- > 0) {
6107 if (Py_UNICODE_ISUPPER(*s)) {
6108 *s = Py_UNICODE_TOLOWER(*s);
6109 status = 1;
6110 } else if (Py_UNICODE_ISLOWER(*s)) {
6111 *s = Py_UNICODE_TOUPPER(*s);
6112 status = 1;
6113 }
6114 s++;
6115 }
6116
6117 return status;
6118}
6119
Tim Petersced69f82003-09-16 20:30:58 +00006120static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006121int fixcapitalize(PyUnicodeObject *self)
6122{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006123 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006124 Py_UNICODE *s = self->str;
6125 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006126
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006127 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006128 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006129 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006130 *s = Py_UNICODE_TOUPPER(*s);
6131 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006132 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006133 s++;
6134 while (--len > 0) {
6135 if (Py_UNICODE_ISUPPER(*s)) {
6136 *s = Py_UNICODE_TOLOWER(*s);
6137 status = 1;
6138 }
6139 s++;
6140 }
6141 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006142}
6143
6144static
6145int fixtitle(PyUnicodeObject *self)
6146{
6147 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6148 register Py_UNICODE *e;
6149 int previous_is_cased;
6150
6151 /* Shortcut for single character strings */
6152 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006153 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6154 if (*p != ch) {
6155 *p = ch;
6156 return 1;
6157 }
6158 else
6159 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006160 }
Tim Petersced69f82003-09-16 20:30:58 +00006161
Guido van Rossumd57fd912000-03-10 22:53:23 +00006162 e = p + PyUnicode_GET_SIZE(self);
6163 previous_is_cased = 0;
6164 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006165 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006166
Benjamin Peterson29060642009-01-31 22:14:21 +00006167 if (previous_is_cased)
6168 *p = Py_UNICODE_TOLOWER(ch);
6169 else
6170 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006171
Benjamin Peterson29060642009-01-31 22:14:21 +00006172 if (Py_UNICODE_ISLOWER(ch) ||
6173 Py_UNICODE_ISUPPER(ch) ||
6174 Py_UNICODE_ISTITLE(ch))
6175 previous_is_cased = 1;
6176 else
6177 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006178 }
6179 return 1;
6180}
6181
Tim Peters8ce9f162004-08-27 01:49:32 +00006182PyObject *
6183PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006184{
Skip Montanaro6543b452004-09-16 03:28:13 +00006185 const Py_UNICODE blank = ' ';
6186 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006187 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006188 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006189 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6190 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006191 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6192 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006193 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006194 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006195
Tim Peters05eba1f2004-08-27 21:32:02 +00006196 fseq = PySequence_Fast(seq, "");
6197 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006198 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006199 }
6200
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006201 /* NOTE: the following code can't call back into Python code,
6202 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006203 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006204
Tim Peters05eba1f2004-08-27 21:32:02 +00006205 seqlen = PySequence_Fast_GET_SIZE(fseq);
6206 /* If empty sequence, return u"". */
6207 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006208 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6209 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006210 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006211 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006212 /* If singleton sequence with an exact Unicode, return that. */
6213 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006214 item = items[0];
6215 if (PyUnicode_CheckExact(item)) {
6216 Py_INCREF(item);
6217 res = (PyUnicodeObject *)item;
6218 goto Done;
6219 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006220 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006221 else {
6222 /* Set up sep and seplen */
6223 if (separator == NULL) {
6224 sep = &blank;
6225 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006226 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006227 else {
6228 if (!PyUnicode_Check(separator)) {
6229 PyErr_Format(PyExc_TypeError,
6230 "separator: expected str instance,"
6231 " %.80s found",
6232 Py_TYPE(separator)->tp_name);
6233 goto onError;
6234 }
6235 sep = PyUnicode_AS_UNICODE(separator);
6236 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006237 }
6238 }
6239
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006240 /* There are at least two things to join, or else we have a subclass
6241 * of str in the sequence.
6242 * Do a pre-pass to figure out the total amount of space we'll
6243 * need (sz), and see whether all argument are strings.
6244 */
6245 sz = 0;
6246 for (i = 0; i < seqlen; i++) {
6247 const Py_ssize_t old_sz = sz;
6248 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006249 if (!PyUnicode_Check(item)) {
6250 PyErr_Format(PyExc_TypeError,
6251 "sequence item %zd: expected str instance,"
6252 " %.80s found",
6253 i, Py_TYPE(item)->tp_name);
6254 goto onError;
6255 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006256 sz += PyUnicode_GET_SIZE(item);
6257 if (i != 0)
6258 sz += seplen;
6259 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6260 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006261 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006262 goto onError;
6263 }
6264 }
Tim Petersced69f82003-09-16 20:30:58 +00006265
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006266 res = _PyUnicode_New(sz);
6267 if (res == NULL)
6268 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006269
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006270 /* Catenate everything. */
6271 res_p = PyUnicode_AS_UNICODE(res);
6272 for (i = 0; i < seqlen; ++i) {
6273 Py_ssize_t itemlen;
6274 item = items[i];
6275 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006276 /* Copy item, and maybe the separator. */
6277 if (i) {
6278 Py_UNICODE_COPY(res_p, sep, seplen);
6279 res_p += seplen;
6280 }
6281 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6282 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006283 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006284
Benjamin Peterson29060642009-01-31 22:14:21 +00006285 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006286 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006287 return (PyObject *)res;
6288
Benjamin Peterson29060642009-01-31 22:14:21 +00006289 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006290 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006291 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006292 return NULL;
6293}
6294
Tim Petersced69f82003-09-16 20:30:58 +00006295static
6296PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006297 Py_ssize_t left,
6298 Py_ssize_t right,
6299 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006300{
6301 PyUnicodeObject *u;
6302
6303 if (left < 0)
6304 left = 0;
6305 if (right < 0)
6306 right = 0;
6307
Tim Peters7a29bd52001-09-12 03:03:31 +00006308 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006309 Py_INCREF(self);
6310 return self;
6311 }
6312
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006313 if (left > PY_SSIZE_T_MAX - self->length ||
6314 right > PY_SSIZE_T_MAX - (left + self->length)) {
6315 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6316 return NULL;
6317 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006318 u = _PyUnicode_New(left + self->length + right);
6319 if (u) {
6320 if (left)
6321 Py_UNICODE_FILL(u->str, fill, left);
6322 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6323 if (right)
6324 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6325 }
6326
6327 return u;
6328}
6329
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006330PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006331{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006332 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006333
6334 string = PyUnicode_FromObject(string);
6335 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006336 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006337
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006338 list = stringlib_splitlines(
6339 (PyObject*) string, PyUnicode_AS_UNICODE(string),
6340 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006341
6342 Py_DECREF(string);
6343 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006344}
6345
Tim Petersced69f82003-09-16 20:30:58 +00006346static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006347PyObject *split(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006348 PyUnicodeObject *substring,
6349 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006350{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006351 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006352 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006353
Guido van Rossumd57fd912000-03-10 22:53:23 +00006354 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006355 return stringlib_split_whitespace(
6356 (PyObject*) self, self->str, self->length, maxcount
6357 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006358
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006359 return stringlib_split(
6360 (PyObject*) self, self->str, self->length,
6361 substring->str, substring->length,
6362 maxcount
6363 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006364}
6365
Tim Petersced69f82003-09-16 20:30:58 +00006366static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006367PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006368 PyUnicodeObject *substring,
6369 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006370{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006371 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006372 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006373
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006374 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006375 return stringlib_rsplit_whitespace(
6376 (PyObject*) self, self->str, self->length, maxcount
6377 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006378
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006379 return stringlib_rsplit(
6380 (PyObject*) self, self->str, self->length,
6381 substring->str, substring->length,
6382 maxcount
6383 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006384}
6385
6386static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006387PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006388 PyUnicodeObject *str1,
6389 PyUnicodeObject *str2,
6390 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006391{
6392 PyUnicodeObject *u;
6393
6394 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006395 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006396 else if (maxcount == 0 || self->length == 0)
6397 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006398
Thomas Wouters477c8d52006-05-27 19:21:47 +00006399 if (str1->length == str2->length) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00006400 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006401 /* same length */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006402 if (str1->length == 0)
6403 goto nothing;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006404 if (str1->length == 1) {
6405 /* replace characters */
6406 Py_UNICODE u1, u2;
6407 if (!findchar(self->str, self->length, str1->str[0]))
6408 goto nothing;
6409 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6410 if (!u)
6411 return NULL;
6412 Py_UNICODE_COPY(u->str, self->str, self->length);
6413 u1 = str1->str[0];
6414 u2 = str2->str[0];
6415 for (i = 0; i < u->length; i++)
6416 if (u->str[i] == u1) {
6417 if (--maxcount < 0)
6418 break;
6419 u->str[i] = u2;
6420 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006421 } else {
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006422 i = stringlib_find(
6423 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00006424 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006425 if (i < 0)
6426 goto nothing;
6427 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6428 if (!u)
6429 return NULL;
6430 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006431
6432 /* change everything in-place, starting with this one */
6433 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6434 i += str1->length;
6435
6436 while ( --maxcount > 0) {
6437 i = stringlib_find(self->str+i, self->length-i,
6438 str1->str, str1->length,
6439 i);
6440 if (i == -1)
6441 break;
6442 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6443 i += str1->length;
6444 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006445 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006446 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006447
6448 Py_ssize_t n, i, j, e;
6449 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006450 Py_UNICODE *p;
6451
6452 /* replace strings */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006453 n = stringlib_count(self->str, self->length, str1->str, str1->length,
6454 maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006455 if (n == 0)
6456 goto nothing;
6457 /* new_size = self->length + n * (str2->length - str1->length)); */
6458 delta = (str2->length - str1->length);
6459 if (delta == 0) {
6460 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006461 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006462 product = n * (str2->length - str1->length);
6463 if ((product / (str2->length - str1->length)) != n) {
6464 PyErr_SetString(PyExc_OverflowError,
6465 "replace string is too long");
6466 return NULL;
6467 }
6468 new_size = self->length + product;
6469 if (new_size < 0) {
6470 PyErr_SetString(PyExc_OverflowError,
6471 "replace string is too long");
6472 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006473 }
6474 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006475 u = _PyUnicode_New(new_size);
6476 if (!u)
6477 return NULL;
6478 i = 0;
6479 p = u->str;
6480 e = self->length - str1->length;
6481 if (str1->length > 0) {
6482 while (n-- > 0) {
6483 /* look for next match */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006484 j = stringlib_find(self->str+i, self->length-i,
6485 str1->str, str1->length,
6486 i);
6487 if (j == -1)
6488 break;
6489 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006490 /* copy unchanged part [i:j] */
6491 Py_UNICODE_COPY(p, self->str+i, j-i);
6492 p += j - i;
6493 }
6494 /* copy substitution string */
6495 if (str2->length > 0) {
6496 Py_UNICODE_COPY(p, str2->str, str2->length);
6497 p += str2->length;
6498 }
6499 i = j + str1->length;
6500 }
6501 if (i < self->length)
6502 /* copy tail [i:] */
6503 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6504 } else {
6505 /* interleave */
6506 while (n > 0) {
6507 Py_UNICODE_COPY(p, str2->str, str2->length);
6508 p += str2->length;
6509 if (--n <= 0)
6510 break;
6511 *p++ = self->str[i++];
6512 }
6513 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6514 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006515 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006516 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006517
Benjamin Peterson29060642009-01-31 22:14:21 +00006518 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00006519 /* nothing to replace; return original string (when possible) */
6520 if (PyUnicode_CheckExact(self)) {
6521 Py_INCREF(self);
6522 return (PyObject *) self;
6523 }
6524 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006525}
6526
6527/* --- Unicode Object Methods --------------------------------------------- */
6528
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006529PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006530 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006531\n\
6532Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006533characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006534
6535static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006536unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006537{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006538 return fixup(self, fixtitle);
6539}
6540
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006541PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006542 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006543\n\
6544Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006545have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006546
6547static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006548unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006549{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006550 return fixup(self, fixcapitalize);
6551}
6552
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the surrounding #if 0).  Splits self on
   whitespace, replaces each word in the list by its fixcapitalize'd
   version and joins the words with the default separator.  Returns NULL
   if the split, any fixup() call or the join fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *word = PyList_GET_ITEM(words, idx);
        PyObject *capped = fixup((PyUnicodeObject *)word, fixcapitalize);

        if (capped == NULL)
            goto done;
        /* Drop the list's reference to the old word before the slot is
           overwritten with the new one. */
        Py_DECREF(word);
        PyList_SET_ITEM(words, idx, capped);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

  done:
    Py_DECREF(words);
    return result;
}
#endif
6590
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006591/* Argument converter. Coerces to a single unicode character */
6592
6593static int
6594convert_uc(PyObject *obj, void *addr)
6595{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006596 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6597 PyObject *uniobj;
6598 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006599
Benjamin Peterson14339b62009-01-31 16:36:08 +00006600 uniobj = PyUnicode_FromObject(obj);
6601 if (uniobj == NULL) {
6602 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006603 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006604 return 0;
6605 }
6606 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6607 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006608 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006609 Py_DECREF(uniobj);
6610 return 0;
6611 }
6612 unistr = PyUnicode_AS_UNICODE(uniobj);
6613 *fillcharloc = unistr[0];
6614 Py_DECREF(uniobj);
6615 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006616}
6617
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006618PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006619 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006620\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006621Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006622done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006623
6624static PyObject *
6625unicode_center(PyUnicodeObject *self, PyObject *args)
6626{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006627 Py_ssize_t marg, left;
6628 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006629 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006630
Thomas Woutersde017742006-02-16 19:34:37 +00006631 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006632 return NULL;
6633
Tim Peters7a29bd52001-09-12 03:03:31 +00006634 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006635 Py_INCREF(self);
6636 return (PyObject*) self;
6637 }
6638
6639 marg = width - self->length;
6640 left = marg / 2 + (marg & width & 1);
6641
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006642 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006643}
6644
Marc-André Lemburge5034372000-08-08 08:04:29 +00006645#if 0
6646
6647/* This code should go into some future Unicode collation support
6648 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00006649 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006650
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006651/* speedy UTF-16 code point order comparison */
6652/* gleaned from: */
6653/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6654
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006655static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006656{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006657 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006658 0, 0, 0, 0, 0, 0, 0, 0,
6659 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006660 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006661};
6662
Guido van Rossumd57fd912000-03-10 22:53:23 +00006663static int
6664unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6665{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006666 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006667
Guido van Rossumd57fd912000-03-10 22:53:23 +00006668 Py_UNICODE *s1 = str1->str;
6669 Py_UNICODE *s2 = str2->str;
6670
6671 len1 = str1->length;
6672 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006673
Guido van Rossumd57fd912000-03-10 22:53:23 +00006674 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006675 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006676
6677 c1 = *s1++;
6678 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006679
Benjamin Peterson29060642009-01-31 22:14:21 +00006680 if (c1 > (1<<11) * 26)
6681 c1 += utf16Fixup[c1>>11];
6682 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006683 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006684 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006685
6686 if (c1 != c2)
6687 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006688
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006689 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006690 }
6691
6692 return (len1 < len2) ? -1 : (len1 != len2);
6693}
6694
Marc-André Lemburge5034372000-08-08 08:04:29 +00006695#else
6696
6697static int
6698unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6699{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006700 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006701
6702 Py_UNICODE *s1 = str1->str;
6703 Py_UNICODE *s2 = str2->str;
6704
6705 len1 = str1->length;
6706 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006707
Marc-André Lemburge5034372000-08-08 08:04:29 +00006708 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006709 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006710
Fredrik Lundh45714e92001-06-26 16:39:36 +00006711 c1 = *s1++;
6712 c2 = *s2++;
6713
6714 if (c1 != c2)
6715 return (c1 < c2) ? -1 : 1;
6716
Marc-André Lemburge5034372000-08-08 08:04:29 +00006717 len1--; len2--;
6718 }
6719
6720 return (len1 < len2) ? -1 : (len1 != len2);
6721}
6722
6723#endif
6724
Guido van Rossumd57fd912000-03-10 22:53:23 +00006725int PyUnicode_Compare(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00006726 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006727{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006728 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6729 return unicode_compare((PyUnicodeObject *)left,
6730 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006731 PyErr_Format(PyExc_TypeError,
6732 "Can't compare %.100s and %.100s",
6733 left->ob_type->tp_name,
6734 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006735 return -1;
6736}
6737
Martin v. Löwis5b222132007-06-10 09:51:05 +00006738int
6739PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6740{
6741 int i;
6742 Py_UNICODE *id;
6743 assert(PyUnicode_Check(uni));
6744 id = PyUnicode_AS_UNICODE(uni);
6745 /* Compare Unicode string and source character set string */
6746 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00006747 if (id[i] != str[i])
6748 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00006749 /* This check keeps Python strings that end in '\0' from comparing equal
6750 to C strings identical up to that point. */
6751 if (PyUnicode_GET_SIZE(uni) != i)
6752 /* We'll say the Python string is longer. */
6753 return 1;
Martin v. Löwis5b222132007-06-10 09:51:05 +00006754 if (id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006755 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006756 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006757 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006758 return 0;
6759}
6760
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006761
Benjamin Peterson29060642009-01-31 22:14:21 +00006762#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00006763 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006764
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006765PyObject *PyUnicode_RichCompare(PyObject *left,
6766 PyObject *right,
6767 int op)
6768{
6769 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006770
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006771 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
6772 PyObject *v;
6773 if (((PyUnicodeObject *) left)->length !=
6774 ((PyUnicodeObject *) right)->length) {
6775 if (op == Py_EQ) {
6776 Py_INCREF(Py_False);
6777 return Py_False;
6778 }
6779 if (op == Py_NE) {
6780 Py_INCREF(Py_True);
6781 return Py_True;
6782 }
6783 }
6784 if (left == right)
6785 result = 0;
6786 else
6787 result = unicode_compare((PyUnicodeObject *)left,
6788 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006789
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006790 /* Convert the return value to a Boolean */
6791 switch (op) {
6792 case Py_EQ:
6793 v = TEST_COND(result == 0);
6794 break;
6795 case Py_NE:
6796 v = TEST_COND(result != 0);
6797 break;
6798 case Py_LE:
6799 v = TEST_COND(result <= 0);
6800 break;
6801 case Py_GE:
6802 v = TEST_COND(result >= 0);
6803 break;
6804 case Py_LT:
6805 v = TEST_COND(result == -1);
6806 break;
6807 case Py_GT:
6808 v = TEST_COND(result == 1);
6809 break;
6810 default:
6811 PyErr_BadArgument();
6812 return NULL;
6813 }
6814 Py_INCREF(v);
6815 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006816 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006817
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006818 Py_INCREF(Py_NotImplemented);
6819 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006820}
6821
Guido van Rossum403d68b2000-03-13 15:55:09 +00006822int PyUnicode_Contains(PyObject *container,
Benjamin Peterson29060642009-01-31 22:14:21 +00006823 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00006824{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006825 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006826 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006827
6828 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006829 sub = PyUnicode_FromObject(element);
6830 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006831 PyErr_Format(PyExc_TypeError,
6832 "'in <string>' requires string as left operand, not %s",
6833 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006834 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006835 }
6836
Thomas Wouters477c8d52006-05-27 19:21:47 +00006837 str = PyUnicode_FromObject(container);
6838 if (!str) {
6839 Py_DECREF(sub);
6840 return -1;
6841 }
6842
6843 result = stringlib_contains_obj(str, sub);
6844
6845 Py_DECREF(str);
6846 Py_DECREF(sub);
6847
Guido van Rossum403d68b2000-03-13 15:55:09 +00006848 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006849}
6850
Guido van Rossumd57fd912000-03-10 22:53:23 +00006851/* Concat to string or Unicode object giving a new Unicode object. */
6852
6853PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00006854 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006855{
6856 PyUnicodeObject *u = NULL, *v = NULL, *w;
6857
6858 /* Coerce the two arguments */
6859 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6860 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006861 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006862 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6863 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006864 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006865
6866 /* Shortcuts */
6867 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006868 Py_DECREF(v);
6869 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006870 }
6871 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006872 Py_DECREF(u);
6873 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006874 }
6875
6876 /* Concat the two Unicode strings */
6877 w = _PyUnicode_New(u->length + v->length);
6878 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006879 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006880 Py_UNICODE_COPY(w->str, u->str, u->length);
6881 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6882
6883 Py_DECREF(u);
6884 Py_DECREF(v);
6885 return (PyObject *)w;
6886
Benjamin Peterson29060642009-01-31 22:14:21 +00006887 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006888 Py_XDECREF(u);
6889 Py_XDECREF(v);
6890 return NULL;
6891}
6892
Walter Dörwald1ab83302007-05-18 17:15:44 +00006893void
6894PyUnicode_Append(PyObject **pleft, PyObject *right)
6895{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006896 PyObject *new;
6897 if (*pleft == NULL)
6898 return;
6899 if (right == NULL || !PyUnicode_Check(*pleft)) {
6900 Py_DECREF(*pleft);
6901 *pleft = NULL;
6902 return;
6903 }
6904 new = PyUnicode_Concat(*pleft, right);
6905 Py_DECREF(*pleft);
6906 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00006907}
6908
6909void
6910PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6911{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006912 PyUnicode_Append(pleft, right);
6913 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00006914}
6915
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006916PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006917 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006918\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006919Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006920string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006921interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006922
6923static PyObject *
6924unicode_count(PyUnicodeObject *self, PyObject *args)
6925{
6926 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006927 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006928 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006929 PyObject *result;
6930
Guido van Rossumb8872e62000-05-09 14:14:27 +00006931 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson29060642009-01-31 22:14:21 +00006932 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006933 return NULL;
6934
6935 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006936 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006937 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006938 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006939
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006940 ADJUST_INDICES(start, end, self->length);
Christian Heimes217cfd12007-12-02 14:31:20 +00006941 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006942 stringlib_count(self->str + start, end - start,
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006943 substring->str, substring->length,
6944 PY_SSIZE_T_MAX)
Thomas Wouters477c8d52006-05-27 19:21:47 +00006945 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006946
6947 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006948
Guido van Rossumd57fd912000-03-10 22:53:23 +00006949 return result;
6950}
6951
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006952PyDoc_STRVAR(encode__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006953 "S.encode([encoding[, errors]]) -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006954\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00006955Encode S using the codec registered for encoding. encoding defaults\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006956to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006957handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006958a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6959'xmlcharrefreplace' as well as any other name registered with\n\
6960codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006961
6962static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00006963unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006964{
Benjamin Peterson308d6372009-09-18 21:42:35 +00006965 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00006966 char *encoding = NULL;
6967 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006968 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00006969
Benjamin Peterson308d6372009-09-18 21:42:35 +00006970 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
6971 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006972 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00006973 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006974 if (v == NULL)
6975 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00006976 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006977 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006978 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006979 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00006980 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006981 Py_DECREF(v);
6982 return NULL;
6983 }
6984 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006985
Benjamin Peterson29060642009-01-31 22:14:21 +00006986 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006987 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006988}
6989
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006990PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006991 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006992\n\
6993Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006994If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006995
6996static PyObject*
6997unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6998{
6999 Py_UNICODE *e;
7000 Py_UNICODE *p;
7001 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007002 Py_UNICODE *qe;
7003 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007004 PyUnicodeObject *u;
7005 int tabsize = 8;
7006
7007 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007008 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007009
Thomas Wouters7e474022000-07-16 12:04:32 +00007010 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007011 i = 0; /* chars up to and including most recent \n or \r */
7012 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7013 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007014 for (p = self->str; p < e; p++)
7015 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007016 if (tabsize > 0) {
7017 incr = tabsize - (j % tabsize); /* cannot overflow */
7018 if (j > PY_SSIZE_T_MAX - incr)
7019 goto overflow1;
7020 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007021 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007022 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007023 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007024 if (j > PY_SSIZE_T_MAX - 1)
7025 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007026 j++;
7027 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007028 if (i > PY_SSIZE_T_MAX - j)
7029 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007030 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007031 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007032 }
7033 }
7034
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007035 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007036 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007037
Guido van Rossumd57fd912000-03-10 22:53:23 +00007038 /* Second pass: create output string and fill it */
7039 u = _PyUnicode_New(i + j);
7040 if (!u)
7041 return NULL;
7042
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007043 j = 0; /* same as in first pass */
7044 q = u->str; /* next output char */
7045 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007046
7047 for (p = self->str; p < e; p++)
7048 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007049 if (tabsize > 0) {
7050 i = tabsize - (j % tabsize);
7051 j += i;
7052 while (i--) {
7053 if (q >= qe)
7054 goto overflow2;
7055 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007056 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007057 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007058 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007059 else {
7060 if (q >= qe)
7061 goto overflow2;
7062 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007063 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007064 if (*p == '\n' || *p == '\r')
7065 j = 0;
7066 }
7067
7068 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007069
7070 overflow2:
7071 Py_DECREF(u);
7072 overflow1:
7073 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7074 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007075}
7076
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007077PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007078 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007079\n\
7080Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007081such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007082arguments start and end are interpreted as in slice notation.\n\
7083\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007084Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007085
7086static PyObject *
7087unicode_find(PyUnicodeObject *self, PyObject *args)
7088{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007089 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007090 Py_ssize_t start;
7091 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007092 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007093
Christian Heimes9cd17752007-11-18 19:35:23 +00007094 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007095 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007096
Thomas Wouters477c8d52006-05-27 19:21:47 +00007097 result = stringlib_find_slice(
7098 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7099 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7100 start, end
7101 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007102
7103 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007104
Christian Heimes217cfd12007-12-02 14:31:20 +00007105 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007106}
7107
7108static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007109unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007110{
7111 if (index < 0 || index >= self->length) {
7112 PyErr_SetString(PyExc_IndexError, "string index out of range");
7113 return NULL;
7114 }
7115
7116 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7117}
7118
Guido van Rossumc2504932007-09-18 19:42:40 +00007119/* Believe it or not, this produces the same value for ASCII strings
7120 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007121static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007122unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007123{
Guido van Rossumc2504932007-09-18 19:42:40 +00007124 Py_ssize_t len;
7125 Py_UNICODE *p;
7126 long x;
7127
7128 if (self->hash != -1)
7129 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007130 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007131 p = self->str;
7132 x = *p << 7;
7133 while (--len >= 0)
7134 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007135 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007136 if (x == -1)
7137 x = -2;
7138 self->hash = x;
7139 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007140}
7141
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007142PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007143 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007144\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007145Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007146
7147static PyObject *
7148unicode_index(PyUnicodeObject *self, PyObject *args)
7149{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007150 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007151 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007152 Py_ssize_t start;
7153 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007154
Christian Heimes9cd17752007-11-18 19:35:23 +00007155 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007156 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007157
Thomas Wouters477c8d52006-05-27 19:21:47 +00007158 result = stringlib_find_slice(
7159 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7160 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7161 start, end
7162 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007163
7164 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007165
Guido van Rossumd57fd912000-03-10 22:53:23 +00007166 if (result < 0) {
7167 PyErr_SetString(PyExc_ValueError, "substring not found");
7168 return NULL;
7169 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007170
Christian Heimes217cfd12007-12-02 14:31:20 +00007171 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007172}
7173
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007174PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007175 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007176\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007177Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007178at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007179
7180static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007181unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007182{
7183 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7184 register const Py_UNICODE *e;
7185 int cased;
7186
Guido van Rossumd57fd912000-03-10 22:53:23 +00007187 /* Shortcut for single character strings */
7188 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007189 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007190
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007191 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007192 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007193 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007194
Guido van Rossumd57fd912000-03-10 22:53:23 +00007195 e = p + PyUnicode_GET_SIZE(self);
7196 cased = 0;
7197 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007198 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007199
Benjamin Peterson29060642009-01-31 22:14:21 +00007200 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7201 return PyBool_FromLong(0);
7202 else if (!cased && Py_UNICODE_ISLOWER(ch))
7203 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007204 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007205 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007206}
7207
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007208PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007209 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007210\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007211Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007212at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007213
7214static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007215unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007216{
7217 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7218 register const Py_UNICODE *e;
7219 int cased;
7220
Guido van Rossumd57fd912000-03-10 22:53:23 +00007221 /* Shortcut for single character strings */
7222 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007223 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007224
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007225 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007226 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007227 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007228
Guido van Rossumd57fd912000-03-10 22:53:23 +00007229 e = p + PyUnicode_GET_SIZE(self);
7230 cased = 0;
7231 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007232 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007233
Benjamin Peterson29060642009-01-31 22:14:21 +00007234 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7235 return PyBool_FromLong(0);
7236 else if (!cased && Py_UNICODE_ISUPPER(ch))
7237 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007238 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007239 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007240}
7241
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007242PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007243 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007244\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007245Return True if S is a titlecased string and there is at least one\n\
7246character in S, i.e. upper- and titlecase characters may only\n\
7247follow uncased characters and lowercase characters only cased ones.\n\
7248Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007249
7250static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007251unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007252{
7253 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7254 register const Py_UNICODE *e;
7255 int cased, previous_is_cased;
7256
Guido van Rossumd57fd912000-03-10 22:53:23 +00007257 /* Shortcut for single character strings */
7258 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007259 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7260 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007261
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007262 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007263 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007264 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007265
Guido van Rossumd57fd912000-03-10 22:53:23 +00007266 e = p + PyUnicode_GET_SIZE(self);
7267 cased = 0;
7268 previous_is_cased = 0;
7269 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007270 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007271
Benjamin Peterson29060642009-01-31 22:14:21 +00007272 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7273 if (previous_is_cased)
7274 return PyBool_FromLong(0);
7275 previous_is_cased = 1;
7276 cased = 1;
7277 }
7278 else if (Py_UNICODE_ISLOWER(ch)) {
7279 if (!previous_is_cased)
7280 return PyBool_FromLong(0);
7281 previous_is_cased = 1;
7282 cased = 1;
7283 }
7284 else
7285 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007286 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007287 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007288}
7289
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007290PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007291 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007292\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007293Return True if all characters in S are whitespace\n\
7294and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007295
7296static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007297unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007298{
7299 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7300 register const Py_UNICODE *e;
7301
Guido van Rossumd57fd912000-03-10 22:53:23 +00007302 /* Shortcut for single character strings */
7303 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007304 Py_UNICODE_ISSPACE(*p))
7305 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007306
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007307 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007308 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007309 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007310
Guido van Rossumd57fd912000-03-10 22:53:23 +00007311 e = p + PyUnicode_GET_SIZE(self);
7312 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007313 if (!Py_UNICODE_ISSPACE(*p))
7314 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007315 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007316 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007317}
7318
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007319PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007320 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007321\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007322Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007323and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007324
7325static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007326unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007327{
7328 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7329 register const Py_UNICODE *e;
7330
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007331 /* Shortcut for single character strings */
7332 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007333 Py_UNICODE_ISALPHA(*p))
7334 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007335
7336 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007337 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007338 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007339
7340 e = p + PyUnicode_GET_SIZE(self);
7341 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007342 if (!Py_UNICODE_ISALPHA(*p))
7343 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007344 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007345 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007346}
7347
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007348PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007349 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007350\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007351Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007352and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007353
7354static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007355unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007356{
7357 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7358 register const Py_UNICODE *e;
7359
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007360 /* Shortcut for single character strings */
7361 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007362 Py_UNICODE_ISALNUM(*p))
7363 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007364
7365 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007366 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007367 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007368
7369 e = p + PyUnicode_GET_SIZE(self);
7370 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007371 if (!Py_UNICODE_ISALNUM(*p))
7372 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007373 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007374 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007375}
7376
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007377PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007378 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007379\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007380Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007381False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007382
7383static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007384unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007385{
7386 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7387 register const Py_UNICODE *e;
7388
Guido van Rossumd57fd912000-03-10 22:53:23 +00007389 /* Shortcut for single character strings */
7390 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007391 Py_UNICODE_ISDECIMAL(*p))
7392 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007393
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007394 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007395 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007396 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007397
Guido van Rossumd57fd912000-03-10 22:53:23 +00007398 e = p + PyUnicode_GET_SIZE(self);
7399 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007400 if (!Py_UNICODE_ISDECIMAL(*p))
7401 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007402 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007403 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007404}
7405
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007406PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007407 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007408\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007409Return True if all characters in S are digits\n\
7410and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007411
7412static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007413unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007414{
7415 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7416 register const Py_UNICODE *e;
7417
Guido van Rossumd57fd912000-03-10 22:53:23 +00007418 /* Shortcut for single character strings */
7419 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007420 Py_UNICODE_ISDIGIT(*p))
7421 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007422
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007423 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007424 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007425 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007426
Guido van Rossumd57fd912000-03-10 22:53:23 +00007427 e = p + PyUnicode_GET_SIZE(self);
7428 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007429 if (!Py_UNICODE_ISDIGIT(*p))
7430 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007431 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007432 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007433}
7434
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007435PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007436 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007437\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007438Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007439False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007440
7441static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007442unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007443{
7444 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7445 register const Py_UNICODE *e;
7446
Guido van Rossumd57fd912000-03-10 22:53:23 +00007447 /* Shortcut for single character strings */
7448 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007449 Py_UNICODE_ISNUMERIC(*p))
7450 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007451
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007452 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007453 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007454 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007455
Guido van Rossumd57fd912000-03-10 22:53:23 +00007456 e = p + PyUnicode_GET_SIZE(self);
7457 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007458 if (!Py_UNICODE_ISNUMERIC(*p))
7459 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007460 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007461 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007462}
7463
Martin v. Löwis47383402007-08-15 07:32:56 +00007464int
7465PyUnicode_IsIdentifier(PyObject *self)
7466{
7467 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7468 register const Py_UNICODE *e;
7469
7470 /* Special case for empty strings */
7471 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007472 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007473
7474 /* PEP 3131 says that the first character must be in
7475 XID_Start and subsequent characters in XID_Continue,
7476 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00007477 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00007478 letters, digits, underscore). However, given the current
7479 definition of XID_Start and XID_Continue, it is sufficient
7480 to check just for these, except that _ must be allowed
7481 as starting an identifier. */
7482 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7483 return 0;
7484
7485 e = p + PyUnicode_GET_SIZE(self);
7486 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007487 if (!_PyUnicode_IsXidContinue(*p))
7488 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007489 }
7490 return 1;
7491}
7492
7493PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007494 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00007495\n\
7496Return True if S is a valid identifier according\n\
7497to the language definition.");
7498
7499static PyObject*
7500unicode_isidentifier(PyObject *self)
7501{
7502 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7503}
7504
Georg Brandl559e5d72008-06-11 18:37:52 +00007505PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007506 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00007507\n\
7508Return True if all characters in S are considered\n\
7509printable in repr() or S is empty, False otherwise.");
7510
7511static PyObject*
7512unicode_isprintable(PyObject *self)
7513{
7514 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7515 register const Py_UNICODE *e;
7516
7517 /* Shortcut for single character strings */
7518 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7519 Py_RETURN_TRUE;
7520 }
7521
7522 e = p + PyUnicode_GET_SIZE(self);
7523 for (; p < e; p++) {
7524 if (!Py_UNICODE_ISPRINTABLE(*p)) {
7525 Py_RETURN_FALSE;
7526 }
7527 }
7528 Py_RETURN_TRUE;
7529}
7530
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007531PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +00007532 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007533\n\
7534Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +00007535iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007536
7537static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007538unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007539{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007540 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007541}
7542
Martin v. Löwis18e16552006-02-15 17:27:45 +00007543static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007544unicode_length(PyUnicodeObject *self)
7545{
7546 return self->length;
7547}
7548
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007549PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007550 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007551\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007552Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007553done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007554
7555static PyObject *
7556unicode_ljust(PyUnicodeObject *self, PyObject *args)
7557{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007558 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007559 Py_UNICODE fillchar = ' ';
7560
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007561 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007562 return NULL;
7563
Tim Peters7a29bd52001-09-12 03:03:31 +00007564 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007565 Py_INCREF(self);
7566 return (PyObject*) self;
7567 }
7568
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007569 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007570}
7571
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007572PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007573 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007574\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007575Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007576
7577static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007578unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007579{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007580 return fixup(self, fixlower);
7581}
7582
/* Selectors for the strip family; each value indexes stripformat below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, one per selector above. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for selector i: the format string minus its 3-character
   "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
7591
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007592/* externally visible for str.strip(unicode) */
7593PyObject *
7594_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7595{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007596 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7597 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7598 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7599 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7600 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007601
Benjamin Peterson29060642009-01-31 22:14:21 +00007602 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007603
Benjamin Peterson14339b62009-01-31 16:36:08 +00007604 i = 0;
7605 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007606 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7607 i++;
7608 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007609 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007610
Benjamin Peterson14339b62009-01-31 16:36:08 +00007611 j = len;
7612 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007613 do {
7614 j--;
7615 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7616 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007617 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007618
Benjamin Peterson14339b62009-01-31 16:36:08 +00007619 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007620 Py_INCREF(self);
7621 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007622 }
7623 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007624 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007625}
7626
Guido van Rossumd57fd912000-03-10 22:53:23 +00007627
7628static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007629do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007630{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007631 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7632 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007633
Benjamin Peterson14339b62009-01-31 16:36:08 +00007634 i = 0;
7635 if (striptype != RIGHTSTRIP) {
7636 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7637 i++;
7638 }
7639 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007640
Benjamin Peterson14339b62009-01-31 16:36:08 +00007641 j = len;
7642 if (striptype != LEFTSTRIP) {
7643 do {
7644 j--;
7645 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7646 j++;
7647 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007648
Benjamin Peterson14339b62009-01-31 16:36:08 +00007649 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7650 Py_INCREF(self);
7651 return (PyObject*)self;
7652 }
7653 else
7654 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007655}
7656
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007657
7658static PyObject *
7659do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7660{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007661 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007662
Benjamin Peterson14339b62009-01-31 16:36:08 +00007663 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7664 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007665
Benjamin Peterson14339b62009-01-31 16:36:08 +00007666 if (sep != NULL && sep != Py_None) {
7667 if (PyUnicode_Check(sep))
7668 return _PyUnicode_XStrip(self, striptype, sep);
7669 else {
7670 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007671 "%s arg must be None or str",
7672 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00007673 return NULL;
7674 }
7675 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007676
Benjamin Peterson14339b62009-01-31 16:36:08 +00007677 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007678}
7679
7680
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007681PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007682 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007683\n\
7684Return a copy of the string S with leading and trailing\n\
7685whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007686If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007687
7688static PyObject *
7689unicode_strip(PyUnicodeObject *self, PyObject *args)
7690{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007691 if (PyTuple_GET_SIZE(args) == 0)
7692 return do_strip(self, BOTHSTRIP); /* Common case */
7693 else
7694 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007695}
7696
7697
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007698PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007699 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007700\n\
7701Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007702If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007703
7704static PyObject *
7705unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7706{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007707 if (PyTuple_GET_SIZE(args) == 0)
7708 return do_strip(self, LEFTSTRIP); /* Common case */
7709 else
7710 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007711}
7712
7713
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007714PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007715 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007716\n\
7717Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007718If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007719
7720static PyObject *
7721unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7722{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007723 if (PyTuple_GET_SIZE(args) == 0)
7724 return do_strip(self, RIGHTSTRIP); /* Common case */
7725 else
7726 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007727}
7728
7729
Guido van Rossumd57fd912000-03-10 22:53:23 +00007730static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007731unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007732{
7733 PyUnicodeObject *u;
7734 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007735 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007736 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007737
Georg Brandl222de0f2009-04-12 12:01:50 +00007738 if (len < 1) {
7739 Py_INCREF(unicode_empty);
7740 return (PyObject *)unicode_empty;
7741 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007742
Tim Peters7a29bd52001-09-12 03:03:31 +00007743 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007744 /* no repeat, return original string */
7745 Py_INCREF(str);
7746 return (PyObject*) str;
7747 }
Tim Peters8f422462000-09-09 06:13:41 +00007748
7749 /* ensure # of chars needed doesn't overflow int and # of bytes
7750 * needed doesn't overflow size_t
7751 */
7752 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00007753 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00007754 PyErr_SetString(PyExc_OverflowError,
7755 "repeated string is too long");
7756 return NULL;
7757 }
7758 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7759 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7760 PyErr_SetString(PyExc_OverflowError,
7761 "repeated string is too long");
7762 return NULL;
7763 }
7764 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007765 if (!u)
7766 return NULL;
7767
7768 p = u->str;
7769
Georg Brandl222de0f2009-04-12 12:01:50 +00007770 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007771 Py_UNICODE_FILL(p, str->str[0], len);
7772 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00007773 Py_ssize_t done = str->length; /* number of characters copied this far */
7774 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00007775 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00007776 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007777 Py_UNICODE_COPY(p+done, p, n);
7778 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00007779 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007780 }
7781
7782 return (PyObject*) u;
7783}
7784
7785PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007786 PyObject *subobj,
7787 PyObject *replobj,
7788 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007789{
7790 PyObject *self;
7791 PyObject *str1;
7792 PyObject *str2;
7793 PyObject *result;
7794
7795 self = PyUnicode_FromObject(obj);
7796 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007797 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007798 str1 = PyUnicode_FromObject(subobj);
7799 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007800 Py_DECREF(self);
7801 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007802 }
7803 str2 = PyUnicode_FromObject(replobj);
7804 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007805 Py_DECREF(self);
7806 Py_DECREF(str1);
7807 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007808 }
Tim Petersced69f82003-09-16 20:30:58 +00007809 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00007810 (PyUnicodeObject *)str1,
7811 (PyUnicodeObject *)str2,
7812 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007813 Py_DECREF(self);
7814 Py_DECREF(str1);
7815 Py_DECREF(str2);
7816 return result;
7817}
7818
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007819PyDoc_STRVAR(replace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007820 "S.replace (old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007821\n\
7822Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00007823old replaced by new. If the optional argument count is\n\
7824given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007825
7826static PyObject*
7827unicode_replace(PyUnicodeObject *self, PyObject *args)
7828{
7829 PyUnicodeObject *str1;
7830 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007831 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007832 PyObject *result;
7833
Martin v. Löwis18e16552006-02-15 17:27:45 +00007834 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007835 return NULL;
7836 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7837 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007838 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007839 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007840 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007841 Py_DECREF(str1);
7842 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007843 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007844
7845 result = replace(self, str1, str2, maxcount);
7846
7847 Py_DECREF(str1);
7848 Py_DECREF(str2);
7849 return result;
7850}
7851
7852static
7853PyObject *unicode_repr(PyObject *unicode)
7854{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007855 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007856 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007857 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7858 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7859
7860 /* XXX(nnorwitz): rather than over-allocating, it would be
7861 better to choose a different scheme. Perhaps scan the
7862 first N-chars of the string and allocate based on that size.
7863 */
7864 /* Initial allocation is based on the longest-possible unichr
7865 escape.
7866
7867 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7868 unichr, so in this case it's the longest unichr escape. In
7869 narrow (UTF-16) builds this is five chars per source unichr
7870 since there are two unichrs in the surrogate pair, so in narrow
7871 (UTF-16) builds it's not the longest unichr escape.
7872
7873 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7874 so in the narrow (UTF-16) build case it's the longest unichr
7875 escape.
7876 */
7877
Walter Dörwald1ab83302007-05-18 17:15:44 +00007878 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00007879 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00007880#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00007881 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00007882#else
Benjamin Peterson29060642009-01-31 22:14:21 +00007883 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00007884#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00007885 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007886 if (repr == NULL)
7887 return NULL;
7888
Walter Dörwald1ab83302007-05-18 17:15:44 +00007889 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007890
7891 /* Add quote */
7892 *p++ = (findchar(s, size, '\'') &&
7893 !findchar(s, size, '"')) ? '"' : '\'';
7894 while (size-- > 0) {
7895 Py_UNICODE ch = *s++;
7896
7897 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007898 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007899 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007900 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007901 continue;
7902 }
7903
Benjamin Peterson29060642009-01-31 22:14:21 +00007904 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00007905 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007906 *p++ = '\\';
7907 *p++ = 't';
7908 }
7909 else if (ch == '\n') {
7910 *p++ = '\\';
7911 *p++ = 'n';
7912 }
7913 else if (ch == '\r') {
7914 *p++ = '\\';
7915 *p++ = 'r';
7916 }
7917
7918 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00007919 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007920 *p++ = '\\';
7921 *p++ = 'x';
7922 *p++ = hexdigits[(ch >> 4) & 0x000F];
7923 *p++ = hexdigits[ch & 0x000F];
7924 }
7925
Georg Brandl559e5d72008-06-11 18:37:52 +00007926 /* Copy ASCII characters as-is */
7927 else if (ch < 0x7F) {
7928 *p++ = ch;
7929 }
7930
Benjamin Peterson29060642009-01-31 22:14:21 +00007931 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00007932 else {
7933 Py_UCS4 ucs = ch;
7934
7935#ifndef Py_UNICODE_WIDE
7936 Py_UNICODE ch2 = 0;
7937 /* Get code point from surrogate pair */
7938 if (size > 0) {
7939 ch2 = *s;
7940 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00007941 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007942 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00007943 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007944 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00007945 size--;
7946 }
7947 }
7948#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00007949 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00007950 (categories Z* and C* except ASCII space)
7951 */
7952 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
7953 /* Map 8-bit characters to '\xhh' */
7954 if (ucs <= 0xff) {
7955 *p++ = '\\';
7956 *p++ = 'x';
7957 *p++ = hexdigits[(ch >> 4) & 0x000F];
7958 *p++ = hexdigits[ch & 0x000F];
7959 }
7960 /* Map 21-bit characters to '\U00xxxxxx' */
7961 else if (ucs >= 0x10000) {
7962 *p++ = '\\';
7963 *p++ = 'U';
7964 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7965 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7966 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7967 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7968 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7969 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7970 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7971 *p++ = hexdigits[ucs & 0x0000000F];
7972 }
7973 /* Map 16-bit characters to '\uxxxx' */
7974 else {
7975 *p++ = '\\';
7976 *p++ = 'u';
7977 *p++ = hexdigits[(ucs >> 12) & 0x000F];
7978 *p++ = hexdigits[(ucs >> 8) & 0x000F];
7979 *p++ = hexdigits[(ucs >> 4) & 0x000F];
7980 *p++ = hexdigits[ucs & 0x000F];
7981 }
7982 }
7983 /* Copy characters as-is */
7984 else {
7985 *p++ = ch;
7986#ifndef Py_UNICODE_WIDE
7987 if (ucs >= 0x10000)
7988 *p++ = ch2;
7989#endif
7990 }
7991 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00007992 }
7993 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007994 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00007995
7996 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00007997 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00007998 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007999}
8000
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008001PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008002 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008003\n\
8004Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00008005such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008006arguments start and end are interpreted as in slice notation.\n\
8007\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008008Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008009
8010static PyObject *
8011unicode_rfind(PyUnicodeObject *self, PyObject *args)
8012{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008013 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008014 Py_ssize_t start;
8015 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008016 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008017
Christian Heimes9cd17752007-11-18 19:35:23 +00008018 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008019 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008020
Thomas Wouters477c8d52006-05-27 19:21:47 +00008021 result = stringlib_rfind_slice(
8022 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8023 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8024 start, end
8025 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008026
8027 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008028
Christian Heimes217cfd12007-12-02 14:31:20 +00008029 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008030}
8031
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008032PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008033 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008034\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008035Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008036
8037static PyObject *
8038unicode_rindex(PyUnicodeObject *self, PyObject *args)
8039{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008040 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008041 Py_ssize_t start;
8042 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008043 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008044
Christian Heimes9cd17752007-11-18 19:35:23 +00008045 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008046 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008047
Thomas Wouters477c8d52006-05-27 19:21:47 +00008048 result = stringlib_rfind_slice(
8049 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8050 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8051 start, end
8052 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008053
8054 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008055
Guido van Rossumd57fd912000-03-10 22:53:23 +00008056 if (result < 0) {
8057 PyErr_SetString(PyExc_ValueError, "substring not found");
8058 return NULL;
8059 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008060 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008061}
8062
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008063PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008064 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008065\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008066Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008067done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008068
8069static PyObject *
8070unicode_rjust(PyUnicodeObject *self, PyObject *args)
8071{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008072 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008073 Py_UNICODE fillchar = ' ';
8074
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008075 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008076 return NULL;
8077
Tim Peters7a29bd52001-09-12 03:03:31 +00008078 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008079 Py_INCREF(self);
8080 return (PyObject*) self;
8081 }
8082
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008083 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008084}
8085
Guido van Rossumd57fd912000-03-10 22:53:23 +00008086PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008087 PyObject *sep,
8088 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008089{
8090 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008091
Guido van Rossumd57fd912000-03-10 22:53:23 +00008092 s = PyUnicode_FromObject(s);
8093 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008094 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008095 if (sep != NULL) {
8096 sep = PyUnicode_FromObject(sep);
8097 if (sep == NULL) {
8098 Py_DECREF(s);
8099 return NULL;
8100 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008101 }
8102
8103 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8104
8105 Py_DECREF(s);
8106 Py_XDECREF(sep);
8107 return result;
8108}
8109
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008110PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008111 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008112\n\
8113Return a list of the words in S, using sep as the\n\
8114delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008115splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008116whitespace string is a separator and empty strings are\n\
8117removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008118
8119static PyObject*
8120unicode_split(PyUnicodeObject *self, PyObject *args)
8121{
8122 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008123 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008124
Martin v. Löwis18e16552006-02-15 17:27:45 +00008125 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008126 return NULL;
8127
8128 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008129 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008130 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008131 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008132 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008133 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008134}
8135
Thomas Wouters477c8d52006-05-27 19:21:47 +00008136PyObject *
8137PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8138{
8139 PyObject* str_obj;
8140 PyObject* sep_obj;
8141 PyObject* out;
8142
8143 str_obj = PyUnicode_FromObject(str_in);
8144 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008145 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008146 sep_obj = PyUnicode_FromObject(sep_in);
8147 if (!sep_obj) {
8148 Py_DECREF(str_obj);
8149 return NULL;
8150 }
8151
8152 out = stringlib_partition(
8153 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8154 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8155 );
8156
8157 Py_DECREF(sep_obj);
8158 Py_DECREF(str_obj);
8159
8160 return out;
8161}
8162
8163
8164PyObject *
8165PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8166{
8167 PyObject* str_obj;
8168 PyObject* sep_obj;
8169 PyObject* out;
8170
8171 str_obj = PyUnicode_FromObject(str_in);
8172 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008173 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008174 sep_obj = PyUnicode_FromObject(sep_in);
8175 if (!sep_obj) {
8176 Py_DECREF(str_obj);
8177 return NULL;
8178 }
8179
8180 out = stringlib_rpartition(
8181 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8182 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8183 );
8184
8185 Py_DECREF(sep_obj);
8186 Py_DECREF(str_obj);
8187
8188 return out;
8189}
8190
8191PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008192 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008193\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008194Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008195the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008196found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008197
8198static PyObject*
8199unicode_partition(PyUnicodeObject *self, PyObject *separator)
8200{
8201 return PyUnicode_Partition((PyObject *)self, separator);
8202}
8203
8204PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +00008205 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008206\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008207Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008208the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008209separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008210
8211static PyObject*
8212unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8213{
8214 return PyUnicode_RPartition((PyObject *)self, separator);
8215}
8216
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008217PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008218 PyObject *sep,
8219 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008220{
8221 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008222
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008223 s = PyUnicode_FromObject(s);
8224 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008225 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008226 if (sep != NULL) {
8227 sep = PyUnicode_FromObject(sep);
8228 if (sep == NULL) {
8229 Py_DECREF(s);
8230 return NULL;
8231 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008232 }
8233
8234 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8235
8236 Py_DECREF(s);
8237 Py_XDECREF(sep);
8238 return result;
8239}
8240
8241PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008242 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008243\n\
8244Return a list of the words in S, using sep as the\n\
8245delimiter string, starting at the end of the string and\n\
8246working to the front. If maxsplit is given, at most maxsplit\n\
8247splits are done. If sep is not specified, any whitespace string\n\
8248is a separator.");
8249
8250static PyObject*
8251unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8252{
8253 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008254 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008255
Martin v. Löwis18e16552006-02-15 17:27:45 +00008256 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008257 return NULL;
8258
8259 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008260 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008261 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008262 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008263 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008264 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008265}
8266
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008267PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008268 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008269\n\
8270Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008271Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008272is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008273
8274static PyObject*
8275unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8276{
Guido van Rossum86662912000-04-11 15:38:46 +00008277 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008278
Guido van Rossum86662912000-04-11 15:38:46 +00008279 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008280 return NULL;
8281
Guido van Rossum86662912000-04-11 15:38:46 +00008282 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008283}
8284
8285static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008286PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008287{
Walter Dörwald346737f2007-05-31 10:44:43 +00008288 if (PyUnicode_CheckExact(self)) {
8289 Py_INCREF(self);
8290 return self;
8291 } else
8292 /* Subtype -- return genuine unicode string with the same value. */
8293 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8294 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008295}
8296
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008297PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008298 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008299\n\
8300Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008301and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008302
8303static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008304unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008305{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008306 return fixup(self, fixswapcase);
8307}
8308
Georg Brandlceee0772007-11-27 23:48:05 +00008309PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008310 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008311\n\
8312Return a translation table usable for str.translate().\n\
8313If there is only one argument, it must be a dictionary mapping Unicode\n\
8314ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008315Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008316If there are two arguments, they must be strings of equal length, and\n\
8317in the resulting dictionary, each character in x will be mapped to the\n\
8318character at the same position in y. If there is a third argument, it\n\
8319must be a string, whose characters will be mapped to None in the result.");
8320
8321static PyObject*
8322unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8323{
8324 PyObject *x, *y = NULL, *z = NULL;
8325 PyObject *new = NULL, *key, *value;
8326 Py_ssize_t i = 0;
8327 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008328
Georg Brandlceee0772007-11-27 23:48:05 +00008329 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8330 return NULL;
8331 new = PyDict_New();
8332 if (!new)
8333 return NULL;
8334 if (y != NULL) {
8335 /* x must be a string too, of equal length */
8336 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8337 if (!PyUnicode_Check(x)) {
8338 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8339 "be a string if there is a second argument");
8340 goto err;
8341 }
8342 if (PyUnicode_GET_SIZE(x) != ylen) {
8343 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8344 "arguments must have equal length");
8345 goto err;
8346 }
8347 /* create entries for translating chars in x to those in y */
8348 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008349 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8350 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008351 if (!key || !value)
8352 goto err;
8353 res = PyDict_SetItem(new, key, value);
8354 Py_DECREF(key);
8355 Py_DECREF(value);
8356 if (res < 0)
8357 goto err;
8358 }
8359 /* create entries for deleting chars in z */
8360 if (z != NULL) {
8361 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008362 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008363 if (!key)
8364 goto err;
8365 res = PyDict_SetItem(new, key, Py_None);
8366 Py_DECREF(key);
8367 if (res < 0)
8368 goto err;
8369 }
8370 }
8371 } else {
8372 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008373 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008374 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8375 "to maketrans it must be a dict");
8376 goto err;
8377 }
8378 /* copy entries into the new dict, converting string keys to int keys */
8379 while (PyDict_Next(x, &i, &key, &value)) {
8380 if (PyUnicode_Check(key)) {
8381 /* convert string keys to integer keys */
8382 PyObject *newkey;
8383 if (PyUnicode_GET_SIZE(key) != 1) {
8384 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8385 "table must be of length 1");
8386 goto err;
8387 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008388 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008389 if (!newkey)
8390 goto err;
8391 res = PyDict_SetItem(new, newkey, value);
8392 Py_DECREF(newkey);
8393 if (res < 0)
8394 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008395 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008396 /* just keep integer keys */
8397 if (PyDict_SetItem(new, key, value) < 0)
8398 goto err;
8399 } else {
8400 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8401 "be strings or integers");
8402 goto err;
8403 }
8404 }
8405 }
8406 return new;
8407 err:
8408 Py_DECREF(new);
8409 return NULL;
8410}
8411
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008412PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008413 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008414\n\
8415Return a copy of the string S, where all characters have been mapped\n\
8416through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008417Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008418Unmapped characters are left untouched. Characters mapped to None\n\
8419are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008420
8421static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008422unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008423{
Georg Brandlceee0772007-11-27 23:48:05 +00008424 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008425}
8426
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008427PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008428 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008429\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008430Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008431
8432static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008433unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008434{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008435 return fixup(self, fixupper);
8436}
8437
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008438PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008439 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008440\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008441Pad a numeric string S with zeros on the left, to fill a field\n\
8442of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008443
8444static PyObject *
8445unicode_zfill(PyUnicodeObject *self, PyObject *args)
8446{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008447 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008448 PyUnicodeObject *u;
8449
Martin v. Löwis18e16552006-02-15 17:27:45 +00008450 Py_ssize_t width;
8451 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008452 return NULL;
8453
8454 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008455 if (PyUnicode_CheckExact(self)) {
8456 Py_INCREF(self);
8457 return (PyObject*) self;
8458 }
8459 else
8460 return PyUnicode_FromUnicode(
8461 PyUnicode_AS_UNICODE(self),
8462 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00008463 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008464 }
8465
8466 fill = width - self->length;
8467
8468 u = pad(self, fill, 0, '0');
8469
Walter Dörwald068325e2002-04-15 13:36:47 +00008470 if (u == NULL)
8471 return NULL;
8472
Guido van Rossumd57fd912000-03-10 22:53:23 +00008473 if (u->str[fill] == '+' || u->str[fill] == '-') {
8474 /* move sign to beginning of string */
8475 u->str[0] = u->str[fill];
8476 u->str[fill] = '0';
8477 }
8478
8479 return (PyObject*) u;
8480}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008481
#if 0
/* Compiled out.  Would return the current value of `numfree` as a
   Python int. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    return PyLong_FromLong(numfree);
}
#endif
8489
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008490PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008491 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008492\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008493Return True if S starts with the specified prefix, False otherwise.\n\
8494With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008495With optional end, stop comparing S at that position.\n\
8496prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008497
8498static PyObject *
8499unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008500 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008501{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008502 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008503 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008504 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008505 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008506 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008507
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008508 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008509 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8510 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008511 if (PyTuple_Check(subobj)) {
8512 Py_ssize_t i;
8513 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8514 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008515 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008516 if (substring == NULL)
8517 return NULL;
8518 result = tailmatch(self, substring, start, end, -1);
8519 Py_DECREF(substring);
8520 if (result) {
8521 Py_RETURN_TRUE;
8522 }
8523 }
8524 /* nothing matched */
8525 Py_RETURN_FALSE;
8526 }
8527 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008528 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008529 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008530 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008531 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008532 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008533}
8534
8535
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008536PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008537 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008538\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008539Return True if S ends with the specified suffix, False otherwise.\n\
8540With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008541With optional end, stop comparing S at that position.\n\
8542suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008543
8544static PyObject *
8545unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008546 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008547{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008548 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008549 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008550 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008551 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008552 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008553
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008554 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008555 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8556 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008557 if (PyTuple_Check(subobj)) {
8558 Py_ssize_t i;
8559 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8560 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008561 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008562 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008563 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008564 result = tailmatch(self, substring, start, end, +1);
8565 Py_DECREF(substring);
8566 if (result) {
8567 Py_RETURN_TRUE;
8568 }
8569 }
8570 Py_RETURN_FALSE;
8571 }
8572 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008573 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008574 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008575
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008576 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008577 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008578 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008579}
8580
Eric Smith8c663262007-08-25 02:26:07 +00008581#include "stringlib/string_format.h"
8582
8583PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008584 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008585\n\
8586");
8587
Eric Smith4a7d76d2008-05-30 18:10:19 +00008588static PyObject *
8589unicode__format__(PyObject* self, PyObject* args)
8590{
8591 PyObject *format_spec;
8592
8593 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8594 return NULL;
8595
8596 return _PyUnicode_FormatAdvanced(self,
8597 PyUnicode_AS_UNICODE(format_spec),
8598 PyUnicode_GET_SIZE(format_spec));
8599}
8600
Eric Smith8c663262007-08-25 02:26:07 +00008601PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008602 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008603\n\
8604");
8605
8606static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008607unicode__sizeof__(PyUnicodeObject *v)
8608{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00008609 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8610 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008611}
8612
8613PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008614 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008615
8616static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008617unicode_getnewargs(PyUnicodeObject *v)
8618{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008619 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008620}
8621
8622
Guido van Rossumd57fd912000-03-10 22:53:23 +00008623static PyMethodDef unicode_methods[] = {
8624
8625 /* Order is according to common usage: often used methods should
8626 appear first, since lookup is done sequentially. */
8627
Benjamin Peterson308d6372009-09-18 21:42:35 +00008628 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008629 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8630 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008631 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008632 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8633 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8634 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8635 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8636 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8637 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8638 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008639 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008640 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8641 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8642 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008643 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008644 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8645 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8646 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008647 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008648 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008649 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008650 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008651 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8652 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8653 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8654 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8655 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8656 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8657 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8658 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8659 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8660 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8661 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8662 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8663 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8664 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008665 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00008666 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008667 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008668 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00008669 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008670 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8671 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Georg Brandlceee0772007-11-27 23:48:05 +00008672 {"maketrans", (PyCFunction) unicode_maketrans,
8673 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008674 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008675#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008676 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008677#endif
8678
8679#if 0
8680 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008681 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008682#endif
8683
Benjamin Peterson14339b62009-01-31 16:36:08 +00008684 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008685 {NULL, NULL}
8686};
8687
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008688static PyObject *
8689unicode_mod(PyObject *v, PyObject *w)
8690{
Benjamin Peterson29060642009-01-31 22:14:21 +00008691 if (!PyUnicode_Check(v)) {
8692 Py_INCREF(Py_NotImplemented);
8693 return Py_NotImplemented;
8694 }
8695 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008696}
8697
8698static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008699 0, /*nb_add*/
8700 0, /*nb_subtract*/
8701 0, /*nb_multiply*/
8702 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008703};
8704
Guido van Rossumd57fd912000-03-10 22:53:23 +00008705static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008706 (lenfunc) unicode_length, /* sq_length */
8707 PyUnicode_Concat, /* sq_concat */
8708 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8709 (ssizeargfunc) unicode_getitem, /* sq_item */
8710 0, /* sq_slice */
8711 0, /* sq_ass_item */
8712 0, /* sq_ass_slice */
8713 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008714};
8715
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008716static PyObject*
8717unicode_subscript(PyUnicodeObject* self, PyObject* item)
8718{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008719 if (PyIndex_Check(item)) {
8720 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008721 if (i == -1 && PyErr_Occurred())
8722 return NULL;
8723 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008724 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008725 return unicode_getitem(self, i);
8726 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008727 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008728 Py_UNICODE* source_buf;
8729 Py_UNICODE* result_buf;
8730 PyObject* result;
8731
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008732 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00008733 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008734 return NULL;
8735 }
8736
8737 if (slicelength <= 0) {
8738 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008739 } else if (start == 0 && step == 1 && slicelength == self->length &&
8740 PyUnicode_CheckExact(self)) {
8741 Py_INCREF(self);
8742 return (PyObject *)self;
8743 } else if (step == 1) {
8744 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008745 } else {
8746 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00008747 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8748 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008749
Benjamin Peterson29060642009-01-31 22:14:21 +00008750 if (result_buf == NULL)
8751 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008752
8753 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8754 result_buf[i] = source_buf[cur];
8755 }
Tim Petersced69f82003-09-16 20:30:58 +00008756
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008757 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00008758 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008759 return result;
8760 }
8761 } else {
8762 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8763 return NULL;
8764 }
8765}
8766
8767static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008768 (lenfunc)unicode_length, /* mp_length */
8769 (binaryfunc)unicode_subscript, /* mp_subscript */
8770 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008771};
8772
Guido van Rossumd57fd912000-03-10 22:53:23 +00008773
Guido van Rossumd57fd912000-03-10 22:53:23 +00008774/* Helpers for PyUnicode_Format() */
8775
8776static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008777getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008778{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008779 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008780 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008781 (*p_argidx)++;
8782 if (arglen < 0)
8783 return args;
8784 else
8785 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008786 }
8787 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008788 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008789 return NULL;
8790}
8791
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008792/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008793
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008794static PyObject *
8795formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008796{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008797 char *p;
8798 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008799 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008800
Guido van Rossumd57fd912000-03-10 22:53:23 +00008801 x = PyFloat_AsDouble(v);
8802 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008803 return NULL;
8804
Guido van Rossumd57fd912000-03-10 22:53:23 +00008805 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008806 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00008807
Eric Smith0923d1d2009-04-16 20:16:10 +00008808 p = PyOS_double_to_string(x, type, prec,
8809 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008810 if (p == NULL)
8811 return NULL;
8812 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00008813 PyMem_Free(p);
8814 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008815}
8816
Tim Peters38fd5b62000-09-21 05:43:11 +00008817static PyObject*
8818formatlong(PyObject *val, int flags, int prec, int type)
8819{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008820 char *buf;
8821 int len;
8822 PyObject *str; /* temporary string object. */
8823 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008824
Benjamin Peterson14339b62009-01-31 16:36:08 +00008825 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
8826 if (!str)
8827 return NULL;
8828 result = PyUnicode_FromStringAndSize(buf, len);
8829 Py_DECREF(str);
8830 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008831}
8832
Guido van Rossumd57fd912000-03-10 22:53:23 +00008833static int
8834formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008835 size_t buflen,
8836 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008837{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008838 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008839 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008840 if (PyUnicode_GET_SIZE(v) == 1) {
8841 buf[0] = PyUnicode_AS_UNICODE(v)[0];
8842 buf[1] = '\0';
8843 return 1;
8844 }
8845#ifndef Py_UNICODE_WIDE
8846 if (PyUnicode_GET_SIZE(v) == 2) {
8847 /* Decode a valid surrogate pair */
8848 int c0 = PyUnicode_AS_UNICODE(v)[0];
8849 int c1 = PyUnicode_AS_UNICODE(v)[1];
8850 if (0xD800 <= c0 && c0 <= 0xDBFF &&
8851 0xDC00 <= c1 && c1 <= 0xDFFF) {
8852 buf[0] = c0;
8853 buf[1] = c1;
8854 buf[2] = '\0';
8855 return 2;
8856 }
8857 }
8858#endif
8859 goto onError;
8860 }
8861 else {
8862 /* Integer input truncated to a character */
8863 long x;
8864 x = PyLong_AsLong(v);
8865 if (x == -1 && PyErr_Occurred())
8866 goto onError;
8867
8868 if (x < 0 || x > 0x10ffff) {
8869 PyErr_SetString(PyExc_OverflowError,
8870 "%c arg not in range(0x110000)");
8871 return -1;
8872 }
8873
8874#ifndef Py_UNICODE_WIDE
8875 if (x > 0xffff) {
8876 x -= 0x10000;
8877 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
8878 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
8879 return 2;
8880 }
8881#endif
8882 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008883 buf[1] = '\0';
8884 return 1;
8885 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008886
Benjamin Peterson29060642009-01-31 22:14:21 +00008887 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008888 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008889 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008890 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008891}
8892
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008893/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008894 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008895*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008896#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008897
Guido van Rossumd57fd912000-03-10 22:53:23 +00008898PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson29060642009-01-31 22:14:21 +00008899 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008900{
8901 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008902 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008903 int args_owned = 0;
8904 PyUnicodeObject *result = NULL;
8905 PyObject *dict = NULL;
8906 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008907
Guido van Rossumd57fd912000-03-10 22:53:23 +00008908 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008909 PyErr_BadInternalCall();
8910 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008911 }
8912 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008913 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008914 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008915 fmt = PyUnicode_AS_UNICODE(uformat);
8916 fmtcnt = PyUnicode_GET_SIZE(uformat);
8917
8918 reslen = rescnt = fmtcnt + 100;
8919 result = _PyUnicode_New(reslen);
8920 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008921 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008922 res = PyUnicode_AS_UNICODE(result);
8923
8924 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008925 arglen = PyTuple_Size(args);
8926 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008927 }
8928 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008929 arglen = -1;
8930 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008931 }
Christian Heimes90aa7642007-12-19 02:45:37 +00008932 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00008933 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00008934 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008935
8936 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008937 if (*fmt != '%') {
8938 if (--rescnt < 0) {
8939 rescnt = fmtcnt + 100;
8940 reslen += rescnt;
8941 if (_PyUnicode_Resize(&result, reslen) < 0)
8942 goto onError;
8943 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8944 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008945 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008946 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008947 }
8948 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008949 /* Got a format specifier */
8950 int flags = 0;
8951 Py_ssize_t width = -1;
8952 int prec = -1;
8953 Py_UNICODE c = '\0';
8954 Py_UNICODE fill;
8955 int isnumok;
8956 PyObject *v = NULL;
8957 PyObject *temp = NULL;
8958 Py_UNICODE *pbuf;
8959 Py_UNICODE sign;
8960 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008961 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008962
Benjamin Peterson29060642009-01-31 22:14:21 +00008963 fmt++;
8964 if (*fmt == '(') {
8965 Py_UNICODE *keystart;
8966 Py_ssize_t keylen;
8967 PyObject *key;
8968 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00008969
Benjamin Peterson29060642009-01-31 22:14:21 +00008970 if (dict == NULL) {
8971 PyErr_SetString(PyExc_TypeError,
8972 "format requires a mapping");
8973 goto onError;
8974 }
8975 ++fmt;
8976 --fmtcnt;
8977 keystart = fmt;
8978 /* Skip over balanced parentheses */
8979 while (pcount > 0 && --fmtcnt >= 0) {
8980 if (*fmt == ')')
8981 --pcount;
8982 else if (*fmt == '(')
8983 ++pcount;
8984 fmt++;
8985 }
8986 keylen = fmt - keystart - 1;
8987 if (fmtcnt < 0 || pcount > 0) {
8988 PyErr_SetString(PyExc_ValueError,
8989 "incomplete format key");
8990 goto onError;
8991 }
8992#if 0
8993 /* keys are converted to strings using UTF-8 and
8994 then looked up since Python uses strings to hold
8995 variables names etc. in its namespaces and we
8996 wouldn't want to break common idioms. */
8997 key = PyUnicode_EncodeUTF8(keystart,
8998 keylen,
8999 NULL);
9000#else
9001 key = PyUnicode_FromUnicode(keystart, keylen);
9002#endif
9003 if (key == NULL)
9004 goto onError;
9005 if (args_owned) {
9006 Py_DECREF(args);
9007 args_owned = 0;
9008 }
9009 args = PyObject_GetItem(dict, key);
9010 Py_DECREF(key);
9011 if (args == NULL) {
9012 goto onError;
9013 }
9014 args_owned = 1;
9015 arglen = -1;
9016 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009017 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009018 while (--fmtcnt >= 0) {
9019 switch (c = *fmt++) {
9020 case '-': flags |= F_LJUST; continue;
9021 case '+': flags |= F_SIGN; continue;
9022 case ' ': flags |= F_BLANK; continue;
9023 case '#': flags |= F_ALT; continue;
9024 case '0': flags |= F_ZERO; continue;
9025 }
9026 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009027 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009028 if (c == '*') {
9029 v = getnextarg(args, arglen, &argidx);
9030 if (v == NULL)
9031 goto onError;
9032 if (!PyLong_Check(v)) {
9033 PyErr_SetString(PyExc_TypeError,
9034 "* wants int");
9035 goto onError;
9036 }
9037 width = PyLong_AsLong(v);
9038 if (width == -1 && PyErr_Occurred())
9039 goto onError;
9040 if (width < 0) {
9041 flags |= F_LJUST;
9042 width = -width;
9043 }
9044 if (--fmtcnt >= 0)
9045 c = *fmt++;
9046 }
9047 else if (c >= '0' && c <= '9') {
9048 width = c - '0';
9049 while (--fmtcnt >= 0) {
9050 c = *fmt++;
9051 if (c < '0' || c > '9')
9052 break;
9053 if ((width*10) / 10 != width) {
9054 PyErr_SetString(PyExc_ValueError,
9055 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009056 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009057 }
9058 width = width*10 + (c - '0');
9059 }
9060 }
9061 if (c == '.') {
9062 prec = 0;
9063 if (--fmtcnt >= 0)
9064 c = *fmt++;
9065 if (c == '*') {
9066 v = getnextarg(args, arglen, &argidx);
9067 if (v == NULL)
9068 goto onError;
9069 if (!PyLong_Check(v)) {
9070 PyErr_SetString(PyExc_TypeError,
9071 "* wants int");
9072 goto onError;
9073 }
9074 prec = PyLong_AsLong(v);
9075 if (prec == -1 && PyErr_Occurred())
9076 goto onError;
9077 if (prec < 0)
9078 prec = 0;
9079 if (--fmtcnt >= 0)
9080 c = *fmt++;
9081 }
9082 else if (c >= '0' && c <= '9') {
9083 prec = c - '0';
9084 while (--fmtcnt >= 0) {
9085 c = Py_CHARMASK(*fmt++);
9086 if (c < '0' || c > '9')
9087 break;
9088 if ((prec*10) / 10 != prec) {
9089 PyErr_SetString(PyExc_ValueError,
9090 "prec too big");
9091 goto onError;
9092 }
9093 prec = prec*10 + (c - '0');
9094 }
9095 }
9096 } /* prec */
9097 if (fmtcnt >= 0) {
9098 if (c == 'h' || c == 'l' || c == 'L') {
9099 if (--fmtcnt >= 0)
9100 c = *fmt++;
9101 }
9102 }
9103 if (fmtcnt < 0) {
9104 PyErr_SetString(PyExc_ValueError,
9105 "incomplete format");
9106 goto onError;
9107 }
9108 if (c != '%') {
9109 v = getnextarg(args, arglen, &argidx);
9110 if (v == NULL)
9111 goto onError;
9112 }
9113 sign = 0;
9114 fill = ' ';
9115 switch (c) {
9116
9117 case '%':
9118 pbuf = formatbuf;
9119 /* presume that buffer length is at least 1 */
9120 pbuf[0] = '%';
9121 len = 1;
9122 break;
9123
9124 case 's':
9125 case 'r':
9126 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +00009127 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009128 temp = v;
9129 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009130 }
9131 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009132 if (c == 's')
9133 temp = PyObject_Str(v);
9134 else if (c == 'r')
9135 temp = PyObject_Repr(v);
9136 else
9137 temp = PyObject_ASCII(v);
9138 if (temp == NULL)
9139 goto onError;
9140 if (PyUnicode_Check(temp))
9141 /* nothing to do */;
9142 else {
9143 Py_DECREF(temp);
9144 PyErr_SetString(PyExc_TypeError,
9145 "%s argument has non-string str()");
9146 goto onError;
9147 }
9148 }
9149 pbuf = PyUnicode_AS_UNICODE(temp);
9150 len = PyUnicode_GET_SIZE(temp);
9151 if (prec >= 0 && len > prec)
9152 len = prec;
9153 break;
9154
9155 case 'i':
9156 case 'd':
9157 case 'u':
9158 case 'o':
9159 case 'x':
9160 case 'X':
9161 if (c == 'i')
9162 c = 'd';
9163 isnumok = 0;
9164 if (PyNumber_Check(v)) {
9165 PyObject *iobj=NULL;
9166
9167 if (PyLong_Check(v)) {
9168 iobj = v;
9169 Py_INCREF(iobj);
9170 }
9171 else {
9172 iobj = PyNumber_Long(v);
9173 }
9174 if (iobj!=NULL) {
9175 if (PyLong_Check(iobj)) {
9176 isnumok = 1;
9177 temp = formatlong(iobj, flags, prec, c);
9178 Py_DECREF(iobj);
9179 if (!temp)
9180 goto onError;
9181 pbuf = PyUnicode_AS_UNICODE(temp);
9182 len = PyUnicode_GET_SIZE(temp);
9183 sign = 1;
9184 }
9185 else {
9186 Py_DECREF(iobj);
9187 }
9188 }
9189 }
9190 if (!isnumok) {
9191 PyErr_Format(PyExc_TypeError,
9192 "%%%c format: a number is required, "
9193 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9194 goto onError;
9195 }
9196 if (flags & F_ZERO)
9197 fill = '0';
9198 break;
9199
9200 case 'e':
9201 case 'E':
9202 case 'f':
9203 case 'F':
9204 case 'g':
9205 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009206 temp = formatfloat(v, flags, prec, c);
9207 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009208 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009209 pbuf = PyUnicode_AS_UNICODE(temp);
9210 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009211 sign = 1;
9212 if (flags & F_ZERO)
9213 fill = '0';
9214 break;
9215
9216 case 'c':
9217 pbuf = formatbuf;
9218 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9219 if (len < 0)
9220 goto onError;
9221 break;
9222
9223 default:
9224 PyErr_Format(PyExc_ValueError,
9225 "unsupported format character '%c' (0x%x) "
9226 "at index %zd",
9227 (31<=c && c<=126) ? (char)c : '?',
9228 (int)c,
9229 (Py_ssize_t)(fmt - 1 -
9230 PyUnicode_AS_UNICODE(uformat)));
9231 goto onError;
9232 }
9233 if (sign) {
9234 if (*pbuf == '-' || *pbuf == '+') {
9235 sign = *pbuf++;
9236 len--;
9237 }
9238 else if (flags & F_SIGN)
9239 sign = '+';
9240 else if (flags & F_BLANK)
9241 sign = ' ';
9242 else
9243 sign = 0;
9244 }
9245 if (width < len)
9246 width = len;
9247 if (rescnt - (sign != 0) < width) {
9248 reslen -= rescnt;
9249 rescnt = width + fmtcnt + 100;
9250 reslen += rescnt;
9251 if (reslen < 0) {
9252 Py_XDECREF(temp);
9253 PyErr_NoMemory();
9254 goto onError;
9255 }
9256 if (_PyUnicode_Resize(&result, reslen) < 0) {
9257 Py_XDECREF(temp);
9258 goto onError;
9259 }
9260 res = PyUnicode_AS_UNICODE(result)
9261 + reslen - rescnt;
9262 }
9263 if (sign) {
9264 if (fill != ' ')
9265 *res++ = sign;
9266 rescnt--;
9267 if (width > len)
9268 width--;
9269 }
9270 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9271 assert(pbuf[0] == '0');
9272 assert(pbuf[1] == c);
9273 if (fill != ' ') {
9274 *res++ = *pbuf++;
9275 *res++ = *pbuf++;
9276 }
9277 rescnt -= 2;
9278 width -= 2;
9279 if (width < 0)
9280 width = 0;
9281 len -= 2;
9282 }
9283 if (width > len && !(flags & F_LJUST)) {
9284 do {
9285 --rescnt;
9286 *res++ = fill;
9287 } while (--width > len);
9288 }
9289 if (fill == ' ') {
9290 if (sign)
9291 *res++ = sign;
9292 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9293 assert(pbuf[0] == '0');
9294 assert(pbuf[1] == c);
9295 *res++ = *pbuf++;
9296 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009297 }
9298 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009299 Py_UNICODE_COPY(res, pbuf, len);
9300 res += len;
9301 rescnt -= len;
9302 while (--width >= len) {
9303 --rescnt;
9304 *res++ = ' ';
9305 }
9306 if (dict && (argidx < arglen) && c != '%') {
9307 PyErr_SetString(PyExc_TypeError,
9308 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009309 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009310 goto onError;
9311 }
9312 Py_XDECREF(temp);
9313 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009314 } /* until end */
9315 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009316 PyErr_SetString(PyExc_TypeError,
9317 "not all arguments converted during string formatting");
9318 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009319 }
9320
Thomas Woutersa96affe2006-03-12 00:29:36 +00009321 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009322 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009323 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009324 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009325 }
9326 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009327 return (PyObject *)result;
9328
Benjamin Peterson29060642009-01-31 22:14:21 +00009329 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009330 Py_XDECREF(result);
9331 Py_DECREF(uformat);
9332 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009333 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009334 }
9335 return NULL;
9336}
9337
Jeremy Hylton938ace62002-07-17 16:30:39 +00009338static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009339unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9340
Tim Peters6d6c1a32001-08-02 04:15:00 +00009341static PyObject *
9342unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9343{
Benjamin Peterson29060642009-01-31 22:14:21 +00009344 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009345 static char *kwlist[] = {"object", "encoding", "errors", 0};
9346 char *encoding = NULL;
9347 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009348
Benjamin Peterson14339b62009-01-31 16:36:08 +00009349 if (type != &PyUnicode_Type)
9350 return unicode_subtype_new(type, args, kwds);
9351 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009352 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009353 return NULL;
9354 if (x == NULL)
9355 return (PyObject *)_PyUnicode_New(0);
9356 if (encoding == NULL && errors == NULL)
9357 return PyObject_Str(x);
9358 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009359 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009360}
9361
Guido van Rossume023fe02001-08-30 03:12:59 +00009362static PyObject *
9363unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9364{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009365 PyUnicodeObject *tmp, *pnew;
9366 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009367
Benjamin Peterson14339b62009-01-31 16:36:08 +00009368 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9369 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9370 if (tmp == NULL)
9371 return NULL;
9372 assert(PyUnicode_Check(tmp));
9373 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9374 if (pnew == NULL) {
9375 Py_DECREF(tmp);
9376 return NULL;
9377 }
9378 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9379 if (pnew->str == NULL) {
9380 _Py_ForgetReference((PyObject *)pnew);
9381 PyObject_Del(pnew);
9382 Py_DECREF(tmp);
9383 return PyErr_NoMemory();
9384 }
9385 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9386 pnew->length = n;
9387 pnew->hash = tmp->hash;
9388 Py_DECREF(tmp);
9389 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009390}
9391
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009392PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +00009393 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009394\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009395Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009396encoding defaults to the current default string encoding.\n\
9397errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009398
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009399static PyObject *unicode_iter(PyObject *seq);
9400
Guido van Rossumd57fd912000-03-10 22:53:23 +00009401PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009402 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +00009403 "str", /* tp_name */
9404 sizeof(PyUnicodeObject), /* tp_size */
9405 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009406 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009407 (destructor)unicode_dealloc, /* tp_dealloc */
9408 0, /* tp_print */
9409 0, /* tp_getattr */
9410 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009411 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009412 unicode_repr, /* tp_repr */
9413 &unicode_as_number, /* tp_as_number */
9414 &unicode_as_sequence, /* tp_as_sequence */
9415 &unicode_as_mapping, /* tp_as_mapping */
9416 (hashfunc) unicode_hash, /* tp_hash*/
9417 0, /* tp_call*/
9418 (reprfunc) unicode_str, /* tp_str */
9419 PyObject_GenericGetAttr, /* tp_getattro */
9420 0, /* tp_setattro */
9421 0, /* tp_as_buffer */
9422 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +00009423 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009424 unicode_doc, /* tp_doc */
9425 0, /* tp_traverse */
9426 0, /* tp_clear */
9427 PyUnicode_RichCompare, /* tp_richcompare */
9428 0, /* tp_weaklistoffset */
9429 unicode_iter, /* tp_iter */
9430 0, /* tp_iternext */
9431 unicode_methods, /* tp_methods */
9432 0, /* tp_members */
9433 0, /* tp_getset */
9434 &PyBaseObject_Type, /* tp_base */
9435 0, /* tp_dict */
9436 0, /* tp_descr_get */
9437 0, /* tp_descr_set */
9438 0, /* tp_dictoffset */
9439 0, /* tp_init */
9440 0, /* tp_alloc */
9441 unicode_new, /* tp_new */
9442 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009443};
9444
9445/* Initialize the Unicode implementation */
9446
Thomas Wouters78890102000-07-22 19:25:51 +00009447void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009448{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009449 int i;
9450
Thomas Wouters477c8d52006-05-27 19:21:47 +00009451 /* XXX - move this array to unicodectype.c ? */
9452 Py_UNICODE linebreak[] = {
9453 0x000A, /* LINE FEED */
9454 0x000D, /* CARRIAGE RETURN */
9455 0x001C, /* FILE SEPARATOR */
9456 0x001D, /* GROUP SEPARATOR */
9457 0x001E, /* RECORD SEPARATOR */
9458 0x0085, /* NEXT LINE */
9459 0x2028, /* LINE SEPARATOR */
9460 0x2029, /* PARAGRAPH SEPARATOR */
9461 };
9462
Fred Drakee4315f52000-05-09 19:53:39 +00009463 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009464 free_list = NULL;
9465 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009466 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009467 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00009468 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009469
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009470 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00009471 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009472 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009473 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009474
9475 /* initialize the linebreak bloom filter */
9476 bloom_linebreak = make_bloom_mask(
9477 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9478 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009479
9480 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009481}
9482
9483/* Finalize the Unicode implementation */
9484
Christian Heimesa156e092008-02-16 07:38:31 +00009485int
9486PyUnicode_ClearFreeList(void)
9487{
9488 int freelist_size = numfree;
9489 PyUnicodeObject *u;
9490
9491 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009492 PyUnicodeObject *v = u;
9493 u = *(PyUnicodeObject **)u;
9494 if (v->str)
9495 PyObject_DEL(v->str);
9496 Py_XDECREF(v->defenc);
9497 PyObject_Del(v);
9498 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +00009499 }
9500 free_list = NULL;
9501 assert(numfree == 0);
9502 return freelist_size;
9503}
9504
Guido van Rossumd57fd912000-03-10 22:53:23 +00009505void
Thomas Wouters78890102000-07-22 19:25:51 +00009506_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009507{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009508 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009509
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009510 Py_XDECREF(unicode_empty);
9511 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009512
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009513 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009514 if (unicode_latin1[i]) {
9515 Py_DECREF(unicode_latin1[i]);
9516 unicode_latin1[i] = NULL;
9517 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009518 }
Christian Heimesa156e092008-02-16 07:38:31 +00009519 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009520}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009521
Walter Dörwald16807132007-05-25 13:52:07 +00009522void
9523PyUnicode_InternInPlace(PyObject **p)
9524{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009525 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9526 PyObject *t;
9527 if (s == NULL || !PyUnicode_Check(s))
9528 Py_FatalError(
9529 "PyUnicode_InternInPlace: unicode strings only please!");
9530 /* If it's a subclass, we don't really know what putting
9531 it in the interned dict might do. */
9532 if (!PyUnicode_CheckExact(s))
9533 return;
9534 if (PyUnicode_CHECK_INTERNED(s))
9535 return;
9536 if (interned == NULL) {
9537 interned = PyDict_New();
9538 if (interned == NULL) {
9539 PyErr_Clear(); /* Don't leave an exception */
9540 return;
9541 }
9542 }
9543 /* It might be that the GetItem call fails even
9544 though the key is present in the dictionary,
9545 namely when this happens during a stack overflow. */
9546 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +00009547 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009548 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +00009549
Benjamin Peterson29060642009-01-31 22:14:21 +00009550 if (t) {
9551 Py_INCREF(t);
9552 Py_DECREF(*p);
9553 *p = t;
9554 return;
9555 }
Walter Dörwald16807132007-05-25 13:52:07 +00009556
Benjamin Peterson14339b62009-01-31 16:36:08 +00009557 PyThreadState_GET()->recursion_critical = 1;
9558 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9559 PyErr_Clear();
9560 PyThreadState_GET()->recursion_critical = 0;
9561 return;
9562 }
9563 PyThreadState_GET()->recursion_critical = 0;
9564 /* The two references in interned are not counted by refcnt.
9565 The deallocator will take care of this */
9566 Py_REFCNT(s) -= 2;
9567 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +00009568}
9569
9570void
9571PyUnicode_InternImmortal(PyObject **p)
9572{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009573 PyUnicode_InternInPlace(p);
9574 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9575 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9576 Py_INCREF(*p);
9577 }
Walter Dörwald16807132007-05-25 13:52:07 +00009578}
9579
9580PyObject *
9581PyUnicode_InternFromString(const char *cp)
9582{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009583 PyObject *s = PyUnicode_FromString(cp);
9584 if (s == NULL)
9585 return NULL;
9586 PyUnicode_InternInPlace(&s);
9587 return s;
Walter Dörwald16807132007-05-25 13:52:07 +00009588}
9589
9590void _Py_ReleaseInternedUnicodeStrings(void)
9591{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009592 PyObject *keys;
9593 PyUnicodeObject *s;
9594 Py_ssize_t i, n;
9595 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009596
Benjamin Peterson14339b62009-01-31 16:36:08 +00009597 if (interned == NULL || !PyDict_Check(interned))
9598 return;
9599 keys = PyDict_Keys(interned);
9600 if (keys == NULL || !PyList_Check(keys)) {
9601 PyErr_Clear();
9602 return;
9603 }
Walter Dörwald16807132007-05-25 13:52:07 +00009604
Benjamin Peterson14339b62009-01-31 16:36:08 +00009605 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9606 detector, interned unicode strings are not forcibly deallocated;
9607 rather, we give them their stolen references back, and then clear
9608 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +00009609
Benjamin Peterson14339b62009-01-31 16:36:08 +00009610 n = PyList_GET_SIZE(keys);
9611 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +00009612 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009613 for (i = 0; i < n; i++) {
9614 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9615 switch (s->state) {
9616 case SSTATE_NOT_INTERNED:
9617 /* XXX Shouldn't happen */
9618 break;
9619 case SSTATE_INTERNED_IMMORTAL:
9620 Py_REFCNT(s) += 1;
9621 immortal_size += s->length;
9622 break;
9623 case SSTATE_INTERNED_MORTAL:
9624 Py_REFCNT(s) += 2;
9625 mortal_size += s->length;
9626 break;
9627 default:
9628 Py_FatalError("Inconsistent interned string state.");
9629 }
9630 s->state = SSTATE_NOT_INTERNED;
9631 }
9632 fprintf(stderr, "total size of all interned strings: "
9633 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9634 "mortal/immortal\n", mortal_size, immortal_size);
9635 Py_DECREF(keys);
9636 PyDict_Clear(interned);
9637 Py_DECREF(interned);
9638 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +00009639}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009640
9641
9642/********************* Unicode Iterator **************************/
9643
9644typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009645 PyObject_HEAD
9646 Py_ssize_t it_index;
9647 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009648} unicodeiterobject;
9649
9650static void
9651unicodeiter_dealloc(unicodeiterobject *it)
9652{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009653 _PyObject_GC_UNTRACK(it);
9654 Py_XDECREF(it->it_seq);
9655 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009656}
9657
9658static int
9659unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9660{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009661 Py_VISIT(it->it_seq);
9662 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009663}
9664
9665static PyObject *
9666unicodeiter_next(unicodeiterobject *it)
9667{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009668 PyUnicodeObject *seq;
9669 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009670
Benjamin Peterson14339b62009-01-31 16:36:08 +00009671 assert(it != NULL);
9672 seq = it->it_seq;
9673 if (seq == NULL)
9674 return NULL;
9675 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009676
Benjamin Peterson14339b62009-01-31 16:36:08 +00009677 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
9678 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +00009679 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009680 if (item != NULL)
9681 ++it->it_index;
9682 return item;
9683 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009684
Benjamin Peterson14339b62009-01-31 16:36:08 +00009685 Py_DECREF(seq);
9686 it->it_seq = NULL;
9687 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009688}
9689
9690static PyObject *
9691unicodeiter_len(unicodeiterobject *it)
9692{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009693 Py_ssize_t len = 0;
9694 if (it->it_seq)
9695 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9696 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009697}
9698
9699PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9700
9701static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009702 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00009703 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +00009704 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009705};
9706
9707PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009708 PyVarObject_HEAD_INIT(&PyType_Type, 0)
9709 "str_iterator", /* tp_name */
9710 sizeof(unicodeiterobject), /* tp_basicsize */
9711 0, /* tp_itemsize */
9712 /* methods */
9713 (destructor)unicodeiter_dealloc, /* tp_dealloc */
9714 0, /* tp_print */
9715 0, /* tp_getattr */
9716 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009717 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009718 0, /* tp_repr */
9719 0, /* tp_as_number */
9720 0, /* tp_as_sequence */
9721 0, /* tp_as_mapping */
9722 0, /* tp_hash */
9723 0, /* tp_call */
9724 0, /* tp_str */
9725 PyObject_GenericGetAttr, /* tp_getattro */
9726 0, /* tp_setattro */
9727 0, /* tp_as_buffer */
9728 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9729 0, /* tp_doc */
9730 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9731 0, /* tp_clear */
9732 0, /* tp_richcompare */
9733 0, /* tp_weaklistoffset */
9734 PyObject_SelfIter, /* tp_iter */
9735 (iternextfunc)unicodeiter_next, /* tp_iternext */
9736 unicodeiter_methods, /* tp_methods */
9737 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009738};
9739
9740static PyObject *
9741unicode_iter(PyObject *seq)
9742{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009743 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009744
Benjamin Peterson14339b62009-01-31 16:36:08 +00009745 if (!PyUnicode_Check(seq)) {
9746 PyErr_BadInternalCall();
9747 return NULL;
9748 }
9749 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9750 if (it == NULL)
9751 return NULL;
9752 it->it_index = 0;
9753 Py_INCREF(seq);
9754 it->it_seq = (PyUnicodeObject *)seq;
9755 _PyObject_GC_TRACK(it);
9756 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009757}
9758
Martin v. Löwis5b222132007-06-10 09:51:05 +00009759size_t
9760Py_UNICODE_strlen(const Py_UNICODE *u)
9761{
9762 int res = 0;
9763 while(*u++)
9764 res++;
9765 return res;
9766}
9767
9768Py_UNICODE*
9769Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9770{
9771 Py_UNICODE *u = s1;
9772 while ((*u++ = *s2++));
9773 return s1;
9774}
9775
9776Py_UNICODE*
9777Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9778{
9779 Py_UNICODE *u = s1;
9780 while ((*u++ = *s2++))
9781 if (n-- == 0)
9782 break;
9783 return s1;
9784}
9785
9786int
9787Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9788{
9789 while (*s1 && *s2 && *s1 == *s2)
9790 s1++, s2++;
9791 if (*s1 && *s2)
9792 return (*s1 < *s2) ? -1 : +1;
9793 if (*s1)
9794 return 1;
9795 if (*s2)
9796 return -1;
9797 return 0;
9798}
9799
9800Py_UNICODE*
9801Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9802{
9803 const Py_UNICODE *p;
9804 for (p = s; *p; p++)
9805 if (*p == c)
9806 return (Py_UNICODE*)p;
9807 return NULL;
9808}
9809
9810
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009811#ifdef __cplusplus
9812}
9813#endif