/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Unicode Integration Proposal (see file Misc/unicode.txt).

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000049#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000050#include <windows.h>
51#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000052
/* Maximum number of dead Unicode objects parked on the free list. */

#define PyUnicode_MAXFREELIST       1024

/* Keep-alive threshold for free-listed Unicode objects.

   An object placed on the free list keeps its character buffer only
   when its length is below this limit (unicode_dealloc() compares
   unicode->length against it); longer buffers are released right away.
   Keeping the small buffers saves malloc() calls for short strings.

   In the worst case this leaves PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage allocated.

   A limit of 0 effectively turns the feature off.

   Note: this is an experimental feature.  If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9
Guido van Rossumd57fd912000-03-10 22:53:23 +000075
/* Byte-order selection: big endian only when WORDS_BIGENDIAN is
   defined, little endian otherwise. */

#ifndef WORDS_BIGENDIAN
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
83
/* --- Globals ------------------------------------------------------------

   The globals are initialized by the _PyUnicode_Init() API and should
   not be used before calling that API.

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Walter Dörwald16807132007-05-25 13:52:07 +000096/* This dictionary holds all interned unicode strings. Note that references
97 to strings in this dictionary are *not* counted in the string's ob_refcnt.
98 When the interned string reaches a refcnt of 0 the string deallocation
99 function will delete the reference from this dictionary.
100
101 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000102 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000103*/
104static PyObject *interned;
105
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000107static PyUnicodeObject *free_list;
108static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000109
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000110/* The empty Unicode object is shared to improve performance. */
111static PyUnicodeObject *unicode_empty;
112
113/* Single character Unicode strings in the Latin-1 range are being
114 shared as well. */
115static PyUnicodeObject *unicode_latin1[256];
116
/* Encoding used and assumed whenever NULL is passed as the encoding
   parameter; it is fixed to "utf-8".  Always go through the
   PyUnicode_GetDefaultEncoding() API to read this global.

   If you change the hard coded default, don't forget to alter
   Py_FileSystemDefaultEncoding as well!
*/
static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000125
/* Lookup table for fast detection of the most frequent whitespace
   characters: entry c (0x00-0x7F) is 1 when the ASCII code point c is
   whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: HORIZONTAL TABULATION (0x09), LINE FEED (0x0A),
       VERTICAL TABULATION (0x0B), FORM FEED (0x0C),
       CARRIAGE RETURN (0x0D) */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: FILE (0x1C), GROUP (0x1D), RECORD (0x1E) and
       UNIT (0x1F) SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: SPACE (0x20) */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x7F: no whitespace */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
156
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000157static PyObject *unicode_encode_call_errorhandler(const char *errors,
158 PyObject **errorHandler,const char *encoding, const char *reason,
159 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
160 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
161
/* Lookup table for fast detection of line breaks in the ASCII range:
   entry c (0x00-0x7F) is 1 when the code point c is a line break and 0
   otherwise.  The table is only ever read (see BLOOM_LINEBREAK), so it
   is declared const like _Py_ascii_whitespace. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: LINE FEED (0x0A), CARRIAGE RETURN (0x0D) */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
    /* 0x10-0x1F: FILE (0x1C), GROUP (0x1D) and RECORD (0x1E)
       SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x7F: no line breaks */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
187
188
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000189Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000190PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000191{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000192#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000193 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000194#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000195 /* This is actually an illegal character, so it should
196 not be passed to unichr. */
197 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000198#endif
199}
200
Thomas Wouters477c8d52006-05-27 19:21:47 +0000201/* --- Bloom Filters ----------------------------------------------------- */
202
203/* stuff to implement simple "bloom filters" for Unicode characters.
204 to keep things simple, we use a single bitmask, using the least 5
205 bits from each unicode characters as the bit index. */
206
207/* the linebreak mask is set up by Unicode_Init below */
208
Antoine Pitrouf068f942010-01-13 14:19:12 +0000209#if LONG_BIT >= 128
210#define BLOOM_WIDTH 128
211#elif LONG_BIT >= 64
212#define BLOOM_WIDTH 64
213#elif LONG_BIT >= 32
214#define BLOOM_WIDTH 32
215#else
216#error "LONG_BIT is smaller than 32"
217#endif
218
Thomas Wouters477c8d52006-05-27 19:21:47 +0000219#define BLOOM_MASK unsigned long
220
221static BLOOM_MASK bloom_linebreak;
222
Antoine Pitrouf068f942010-01-13 14:19:12 +0000223#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
224#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000225
Benjamin Peterson29060642009-01-31 22:14:21 +0000226#define BLOOM_LINEBREAK(ch) \
227 ((ch) < 128U ? ascii_linebreak[(ch)] : \
228 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000229
230Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
231{
232 /* calculate simple bloom-style bitmask for a given unicode string */
233
Antoine Pitrouf068f942010-01-13 14:19:12 +0000234 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000235 Py_ssize_t i;
236
237 mask = 0;
238 for (i = 0; i < len; i++)
Antoine Pitrouf2c54842010-01-13 08:07:53 +0000239 BLOOM_ADD(mask, ptr[i]);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000240
241 return mask;
242}
243
244Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
245{
246 Py_ssize_t i;
247
248 for (i = 0; i < setlen; i++)
249 if (set[i] == chr)
250 return 1;
251
252 return 0;
253}
254
/* Exact set-membership test with a bloom-filter fast path: the linear
   scan in unicode_member() only runs when the mask says chr may be
   present.  The expansion is fully parenthesized so that it remains a
   single operand inside larger expressions (e.g. after '!').  chr may
   be evaluated twice, so it must not have side effects. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
257
/* --- Unicode Object ----------------------------------------------------- */
259
260static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000261int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +0000262 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263{
264 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000265
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000266 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000267 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000268 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000269
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000270 /* Resizing shared object (unicode_empty or single character
271 objects) in-place is not allowed. Use PyUnicode_Resize()
272 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000273
Benjamin Peterson14339b62009-01-31 16:36:08 +0000274 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000275 (unicode->length == 1 &&
276 unicode->str[0] < 256U &&
277 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000278 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000279 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 return -1;
281 }
282
Thomas Wouters477c8d52006-05-27 19:21:47 +0000283 /* We allocate one more byte to make sure the string is Ux0000 terminated.
284 The overallocation is also used by fastsearch, which assumes that it's
285 safe to look at str[length] (without making any assumptions about what
286 it contains). */
287
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000289 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000290 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000292 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000293 PyErr_NoMemory();
294 return -1;
295 }
296 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000297 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000298
Benjamin Peterson29060642009-01-31 22:14:21 +0000299 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000300 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000301 if (unicode->defenc) {
302 Py_DECREF(unicode->defenc);
303 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000304 }
305 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000306
Guido van Rossumd57fd912000-03-10 22:53:23 +0000307 return 0;
308}
309
310/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000311 Ux0000 terminated; some code (e.g. new_identifier)
312 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000313
314 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000315 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000316
317*/
318
319static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000320PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000321{
322 register PyUnicodeObject *unicode;
323
Thomas Wouters477c8d52006-05-27 19:21:47 +0000324 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000325 if (length == 0 && unicode_empty != NULL) {
326 Py_INCREF(unicode_empty);
327 return unicode_empty;
328 }
329
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000330 /* Ensure we won't overflow the size. */
331 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
332 return (PyUnicodeObject *)PyErr_NoMemory();
333 }
334
Guido van Rossumd57fd912000-03-10 22:53:23 +0000335 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000336 if (free_list) {
337 unicode = free_list;
338 free_list = *(PyUnicodeObject **)unicode;
339 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000340 if (unicode->str) {
341 /* Keep-Alive optimization: we only upsize the buffer,
342 never downsize it. */
343 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000344 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000345 PyObject_DEL(unicode->str);
346 unicode->str = NULL;
347 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000348 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000349 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000350 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
351 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000352 }
353 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000354 }
355 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000356 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000357 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000358 if (unicode == NULL)
359 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000360 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
361 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000362 }
363
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000364 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000365 PyErr_NoMemory();
366 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000367 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000368 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000369 * the caller fails before initializing str -- unicode_resize()
370 * reads str[0], and the Keep-Alive optimization can keep memory
371 * allocated for str alive across a call to unicode_dealloc(unicode).
372 * We don't want unicode_resize to read uninitialized memory in
373 * that case.
374 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000375 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000376 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000377 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000378 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000379 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000380 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000381 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000382
Benjamin Peterson29060642009-01-31 22:14:21 +0000383 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000384 /* XXX UNREF/NEWREF interface should be more symmetrical */
385 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000386 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000387 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000388 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389}
390
391static
Guido van Rossum9475a232001-10-05 20:51:39 +0000392void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000393{
Walter Dörwald16807132007-05-25 13:52:07 +0000394 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000395 case SSTATE_NOT_INTERNED:
396 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000397
Benjamin Peterson29060642009-01-31 22:14:21 +0000398 case SSTATE_INTERNED_MORTAL:
399 /* revive dead object temporarily for DelItem */
400 Py_REFCNT(unicode) = 3;
401 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
402 Py_FatalError(
403 "deletion of interned string failed");
404 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000405
Benjamin Peterson29060642009-01-31 22:14:21 +0000406 case SSTATE_INTERNED_IMMORTAL:
407 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000408
Benjamin Peterson29060642009-01-31 22:14:21 +0000409 default:
410 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000411 }
412
Guido van Rossum604ddf82001-12-06 20:03:56 +0000413 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000414 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000415 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000416 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
417 PyObject_DEL(unicode->str);
418 unicode->str = NULL;
419 unicode->length = 0;
420 }
421 if (unicode->defenc) {
422 Py_DECREF(unicode->defenc);
423 unicode->defenc = NULL;
424 }
425 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000426 *(PyUnicodeObject **)unicode = free_list;
427 free_list = unicode;
428 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000429 }
430 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000431 PyObject_DEL(unicode->str);
432 Py_XDECREF(unicode->defenc);
433 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000434 }
435}
436
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000437static
438int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000439{
440 register PyUnicodeObject *v;
441
442 /* Argument checks */
443 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000444 PyErr_BadInternalCall();
445 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000446 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000447 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000448 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000449 PyErr_BadInternalCall();
450 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000451 }
452
453 /* Resizing unicode_empty and single character objects is not
454 possible since these are being shared. We simply return a fresh
455 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000456 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000457 (v == unicode_empty || v->length == 1)) {
458 PyUnicodeObject *w = _PyUnicode_New(length);
459 if (w == NULL)
460 return -1;
461 Py_UNICODE_COPY(w->str, v->str,
462 length < v->length ? length : v->length);
463 Py_DECREF(*unicode);
464 *unicode = w;
465 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000466 }
467
468 /* Note that we don't have to modify *unicode for unshared Unicode
469 objects, since we can modify them in-place. */
470 return unicode_resize(v, length);
471}
472
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000473int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
474{
475 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
476}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000477
Guido van Rossumd57fd912000-03-10 22:53:23 +0000478PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson29060642009-01-31 22:14:21 +0000479 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000480{
481 PyUnicodeObject *unicode;
482
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000483 /* If the Unicode data is known at construction time, we can apply
484 some optimizations which share commonly used objects. */
485 if (u != NULL) {
486
Benjamin Peterson29060642009-01-31 22:14:21 +0000487 /* Optimization for empty strings */
488 if (size == 0 && unicode_empty != NULL) {
489 Py_INCREF(unicode_empty);
490 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000491 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000492
493 /* Single character Unicode objects in the Latin-1 range are
494 shared when using this constructor */
495 if (size == 1 && *u < 256) {
496 unicode = unicode_latin1[*u];
497 if (!unicode) {
498 unicode = _PyUnicode_New(1);
499 if (!unicode)
500 return NULL;
501 unicode->str[0] = *u;
502 unicode_latin1[*u] = unicode;
503 }
504 Py_INCREF(unicode);
505 return (PyObject *)unicode;
506 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000507 }
Tim Petersced69f82003-09-16 20:30:58 +0000508
Guido van Rossumd57fd912000-03-10 22:53:23 +0000509 unicode = _PyUnicode_New(size);
510 if (!unicode)
511 return NULL;
512
513 /* Copy the Unicode data into the new object */
514 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000515 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000516
517 return (PyObject *)unicode;
518}
519
Walter Dörwaldd2034312007-05-18 16:29:38 +0000520PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000521{
522 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000523
Benjamin Peterson14339b62009-01-31 16:36:08 +0000524 if (size < 0) {
525 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000526 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000527 return NULL;
528 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000529
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000530 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000531 some optimizations which share commonly used objects.
532 Also, this means the input must be UTF-8, so fall back to the
533 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000534 if (u != NULL) {
535
Benjamin Peterson29060642009-01-31 22:14:21 +0000536 /* Optimization for empty strings */
537 if (size == 0 && unicode_empty != NULL) {
538 Py_INCREF(unicode_empty);
539 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000540 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000541
542 /* Single characters are shared when using this constructor.
543 Restrict to ASCII, since the input must be UTF-8. */
544 if (size == 1 && Py_CHARMASK(*u) < 128) {
545 unicode = unicode_latin1[Py_CHARMASK(*u)];
546 if (!unicode) {
547 unicode = _PyUnicode_New(1);
548 if (!unicode)
549 return NULL;
550 unicode->str[0] = Py_CHARMASK(*u);
551 unicode_latin1[Py_CHARMASK(*u)] = unicode;
552 }
553 Py_INCREF(unicode);
554 return (PyObject *)unicode;
555 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000556
557 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000558 }
559
Walter Dörwald55507312007-05-18 13:12:10 +0000560 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000561 if (!unicode)
562 return NULL;
563
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000564 return (PyObject *)unicode;
565}
566
Walter Dörwaldd2034312007-05-18 16:29:38 +0000567PyObject *PyUnicode_FromString(const char *u)
568{
569 size_t size = strlen(u);
570 if (size > PY_SSIZE_T_MAX) {
571 PyErr_SetString(PyExc_OverflowError, "input too long");
572 return NULL;
573 }
574
575 return PyUnicode_FromStringAndSize(u, size);
576}
577
Guido van Rossumd57fd912000-03-10 22:53:23 +0000578#ifdef HAVE_WCHAR_H
579
Mark Dickinson081dfee2009-03-18 14:47:41 +0000580#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
581# define CONVERT_WCHAR_TO_SURROGATES
582#endif
583
584#ifdef CONVERT_WCHAR_TO_SURROGATES
585
586/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
587 to convert from UTF32 to UTF16. */
588
589PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
590 Py_ssize_t size)
591{
592 PyUnicodeObject *unicode;
593 register Py_ssize_t i;
594 Py_ssize_t alloc;
595 const wchar_t *orig_w;
596
597 if (w == NULL) {
598 if (size == 0)
599 return PyUnicode_FromStringAndSize(NULL, 0);
600 PyErr_BadInternalCall();
601 return NULL;
602 }
603
604 if (size == -1) {
605 size = wcslen(w);
606 }
607
608 alloc = size;
609 orig_w = w;
610 for (i = size; i > 0; i--) {
611 if (*w > 0xFFFF)
612 alloc++;
613 w++;
614 }
615 w = orig_w;
616 unicode = _PyUnicode_New(alloc);
617 if (!unicode)
618 return NULL;
619
620 /* Copy the wchar_t data into the new object */
621 {
622 register Py_UNICODE *u;
623 u = PyUnicode_AS_UNICODE(unicode);
624 for (i = size; i > 0; i--) {
625 if (*w > 0xFFFF) {
626 wchar_t ordinal = *w++;
627 ordinal -= 0x10000;
628 *u++ = 0xD800 | (ordinal >> 10);
629 *u++ = 0xDC00 | (ordinal & 0x3FF);
630 }
631 else
632 *u++ = *w++;
633 }
634 }
635 return (PyObject *)unicode;
636}
637
638#else
639
Guido van Rossumd57fd912000-03-10 22:53:23 +0000640PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson29060642009-01-31 22:14:21 +0000641 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000642{
643 PyUnicodeObject *unicode;
644
645 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000646 if (size == 0)
647 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000648 PyErr_BadInternalCall();
649 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000650 }
651
Martin v. Löwis790465f2008-04-05 20:41:37 +0000652 if (size == -1) {
653 size = wcslen(w);
654 }
655
Guido van Rossumd57fd912000-03-10 22:53:23 +0000656 unicode = _PyUnicode_New(size);
657 if (!unicode)
658 return NULL;
659
660 /* Copy the wchar_t data into the new object */
661#ifdef HAVE_USABLE_WCHAR_T
662 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000663#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000664 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000665 register Py_UNICODE *u;
666 register Py_ssize_t i;
667 u = PyUnicode_AS_UNICODE(unicode);
668 for (i = size; i > 0; i--)
669 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000670 }
671#endif
672
673 return (PyObject *)unicode;
674}
675
Mark Dickinson081dfee2009-03-18 14:47:41 +0000676#endif /* CONVERT_WCHAR_TO_SURROGATES */
677
678#undef CONVERT_WCHAR_TO_SURROGATES
679
Walter Dörwald346737f2007-05-31 10:44:43 +0000680static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000681makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
682 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +0000683{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000684 *fmt++ = '%';
685 if (width) {
686 if (zeropad)
687 *fmt++ = '0';
688 fmt += sprintf(fmt, "%d", width);
689 }
690 if (precision)
691 fmt += sprintf(fmt, ".%d", precision);
692 if (longflag)
693 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000694 else if (longlongflag) {
695 /* longlongflag should only ever be nonzero on machines with
696 HAVE_LONG_LONG defined */
697#ifdef HAVE_LONG_LONG
698 char *f = PY_FORMAT_LONG_LONG;
699 while (*f)
700 *fmt++ = *f++;
701#else
702 /* we shouldn't ever get here */
703 assert(0);
704 *fmt++ = 'l';
705#endif
706 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000707 else if (size_tflag) {
708 char *f = PY_FORMAT_SIZE_T;
709 while (*f)
710 *fmt++ = *f++;
711 }
712 *fmt++ = c;
713 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000714}
715
Walter Dörwaldd2034312007-05-18 16:29:38 +0000716#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
717
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000718/* size of fixed-size buffer for formatting single arguments */
719#define ITEM_BUFFER_LEN 21
720/* maximum number of characters required for output of %ld. 21 characters
721 allows for 64-bit integers (in decimal) and an optional sign. */
722#define MAX_LONG_CHARS 21
723/* maximum number of characters required for output of %lld.
724 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
725 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
726#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
727
Walter Dörwaldd2034312007-05-18 16:29:38 +0000728PyObject *
729PyUnicode_FromFormatV(const char *format, va_list vargs)
730{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000731 va_list count;
732 Py_ssize_t callcount = 0;
733 PyObject **callresults = NULL;
734 PyObject **callresult = NULL;
735 Py_ssize_t n = 0;
736 int width = 0;
737 int precision = 0;
738 int zeropad;
739 const char* f;
740 Py_UNICODE *s;
741 PyObject *string;
742 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000743 char buffer[ITEM_BUFFER_LEN+1];
Benjamin Peterson14339b62009-01-31 16:36:08 +0000744 /* use abuffer instead of buffer, if we need more space
745 * (which can happen if there's a format specifier with width). */
746 char *abuffer = NULL;
747 char *realbuffer;
748 Py_ssize_t abuffersize = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000749 char fmt[61]; /* should be enough for %0width.precisionlld */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000750 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000751
752#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson14339b62009-01-31 16:36:08 +0000753 Py_MEMCPY(count, vargs, sizeof(va_list));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000754#else
755#ifdef __va_copy
Benjamin Peterson14339b62009-01-31 16:36:08 +0000756 __va_copy(count, vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000757#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000758 count = vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000759#endif
760#endif
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000761 /* step 1: count the number of %S/%R/%A/%s format specifications
762 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
763 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
764 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000765 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000766 if (*f == '%') {
767 if (*(f+1)=='%')
768 continue;
769 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')
770 ++callcount;
771 while (ISDIGIT((unsigned)*f))
772 width = (width*10) + *f++ - '0';
773 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
774 ;
775 if (*f == 's')
776 ++callcount;
777 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000778 }
779 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000780 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000781 if (callcount) {
782 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
783 if (!callresults) {
784 PyErr_NoMemory();
785 return NULL;
786 }
787 callresult = callresults;
788 }
789 /* step 3: figure out how large a buffer we need */
790 for (f = format; *f; f++) {
791 if (*f == '%') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000792#ifdef HAVE_LONG_LONG
793 int longlongflag = 0;
794#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000795 const char* p = f;
796 width = 0;
797 while (ISDIGIT((unsigned)*f))
798 width = (width*10) + *f++ - '0';
799 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
800 ;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000801
Benjamin Peterson14339b62009-01-31 16:36:08 +0000802 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
803 * they don't affect the amount of space we reserve.
804 */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000805 if (*f == 'l') {
806 if (f[1] == 'd' || f[1] == 'u') {
807 ++f;
808 }
809#ifdef HAVE_LONG_LONG
810 else if (f[1] == 'l' &&
811 (f[2] == 'd' || f[2] == 'u')) {
812 longlongflag = 1;
813 f += 2;
814 }
815#endif
816 }
817 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000818 ++f;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000819 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000820
Benjamin Peterson14339b62009-01-31 16:36:08 +0000821 switch (*f) {
822 case 'c':
823 (void)va_arg(count, int);
824 /* fall through... */
825 case '%':
826 n++;
827 break;
828 case 'd': case 'u': case 'i': case 'x':
829 (void) va_arg(count, int);
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000830#ifdef HAVE_LONG_LONG
831 if (longlongflag) {
832 if (width < MAX_LONG_LONG_CHARS)
833 width = MAX_LONG_LONG_CHARS;
834 }
835 else
836#endif
837 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
838 including sign. Decimal takes the most space. This
839 isn't enough for octal. If a width is specified we
840 need more (which we allocate later). */
841 if (width < MAX_LONG_CHARS)
842 width = MAX_LONG_CHARS;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000843 n += width;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000844 /* XXX should allow for large precision here too. */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000845 if (abuffersize < width)
846 abuffersize = width;
847 break;
848 case 's':
849 {
850 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000851 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000852 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
853 if (!str)
854 goto fail;
855 n += PyUnicode_GET_SIZE(str);
856 /* Remember the str and switch to the next slot */
857 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000858 break;
859 }
860 case 'U':
861 {
862 PyObject *obj = va_arg(count, PyObject *);
863 assert(obj && PyUnicode_Check(obj));
864 n += PyUnicode_GET_SIZE(obj);
865 break;
866 }
867 case 'V':
868 {
869 PyObject *obj = va_arg(count, PyObject *);
870 const char *str = va_arg(count, const char *);
871 assert(obj || str);
872 assert(!obj || PyUnicode_Check(obj));
873 if (obj)
874 n += PyUnicode_GET_SIZE(obj);
875 else
876 n += strlen(str);
877 break;
878 }
879 case 'S':
880 {
881 PyObject *obj = va_arg(count, PyObject *);
882 PyObject *str;
883 assert(obj);
884 str = PyObject_Str(obj);
885 if (!str)
886 goto fail;
887 n += PyUnicode_GET_SIZE(str);
888 /* Remember the str and switch to the next slot */
889 *callresult++ = str;
890 break;
891 }
892 case 'R':
893 {
894 PyObject *obj = va_arg(count, PyObject *);
895 PyObject *repr;
896 assert(obj);
897 repr = PyObject_Repr(obj);
898 if (!repr)
899 goto fail;
900 n += PyUnicode_GET_SIZE(repr);
901 /* Remember the repr and switch to the next slot */
902 *callresult++ = repr;
903 break;
904 }
905 case 'A':
906 {
907 PyObject *obj = va_arg(count, PyObject *);
908 PyObject *ascii;
909 assert(obj);
910 ascii = PyObject_ASCII(obj);
911 if (!ascii)
912 goto fail;
913 n += PyUnicode_GET_SIZE(ascii);
914 /* Remember the repr and switch to the next slot */
915 *callresult++ = ascii;
916 break;
917 }
918 case 'p':
919 (void) va_arg(count, int);
920 /* maximum 64-bit pointer representation:
921 * 0xffffffffffffffff
922 * so 19 characters is enough.
923 * XXX I count 18 -- what's the extra for?
924 */
925 n += 19;
926 break;
927 default:
928 /* if we stumble upon an unknown
929 formatting code, copy the rest of
930 the format string to the output
931 string. (we cannot just skip the
932 code, since there's no way to know
933 what's in the argument list) */
934 n += strlen(p);
935 goto expand;
936 }
937 } else
938 n++;
939 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000940 expand:
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000941 if (abuffersize > ITEM_BUFFER_LEN) {
942 /* add 1 for sprintf's trailing null byte */
943 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +0000944 if (!abuffer) {
945 PyErr_NoMemory();
946 goto fail;
947 }
948 realbuffer = abuffer;
949 }
950 else
951 realbuffer = buffer;
952 /* step 4: fill the buffer */
953 /* Since we've analyzed how much space we need for the worst case,
954 we don't have to resize the string.
955 There can be no errors beyond this point. */
956 string = PyUnicode_FromUnicode(NULL, n);
957 if (!string)
958 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000959
Benjamin Peterson14339b62009-01-31 16:36:08 +0000960 s = PyUnicode_AS_UNICODE(string);
961 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000962
Benjamin Peterson14339b62009-01-31 16:36:08 +0000963 for (f = format; *f; f++) {
964 if (*f == '%') {
965 const char* p = f++;
966 int longflag = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000967 int longlongflag = 0;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000968 int size_tflag = 0;
969 zeropad = (*f == '0');
970 /* parse the width.precision part */
971 width = 0;
972 while (ISDIGIT((unsigned)*f))
973 width = (width*10) + *f++ - '0';
974 precision = 0;
975 if (*f == '.') {
976 f++;
977 while (ISDIGIT((unsigned)*f))
978 precision = (precision*10) + *f++ - '0';
979 }
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000980 /* Handle %ld, %lu, %lld and %llu. */
981 if (*f == 'l') {
982 if (f[1] == 'd' || f[1] == 'u') {
983 longflag = 1;
984 ++f;
985 }
986#ifdef HAVE_LONG_LONG
987 else if (f[1] == 'l' &&
988 (f[2] == 'd' || f[2] == 'u')) {
989 longlongflag = 1;
990 f += 2;
991 }
992#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000993 }
994 /* handle the size_t flag. */
995 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
996 size_tflag = 1;
997 ++f;
998 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000999
Benjamin Peterson14339b62009-01-31 16:36:08 +00001000 switch (*f) {
1001 case 'c':
1002 *s++ = va_arg(vargs, int);
1003 break;
1004 case 'd':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001005 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1006 width, precision, 'd');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001007 if (longflag)
1008 sprintf(realbuffer, fmt, va_arg(vargs, long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001009#ifdef HAVE_LONG_LONG
1010 else if (longlongflag)
1011 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1012#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001013 else if (size_tflag)
1014 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1015 else
1016 sprintf(realbuffer, fmt, va_arg(vargs, int));
1017 appendstring(realbuffer);
1018 break;
1019 case 'u':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001020 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1021 width, precision, 'u');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001022 if (longflag)
1023 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001024#ifdef HAVE_LONG_LONG
1025 else if (longlongflag)
1026 sprintf(realbuffer, fmt, va_arg(vargs,
1027 unsigned PY_LONG_LONG));
1028#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001029 else if (size_tflag)
1030 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1031 else
1032 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1033 appendstring(realbuffer);
1034 break;
1035 case 'i':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001036 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001037 sprintf(realbuffer, fmt, va_arg(vargs, int));
1038 appendstring(realbuffer);
1039 break;
1040 case 'x':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001041 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001042 sprintf(realbuffer, fmt, va_arg(vargs, int));
1043 appendstring(realbuffer);
1044 break;
1045 case 's':
1046 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001047 /* unused, since we already have the result */
1048 (void) va_arg(vargs, char *);
1049 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1050 PyUnicode_GET_SIZE(*callresult));
1051 s += PyUnicode_GET_SIZE(*callresult);
1052 /* We're done with the unicode()/repr() => forget it */
1053 Py_DECREF(*callresult);
1054 /* switch to next unicode()/repr() result */
1055 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001056 break;
1057 }
1058 case 'U':
1059 {
1060 PyObject *obj = va_arg(vargs, PyObject *);
1061 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1062 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1063 s += size;
1064 break;
1065 }
1066 case 'V':
1067 {
1068 PyObject *obj = va_arg(vargs, PyObject *);
1069 const char *str = va_arg(vargs, const char *);
1070 if (obj) {
1071 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1072 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1073 s += size;
1074 } else {
1075 appendstring(str);
1076 }
1077 break;
1078 }
1079 case 'S':
1080 case 'R':
1081 {
1082 Py_UNICODE *ucopy;
1083 Py_ssize_t usize;
1084 Py_ssize_t upos;
1085 /* unused, since we already have the result */
1086 (void) va_arg(vargs, PyObject *);
1087 ucopy = PyUnicode_AS_UNICODE(*callresult);
1088 usize = PyUnicode_GET_SIZE(*callresult);
1089 for (upos = 0; upos<usize;)
1090 *s++ = ucopy[upos++];
1091 /* We're done with the unicode()/repr() => forget it */
1092 Py_DECREF(*callresult);
1093 /* switch to next unicode()/repr() result */
1094 ++callresult;
1095 break;
1096 }
1097 case 'p':
1098 sprintf(buffer, "%p", va_arg(vargs, void*));
1099 /* %p is ill-defined: ensure leading 0x. */
1100 if (buffer[1] == 'X')
1101 buffer[1] = 'x';
1102 else if (buffer[1] != 'x') {
1103 memmove(buffer+2, buffer, strlen(buffer)+1);
1104 buffer[0] = '0';
1105 buffer[1] = 'x';
1106 }
1107 appendstring(buffer);
1108 break;
1109 case '%':
1110 *s++ = '%';
1111 break;
1112 default:
1113 appendstring(p);
1114 goto end;
1115 }
1116 } else
1117 *s++ = *f;
1118 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001119
Benjamin Peterson29060642009-01-31 22:14:21 +00001120 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001121 if (callresults)
1122 PyObject_Free(callresults);
1123 if (abuffer)
1124 PyObject_Free(abuffer);
1125 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1126 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001127 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001128 if (callresults) {
1129 PyObject **callresult2 = callresults;
1130 while (callresult2 < callresult) {
1131 Py_DECREF(*callresult2);
1132 ++callresult2;
1133 }
1134 PyObject_Free(callresults);
1135 }
1136 if (abuffer)
1137 PyObject_Free(abuffer);
1138 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001139}
1140
1141#undef appendstring
1142
1143PyObject *
1144PyUnicode_FromFormat(const char *format, ...)
1145{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001146 PyObject* ret;
1147 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001148
1149#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001150 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001151#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001152 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001153#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001154 ret = PyUnicode_FromFormatV(format, vargs);
1155 va_end(vargs);
1156 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001157}
1158
Martin v. Löwis18e16552006-02-15 17:27:45 +00001159Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001160 wchar_t *w,
1161 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001162{
1163 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001164 PyErr_BadInternalCall();
1165 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001166 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001167
1168 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001169 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00001170 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001171
Guido van Rossumd57fd912000-03-10 22:53:23 +00001172#ifdef HAVE_USABLE_WCHAR_T
1173 memcpy(w, unicode->str, size * sizeof(wchar_t));
1174#else
1175 {
Benjamin Peterson29060642009-01-31 22:14:21 +00001176 register Py_UNICODE *u;
1177 register Py_ssize_t i;
1178 u = PyUnicode_AS_UNICODE(unicode);
1179 for (i = size; i > 0; i--)
1180 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001181 }
1182#endif
1183
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001184 if (size > PyUnicode_GET_SIZE(unicode))
1185 return PyUnicode_GET_SIZE(unicode);
1186 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001187 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001188}
1189
1190#endif
1191
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001192PyObject *PyUnicode_FromOrdinal(int ordinal)
1193{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001194 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001195
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001196 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001197 PyErr_SetString(PyExc_ValueError,
1198 "chr() arg not in range(0x110000)");
1199 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001200 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001201
1202#ifndef Py_UNICODE_WIDE
1203 if (ordinal > 0xffff) {
1204 ordinal -= 0x10000;
1205 s[0] = 0xD800 | (ordinal >> 10);
1206 s[1] = 0xDC00 | (ordinal & 0x3FF);
1207 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001208 }
1209#endif
1210
Hye-Shik Chang40574832004-04-06 07:24:51 +00001211 s[0] = (Py_UNICODE)ordinal;
1212 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001213}
1214
Guido van Rossumd57fd912000-03-10 22:53:23 +00001215PyObject *PyUnicode_FromObject(register PyObject *obj)
1216{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001217 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001218 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001219 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001220 Py_INCREF(obj);
1221 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001222 }
1223 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001224 /* For a Unicode subtype that's not a Unicode object,
1225 return a true Unicode object with the same data. */
1226 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1227 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001228 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001229 PyErr_Format(PyExc_TypeError,
1230 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001231 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001232 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001233}
1234
1235PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00001236 const char *encoding,
1237 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001238{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001239 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001240 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001241 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001242
Guido van Rossumd57fd912000-03-10 22:53:23 +00001243 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001244 PyErr_BadInternalCall();
1245 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001246 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001247
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001248 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001249 PyErr_SetString(PyExc_TypeError,
1250 "decoding str is not supported");
1251 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001252 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001253
1254 /* Coerce object */
Christian Heimes72b710a2008-05-26 13:28:38 +00001255 if (PyBytes_Check(obj)) {
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001256 s = PyBytes_AS_STRING(obj);
1257 len = PyBytes_GET_SIZE(obj);
1258 }
1259 else if (PyByteArray_Check(obj)) {
1260 s = PyByteArray_AS_STRING(obj);
1261 len = PyByteArray_GET_SIZE(obj);
1262 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001263 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001264 /* Overwrite the error message with something more useful in
1265 case of a TypeError. */
1266 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001267 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001268 "coercing to str: need string or buffer, "
1269 "%.80s found",
1270 Py_TYPE(obj)->tp_name);
1271 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001272 }
Tim Petersced69f82003-09-16 20:30:58 +00001273
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001274 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001275 if (len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001276 Py_INCREF(unicode_empty);
1277 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001278 }
Tim Petersced69f82003-09-16 20:30:58 +00001279 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001280 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001281
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001282 return v;
1283
Benjamin Peterson29060642009-01-31 22:14:21 +00001284 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001285 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001286}
1287
1288PyObject *PyUnicode_Decode(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001289 Py_ssize_t size,
1290 const char *encoding,
1291 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001292{
1293 PyObject *buffer = NULL, *unicode;
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001294 Py_buffer info;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001295 char lower[20]; /* Enough for any encoding name we recognize */
1296 char *l;
1297 const char *e;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001298
1299 if (encoding == NULL)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001300 encoding = PyUnicode_GetDefaultEncoding();
1301
1302 /* Convert encoding to lower case and replace '_' with '-' in order to
1303 catch e.g. UTF_8 */
1304 e = encoding;
1305 l = lower;
1306 while (*e && l < &lower[(sizeof lower) - 2]) {
1307 if (ISUPPER(*e)) {
1308 *l++ = TOLOWER(*e++);
1309 }
1310 else if (*e == '_') {
1311 *l++ = '-';
1312 e++;
1313 }
1314 else {
1315 *l++ = *e++;
1316 }
1317 }
1318 *l = '\0';
Fred Drakee4315f52000-05-09 19:53:39 +00001319
1320 /* Shortcuts for common default encodings */
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001321 if (strcmp(lower, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001322 return PyUnicode_DecodeUTF8(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001323 else if ((strcmp(lower, "latin-1") == 0) ||
1324 (strcmp(lower, "iso-8859-1") == 0))
Fred Drakee4315f52000-05-09 19:53:39 +00001325 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001326#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001327 else if (strcmp(lower, "mbcs") == 0)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001328 return PyUnicode_DecodeMBCS(s, size, errors);
1329#endif
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001330 else if (strcmp(lower, "ascii") == 0)
Fred Drakee4315f52000-05-09 19:53:39 +00001331 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001332 else if (strcmp(lower, "utf-16") == 0)
1333 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1334 else if (strcmp(lower, "utf-32") == 0)
1335 return PyUnicode_DecodeUTF32(s, size, errors, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001336
1337 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001338 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001339 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001340 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001341 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001342 if (buffer == NULL)
1343 goto onError;
1344 unicode = PyCodec_Decode(buffer, encoding, errors);
1345 if (unicode == NULL)
1346 goto onError;
1347 if (!PyUnicode_Check(unicode)) {
1348 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001349 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001350 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001351 Py_DECREF(unicode);
1352 goto onError;
1353 }
1354 Py_DECREF(buffer);
1355 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001356
Benjamin Peterson29060642009-01-31 22:14:21 +00001357 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001358 Py_XDECREF(buffer);
1359 return NULL;
1360}
1361
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001362PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1363 const char *encoding,
1364 const char *errors)
1365{
1366 PyObject *v;
1367
1368 if (!PyUnicode_Check(unicode)) {
1369 PyErr_BadArgument();
1370 goto onError;
1371 }
1372
1373 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001374 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001375
1376 /* Decode via the codec registry */
1377 v = PyCodec_Decode(unicode, encoding, errors);
1378 if (v == NULL)
1379 goto onError;
1380 return v;
1381
Benjamin Peterson29060642009-01-31 22:14:21 +00001382 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001383 return NULL;
1384}
1385
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001386PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1387 const char *encoding,
1388 const char *errors)
1389{
1390 PyObject *v;
1391
1392 if (!PyUnicode_Check(unicode)) {
1393 PyErr_BadArgument();
1394 goto onError;
1395 }
1396
1397 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001398 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001399
1400 /* Decode via the codec registry */
1401 v = PyCodec_Decode(unicode, encoding, errors);
1402 if (v == NULL)
1403 goto onError;
1404 if (!PyUnicode_Check(v)) {
1405 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001406 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001407 Py_TYPE(v)->tp_name);
1408 Py_DECREF(v);
1409 goto onError;
1410 }
1411 return v;
1412
Benjamin Peterson29060642009-01-31 22:14:21 +00001413 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001414 return NULL;
1415}
1416
Guido van Rossumd57fd912000-03-10 22:53:23 +00001417PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001418 Py_ssize_t size,
1419 const char *encoding,
1420 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001421{
1422 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001423
Guido van Rossumd57fd912000-03-10 22:53:23 +00001424 unicode = PyUnicode_FromUnicode(s, size);
1425 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001426 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001427 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1428 Py_DECREF(unicode);
1429 return v;
1430}
1431
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001432PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1433 const char *encoding,
1434 const char *errors)
1435{
1436 PyObject *v;
1437
1438 if (!PyUnicode_Check(unicode)) {
1439 PyErr_BadArgument();
1440 goto onError;
1441 }
1442
1443 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001444 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001445
1446 /* Encode via the codec registry */
1447 v = PyCodec_Encode(unicode, encoding, errors);
1448 if (v == NULL)
1449 goto onError;
1450 return v;
1451
Benjamin Peterson29060642009-01-31 22:14:21 +00001452 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001453 return NULL;
1454}
1455
Guido van Rossumd57fd912000-03-10 22:53:23 +00001456PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1457 const char *encoding,
1458 const char *errors)
1459{
1460 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001461
Guido van Rossumd57fd912000-03-10 22:53:23 +00001462 if (!PyUnicode_Check(unicode)) {
1463 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001464 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001465 }
Fred Drakee4315f52000-05-09 19:53:39 +00001466
Tim Petersced69f82003-09-16 20:30:58 +00001467 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001468 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001469
1470 /* Shortcuts for common default encodings */
1471 if (errors == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001472 if (strcmp(encoding, "utf-8") == 0)
1473 return PyUnicode_AsUTF8String(unicode);
1474 else if (strcmp(encoding, "latin-1") == 0)
1475 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001476#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Benjamin Peterson29060642009-01-31 22:14:21 +00001477 else if (strcmp(encoding, "mbcs") == 0)
1478 return PyUnicode_AsMBCSString(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001479#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00001480 else if (strcmp(encoding, "ascii") == 0)
1481 return PyUnicode_AsASCIIString(unicode);
Christian Heimes6a27efa2008-10-30 21:48:26 +00001482 /* During bootstrap, we may need to find the encodings
1483 package, to load the file system encoding, and require the
1484 file system encoding in order to load the encodings
1485 package.
1486
1487 Break out of this dependency by assuming that the path to
1488 the encodings module is ASCII-only. XXX could try wcstombs
1489 instead, if the file system encoding is the locale's
1490 encoding. */
1491 else if (Py_FileSystemDefaultEncoding &&
1492 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1493 !PyThreadState_GET()->interp->codecs_initialized)
Benjamin Peterson29060642009-01-31 22:14:21 +00001494 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001495 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001496
1497 /* Encode via the codec registry */
1498 v = PyCodec_Encode(unicode, encoding, errors);
1499 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001500 return NULL;
1501
1502 /* The normal path */
1503 if (PyBytes_Check(v))
1504 return v;
1505
1506 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001507 if (PyByteArray_Check(v)) {
1508 char msg[100];
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001509 PyObject *b;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001510 PyOS_snprintf(msg, sizeof(msg),
1511 "encoder %s returned buffer instead of bytes",
1512 encoding);
1513 if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001514 Py_DECREF(v);
1515 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001516 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001517
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001518 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1519 Py_DECREF(v);
1520 return b;
1521 }
1522
1523 PyErr_Format(PyExc_TypeError,
1524 "encoder did not return a bytes object (type=%.400s)",
1525 Py_TYPE(v)->tp_name);
1526 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001527 return NULL;
1528}
1529
1530PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1531 const char *encoding,
1532 const char *errors)
1533{
1534 PyObject *v;
1535
1536 if (!PyUnicode_Check(unicode)) {
1537 PyErr_BadArgument();
1538 goto onError;
1539 }
1540
1541 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001542 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001543
1544 /* Encode via the codec registry */
1545 v = PyCodec_Encode(unicode, encoding, errors);
1546 if (v == NULL)
1547 goto onError;
1548 if (!PyUnicode_Check(v)) {
1549 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001550 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001551 Py_TYPE(v)->tp_name);
1552 Py_DECREF(v);
1553 goto onError;
1554 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001555 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001556
Benjamin Peterson29060642009-01-31 22:14:21 +00001557 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001558 return NULL;
1559}
1560
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001561PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001562 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001563{
1564 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001565 if (v)
1566 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001567 if (errors != NULL)
1568 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001569 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001570 PyUnicode_GET_SIZE(unicode),
1571 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001572 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001573 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001574 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001575 return v;
1576}
1577
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001578PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001579PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001580 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001581 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1582}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001583
Christian Heimes5894ba72007-11-04 11:43:14 +00001584PyObject*
1585PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1586{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001587 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1588 can be undefined. If it is case, decode using UTF-8. The following assumes
1589 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1590 bootstrapping process where the codecs aren't ready yet.
1591 */
1592 if (Py_FileSystemDefaultEncoding) {
1593#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001594 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001595 return PyUnicode_DecodeMBCS(s, size, "replace");
1596 }
1597#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001598 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001599 return PyUnicode_DecodeUTF8(s, size, "replace");
1600 }
1601#endif
1602 return PyUnicode_Decode(s, size,
1603 Py_FileSystemDefaultEncoding,
1604 "replace");
1605 }
1606 else {
1607 return PyUnicode_DecodeUTF8(s, size, "replace");
1608 }
1609}
1610
Martin v. Löwis011e8422009-05-05 04:43:17 +00001611/* Convert the argument to a bytes object, according to the file
1612 system encoding */
1613
1614int
1615PyUnicode_FSConverter(PyObject* arg, void* addr)
1616{
1617 PyObject *output = NULL;
1618 Py_ssize_t size;
1619 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001620 if (arg == NULL) {
1621 Py_DECREF(*(PyObject**)addr);
1622 return 1;
1623 }
Martin v. Löwis011e8422009-05-05 04:43:17 +00001624 if (PyBytes_Check(arg) || PyByteArray_Check(arg)) {
1625 output = arg;
1626 Py_INCREF(output);
1627 }
1628 else {
1629 arg = PyUnicode_FromObject(arg);
1630 if (!arg)
1631 return 0;
1632 output = PyUnicode_AsEncodedObject(arg,
1633 Py_FileSystemDefaultEncoding,
Martin v. Löwis43c57782009-05-10 08:15:24 +00001634 "surrogateescape");
Martin v. Löwis011e8422009-05-05 04:43:17 +00001635 Py_DECREF(arg);
1636 if (!output)
1637 return 0;
1638 if (!PyBytes_Check(output)) {
1639 Py_DECREF(output);
1640 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1641 return 0;
1642 }
1643 }
1644 if (PyBytes_Check(output)) {
1645 size = PyBytes_GET_SIZE(output);
1646 data = PyBytes_AS_STRING(output);
1647 }
1648 else {
1649 size = PyByteArray_GET_SIZE(output);
1650 data = PyByteArray_AS_STRING(output);
1651 }
1652 if (size != strlen(data)) {
1653 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1654 Py_DECREF(output);
1655 return 0;
1656 }
1657 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001658 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001659}
1660
1661
Martin v. Löwis5b222132007-06-10 09:51:05 +00001662char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001663_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001664{
Christian Heimesf3863112007-11-22 07:46:41 +00001665 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001666 if (!PyUnicode_Check(unicode)) {
1667 PyErr_BadArgument();
1668 return NULL;
1669 }
Christian Heimesf3863112007-11-22 07:46:41 +00001670 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1671 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001672 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001673 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001674 *psize = PyBytes_GET_SIZE(bytes);
1675 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001676}
1677
1678char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001679_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001680{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001681 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001682}
1683
Guido van Rossumd57fd912000-03-10 22:53:23 +00001684Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1685{
1686 if (!PyUnicode_Check(unicode)) {
1687 PyErr_BadArgument();
1688 goto onError;
1689 }
1690 return PyUnicode_AS_UNICODE(unicode);
1691
Benjamin Peterson29060642009-01-31 22:14:21 +00001692 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001693 return NULL;
1694}
1695
Martin v. Löwis18e16552006-02-15 17:27:45 +00001696Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001697{
1698 if (!PyUnicode_Check(unicode)) {
1699 PyErr_BadArgument();
1700 goto onError;
1701 }
1702 return PyUnicode_GET_SIZE(unicode);
1703
Benjamin Peterson29060642009-01-31 22:14:21 +00001704 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001705 return -1;
1706}
1707
Thomas Wouters78890102000-07-22 19:25:51 +00001708const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001709{
1710 return unicode_default_encoding;
1711}
1712
1713int PyUnicode_SetDefaultEncoding(const char *encoding)
1714{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001715 if (strcmp(encoding, unicode_default_encoding) != 0) {
1716 PyErr_Format(PyExc_ValueError,
1717 "Can only set default encoding to %s",
1718 unicode_default_encoding);
1719 return -1;
1720 }
Fred Drakee4315f52000-05-09 19:53:39 +00001721 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001722}
1723
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001724/* error handling callback helper:
1725 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001726 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001727 and adjust various state variables.
1728 return 0 on success, -1 on error
1729*/
1730
1731static
1732int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00001733 const char *encoding, const char *reason,
1734 const char **input, const char **inend, Py_ssize_t *startinpos,
1735 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1736 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001737{
Benjamin Peterson142957c2008-07-04 19:55:29 +00001738 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001739
1740 PyObject *restuple = NULL;
1741 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001742 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001743 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001744 Py_ssize_t requiredsize;
1745 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001746 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001747 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001748 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001749 int res = -1;
1750
1751 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001752 *errorHandler = PyCodec_LookupError(errors);
1753 if (*errorHandler == NULL)
1754 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001755 }
1756
1757 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001758 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00001759 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
1760 if (*exceptionObject == NULL)
1761 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001762 }
1763 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00001764 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1765 goto onError;
1766 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1767 goto onError;
1768 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1769 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001770 }
1771
1772 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1773 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001774 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001775 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00001776 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00001777 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001778 }
1779 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00001780 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001781
1782 /* Copy back the bytes variables, which might have been modified by the
1783 callback */
1784 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1785 if (!inputobj)
1786 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00001787 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001788 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00001789 }
Christian Heimes72b710a2008-05-26 13:28:38 +00001790 *input = PyBytes_AS_STRING(inputobj);
1791 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001792 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001793 /* we can DECREF safely, as the exception has another reference,
1794 so the object won't go away. */
1795 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001796
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001797 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001798 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001799 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001800 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1801 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001802 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001803
1804 /* need more space? (at least enough for what we
1805 have+the replacement+the rest of the string (starting
1806 at the new input position), so we won't have to check space
1807 when there are no errors in the rest of the string) */
1808 repptr = PyUnicode_AS_UNICODE(repunicode);
1809 repsize = PyUnicode_GET_SIZE(repunicode);
1810 requiredsize = *outpos + repsize + insize-newpos;
1811 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001812 if (requiredsize<2*outsize)
1813 requiredsize = 2*outsize;
1814 if (_PyUnicode_Resize(output, requiredsize) < 0)
1815 goto onError;
1816 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001817 }
1818 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001819 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001820 Py_UNICODE_COPY(*outptr, repptr, repsize);
1821 *outptr += repsize;
1822 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001823
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001824 /* we made it! */
1825 res = 0;
1826
Benjamin Peterson29060642009-01-31 22:14:21 +00001827 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001828 Py_XDECREF(restuple);
1829 return res;
1830}
1831
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001832/* --- UTF-7 Codec -------------------------------------------------------- */
1833
/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Helper macros for the modified base-64 alphabet used by UTF-7. */

/* Is c one of the 64 base-64 characters (A-Z, a-z, 0-9, '+', '/')? */

#define IS_BASE64(c)                        \
    ((c) == '+' || (c) == '/' ||            \
     ((c) >= '0' && (c) <= '9') ||          \
     ((c) >= 'a' && (c) <= 'z') ||          \
     ((c) >= 'A' && (c) <= 'Z'))

/* Value (0..63) of the base-64 character c.  Only meaningful when
   IS_BASE64(c) holds; every character that is not a letter, digit or '+'
   maps to 63. */

#define FROM_BASE64(c)                                  \
    ((c) == '+' ? 62 :                                  \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : 63)

/* Base-64 character for the bottom 6 bits of n. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: does this byte, when met in a UTF-7 string outside a
 * shift sequence, decode as itself?  Decoding is permissive: every ASCII
 * byte does, except '+' which opens a base-64 section. */

#define DECODE_DIRECT(c)                    \
    ((c) != '+' && (c) <= 127)
1866
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c must be in 1..127 to qualify at all (NUL is never direct); the range
 * test comes first so the table is never indexed out of bounds.  directO
 * and directWS are parenthesized so arbitrary expressions can be passed. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001912
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001913PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001914 Py_ssize_t size,
1915 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001916{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001917 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1918}
1919
Antoine Pitrou244651a2009-05-04 18:56:13 +00001920/* The decoder. The only state we preserve is our read position,
1921 * i.e. how many characters we have consumed. So if we end in the
1922 * middle of a shift sequence we have to back off the read position
1923 * and the output to the beginning of the sequence, otherwise we lose
1924 * all the shift state (seen bits, number of bits seen, high
1925 * surrogate). */
1926
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001927PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001928 Py_ssize_t size,
1929 const char *errors,
1930 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001931{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001932 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001933 Py_ssize_t startinpos;
1934 Py_ssize_t endinpos;
1935 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001936 const char *e;
1937 PyUnicodeObject *unicode;
1938 Py_UNICODE *p;
1939 const char *errmsg = "";
1940 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001941 Py_UNICODE *shiftOutStart;
1942 unsigned int base64bits = 0;
1943 unsigned long base64buffer = 0;
1944 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001945 PyObject *errorHandler = NULL;
1946 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001947
1948 unicode = _PyUnicode_New(size);
1949 if (!unicode)
1950 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001951 if (size == 0) {
1952 if (consumed)
1953 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001954 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001955 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001956
1957 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001958 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001959 e = s + size;
1960
1961 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001962 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00001963 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00001964 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001965
Antoine Pitrou244651a2009-05-04 18:56:13 +00001966 if (inShift) { /* in a base-64 section */
1967 if (IS_BASE64(ch)) { /* consume a base-64 character */
1968 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1969 base64bits += 6;
1970 s++;
1971 if (base64bits >= 16) {
1972 /* we have enough bits for a UTF-16 value */
1973 Py_UNICODE outCh = (Py_UNICODE)
1974 (base64buffer >> (base64bits-16));
1975 base64bits -= 16;
1976 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
1977 if (surrogate) {
1978 /* expecting a second surrogate */
1979 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1980#ifdef Py_UNICODE_WIDE
1981 *p++ = (((surrogate & 0x3FF)<<10)
1982 | (outCh & 0x3FF)) + 0x10000;
1983#else
1984 *p++ = surrogate;
1985 *p++ = outCh;
1986#endif
1987 surrogate = 0;
1988 }
1989 else {
1990 surrogate = 0;
1991 errmsg = "second surrogate missing";
1992 goto utf7Error;
1993 }
1994 }
1995 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
1996 /* first surrogate */
1997 surrogate = outCh;
1998 }
1999 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2000 errmsg = "unexpected second surrogate";
2001 goto utf7Error;
2002 }
2003 else {
2004 *p++ = outCh;
2005 }
2006 }
2007 }
2008 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002009 inShift = 0;
2010 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002011 if (surrogate) {
2012 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00002013 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002014 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002015 if (base64bits > 0) { /* left-over bits */
2016 if (base64bits >= 6) {
2017 /* We've seen at least one base-64 character */
2018 errmsg = "partial character in shift sequence";
2019 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002020 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002021 else {
2022 /* Some bits remain; they should be zero */
2023 if (base64buffer != 0) {
2024 errmsg = "non-zero padding bits in shift sequence";
2025 goto utf7Error;
2026 }
2027 }
2028 }
2029 if (ch != '-') {
2030 /* '-' is absorbed; other terminating
2031 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002032 *p++ = ch;
2033 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002034 }
2035 }
2036 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002037 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002038 s++; /* consume '+' */
2039 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002040 s++;
2041 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00002042 }
2043 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002044 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002045 shiftOutStart = p;
2046 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002047 }
2048 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002049 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002050 *p++ = ch;
2051 s++;
2052 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002053 else {
2054 startinpos = s-starts;
2055 s++;
2056 errmsg = "unexpected special character";
2057 goto utf7Error;
2058 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002059 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002060utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002061 outpos = p-PyUnicode_AS_UNICODE(unicode);
2062 endinpos = s-starts;
2063 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00002064 errors, &errorHandler,
2065 "utf7", errmsg,
2066 &starts, &e, &startinpos, &endinpos, &exc, &s,
2067 &unicode, &outpos, &p))
2068 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002069 }
2070
Antoine Pitrou244651a2009-05-04 18:56:13 +00002071 /* end of string */
2072
2073 if (inShift && !consumed) { /* in shift sequence, no more to follow */
2074 /* if we're in an inconsistent state, that's an error */
2075 if (surrogate ||
2076 (base64bits >= 6) ||
2077 (base64bits > 0 && base64buffer != 0)) {
2078 outpos = p-PyUnicode_AS_UNICODE(unicode);
2079 endinpos = size;
2080 if (unicode_decode_call_errorhandler(
2081 errors, &errorHandler,
2082 "utf7", "unterminated shift sequence",
2083 &starts, &e, &startinpos, &endinpos, &exc, &s,
2084 &unicode, &outpos, &p))
2085 goto onError;
2086 if (s < e)
2087 goto restart;
2088 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002089 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002090
2091 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002092 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002093 if (inShift) {
2094 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002095 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002096 }
2097 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002098 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002099 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002100 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002101
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002102 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002103 goto onError;
2104
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002105 Py_XDECREF(errorHandler);
2106 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002107 return (PyObject *)unicode;
2108
Benjamin Peterson29060642009-01-31 22:14:21 +00002109 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002110 Py_XDECREF(errorHandler);
2111 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002112 Py_DECREF(unicode);
2113 return NULL;
2114}
2115
2116
2117PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002118 Py_ssize_t size,
Antoine Pitrou244651a2009-05-04 18:56:13 +00002119 int base64SetO,
2120 int base64WhiteSpace,
Benjamin Peterson29060642009-01-31 22:14:21 +00002121 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002122{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002123 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002124 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002125 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002126 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002127 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002128 unsigned int base64bits = 0;
2129 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002130 char * out;
2131 char * start;
2132
2133 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002134 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002135
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002136 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002137 return PyErr_NoMemory();
2138
Antoine Pitrou244651a2009-05-04 18:56:13 +00002139 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002140 if (v == NULL)
2141 return NULL;
2142
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002143 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002144 for (;i < size; ++i) {
2145 Py_UNICODE ch = s[i];
2146
Antoine Pitrou244651a2009-05-04 18:56:13 +00002147 if (inShift) {
2148 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2149 /* shifting out */
2150 if (base64bits) { /* output remaining bits */
2151 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2152 base64buffer = 0;
2153 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002154 }
2155 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002156 /* Characters not in the BASE64 set implicitly unshift the sequence
2157 so no '-' is required, except if the character is itself a '-' */
2158 if (IS_BASE64(ch) || ch == '-') {
2159 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002160 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002161 *out++ = (char) ch;
2162 }
2163 else {
2164 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002165 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002166 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002167 else { /* not in a shift sequence */
2168 if (ch == '+') {
2169 *out++ = '+';
2170 *out++ = '-';
2171 }
2172 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2173 *out++ = (char) ch;
2174 }
2175 else {
2176 *out++ = '+';
2177 inShift = 1;
2178 goto encode_char;
2179 }
2180 }
2181 continue;
2182encode_char:
2183#ifdef Py_UNICODE_WIDE
2184 if (ch >= 0x10000) {
2185 /* code first surrogate */
2186 base64bits += 16;
2187 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2188 while (base64bits >= 6) {
2189 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2190 base64bits -= 6;
2191 }
2192 /* prepare second surrogate */
2193 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2194 }
2195#endif
2196 base64bits += 16;
2197 base64buffer = (base64buffer << 16) | ch;
2198 while (base64bits >= 6) {
2199 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2200 base64bits -= 6;
2201 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002202 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002203 if (base64bits)
2204 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2205 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002206 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002207 if (_PyBytes_Resize(&v, out - start) < 0)
2208 return NULL;
2209 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002210}
2211
Antoine Pitrou244651a2009-05-04 18:56:13 +00002212#undef IS_BASE64
2213#undef FROM_BASE64
2214#undef TO_BASE64
2215#undef DECODE_DIRECT
2216#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002217
Guido van Rossumd57fd912000-03-10 22:53:23 +00002218/* --- UTF-8 Codec -------------------------------------------------------- */
2219
/* Lookup table indexed by the first byte of a UTF-8 sequence.  It is only
   ever read, so it is declared const. */
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
       illegal prefix.  see RFC 2279 for details */
    /* 0x00-0x7F: single-byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xBF: not a valid first byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xFD: four-, five- and six-byte sequences; 0xFE/0xFF illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
2241
Guido van Rossumd57fd912000-03-10 22:53:23 +00002242PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002243 Py_ssize_t size,
2244 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002245{
Walter Dörwald69652032004-09-07 20:24:22 +00002246 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2247}
2248
Antoine Pitrouab868312009-01-10 15:40:25 +00002249/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2250#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2251
2252/* Mask to quickly check whether a C 'long' contains a
2253 non-ASCII, UTF8-encoded char. */
2254#if (SIZEOF_LONG == 8)
2255# define ASCII_CHAR_MASK 0x8080808080808080L
2256#elif (SIZEOF_LONG == 4)
2257# define ASCII_CHAR_MASK 0x80808080L
2258#else
2259# error C 'long' size should be either 4 or 8!
2260#endif
2261
/* Decode size bytes of UTF-8 data starting at s into a new Unicode object.

   errors is handed to the decoding error handler machinery whenever a
   malformed sequence is found.

   consumed selects between one-shot and incremental behaviour:
   - NULL: an incomplete multi-byte sequence at the end of the input is
     reported through the error handler ("unexpected end of data").
   - non-NULL: decoding stops in front of such an incomplete trailing
     sequence and *consumed receives the number of input bytes processed.

   Returns the new object, or NULL if the allocation, the error handler
   call or the final resize fails. */
PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
                                       Py_ssize_t size,
                                       const char *errors,
                                       Py_ssize_t *consumed)
{
    const char *starts = s;
    int n;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e, *aligned_end;
    PyUnicodeObject *unicode;
    Py_UNICODE *p;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return (PyObject *)unicode;
    }

    /* Unpack UTF-8 encoded data */
    p = unicode->str;
    e = s + size;
    /* Last 'long'-aligned address not beyond e; bounds the word-wise
       fast path below. */
    aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);

    while (s < e) {
        Py_UCS4 ch = (unsigned char)*s;

        if (ch < 0x80) {
            /* Fast path for runs of ASCII characters. Given that common UTF-8
               input will consist of an overwhelming majority of ASCII
               characters, we try to optimize for this case by checking
               as many characters as a C 'long' can contain.
               First, check if we can do an aligned read, as most CPUs have
               a penalty for unaligned reads.
            */
            if (!((size_t) s & LONG_PTR_MASK)) {
                /* Help register allocation */
                register const char *_s = s;
                register Py_UNICODE *_p = p;
                while (_s < aligned_end) {
                    /* Read a whole long at a time (either 4 or 8 bytes),
                       and do a fast unrolled copy if it only contains ASCII
                       characters. */
                    unsigned long data = *(unsigned long *) _s;
                    if (data & ASCII_CHAR_MASK)
                        break;
                    _p[0] = (unsigned char) _s[0];
                    _p[1] = (unsigned char) _s[1];
                    _p[2] = (unsigned char) _s[2];
                    _p[3] = (unsigned char) _s[3];
#if (SIZEOF_LONG == 8)
                    _p[4] = (unsigned char) _s[4];
                    _p[5] = (unsigned char) _s[5];
                    _p[6] = (unsigned char) _s[6];
                    _p[7] = (unsigned char) _s[7];
#endif
                    _s += SIZEOF_LONG;
                    _p += SIZEOF_LONG;
                }
                s = _s;
                p = _p;
                if (s == e)
                    break;
                /* Re-read: s may have advanced past the byte examined
                   at the top of the loop. */
                ch = (unsigned char)*s;
            }
        }

        /* Slow path for a single ASCII byte (unaligned position, tail of
           the input, or the word contained a non-ASCII byte). */
        if (ch < 0x80) {
            *p++ = (Py_UNICODE)ch;
            s++;
            continue;
        }

        /* Sequence length announced by the lead byte; 0 means the byte
           cannot start a sequence. */
        n = utf8_code_length[ch];

        if (s + n > e) {
            /* The announced sequence runs past the end of the input. */
            if (consumed)
                break;
            else {
                errmsg = "unexpected end of data";
                startinpos = s-starts;
                endinpos = size;
                goto utf8Error;
            }
        }

        switch (n) {

        case 0:
            errmsg = "unexpected code byte";
            startinpos = s-starts;
            endinpos = startinpos+1;
            goto utf8Error;

        case 1:
            /* Bytes below 0x80 were handled above, and no table entry for
               a byte >= 0x80 is 1, so this is not expected to be reached. */
            errmsg = "internal error";
            startinpos = s-starts;
            endinpos = startinpos+1;
            goto utf8Error;

        case 2:
            /* The second byte must be a continuation byte (10xxxxxx). */
            if ((s[1] & 0xc0) != 0x80) {
                errmsg = "invalid data";
                startinpos = s-starts;
                endinpos = startinpos+2;
                goto utf8Error;
            }
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            /* Reject overlong forms: values below 0x80 fit in one byte. */
            if (ch < 0x80) {
                startinpos = s-starts;
                endinpos = startinpos+2;
                errmsg = "illegal encoding";
                goto utf8Error;
            }
            else
                *p++ = (Py_UNICODE)ch;
            break;

        case 3:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80) {
                errmsg = "invalid data";
                startinpos = s-starts;
                endinpos = startinpos+3;
                goto utf8Error;
            }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            /* Reject overlong forms (< 0x0800) and encoded surrogate
               code points (0xD800..0xDFFF). */
            if (ch < 0x0800 || (ch >= 0xd800 && ch <= 0xDFFF)) {
                errmsg = "illegal encoding";
                startinpos = s-starts;
                endinpos = startinpos+3;
                goto utf8Error;
            }
            else
                *p++ = (Py_UNICODE)ch;
            break;

        case 4:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80) {
                errmsg = "invalid data";
                startinpos = s-starts;
                endinpos = startinpos+4;
                goto utf8Error;
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            /* validate and convert to UTF-16 */
            if ((ch < 0x10000)        /* minimum value allowed for 4
                                         byte encoding */
                || (ch > 0x10ffff))   /* maximum value allowed for
                                         UTF-16 */
            {
                errmsg = "illegal encoding";
                startinpos = s-starts;
                endinpos = startinpos+4;
                goto utf8Error;
            }
#ifdef Py_UNICODE_WIDE
            *p++ = (Py_UNICODE)ch;
#else
            /*  compute and append the two surrogates: */

            /*  translate from 10000..10FFFF to 0..FFFF */
            ch -= 0x10000;

            /*  high surrogate = top 10 bits added to D800 */
            *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));

            /*  low surrogate = bottom 10 bits added to DC00 */
            *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
#endif
            break;

        default:
            /* Other sizes are only needed for UCS-4 */
            errmsg = "unsupported Unicode code range";
            startinpos = s-starts;
            endinpos = startinpos+n;
            goto utf8Error;
        }
        s += n;
        continue;

      utf8Error:
        /* All error cases above jump here with errmsg, startinpos and
           endinpos set.  The handler receives the addresses of the input
           and output cursors; aligned_end is recomputed afterwards
           because e is one of the values passed by address. */
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf8", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &unicode, &outpos, &p))
            goto onError;
        aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
    }
    if (consumed)
        *consumed = s-starts;

    /* Adjust length */
    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)unicode;

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_DECREF(unicode);
    return NULL;
}
2483
Antoine Pitrouab868312009-01-10 15:40:25 +00002484#undef ASCII_CHAR_MASK
2485
2486
/* Encode size Py_UNICODE characters starting at s as UTF-8 and return the
   result as a bytes object (NULL with an exception set on failure).

   errors is forwarded to the encoding error handler, which is invoked for
   surrogate code points that cannot be combined into a pair.

   Allocation strategy:  if the string is short, convert into a stack buffer
   and allocate exactly as much space needed at the end.  Else allocate the
   maximum possible needed (4 result bytes per Unicode character), and return
   the excess memory at the end.
*/
PyObject *
PyUnicode_EncodeUTF8(const Py_UNICODE *s,
                     Py_ssize_t size,
                     const char *errors)
{
#define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */

    Py_ssize_t i;                /* index into s of next input character */
    PyObject *result;            /* result string object */
    char *p;                     /* next free byte in output buffer */
    Py_ssize_t nallocated;      /* number of result bytes allocated */
    Py_ssize_t nneeded;            /* number of result bytes needed */
    char stackbuf[MAX_SHORT_UNICHARS * 4];
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    assert(s != NULL);
    assert(size >= 0);

    if (size <= MAX_SHORT_UNICHARS) {
        /* Write into the stack buffer; nallocated can't overflow.
         * At the end, we'll allocate exactly as much heap space as it
         * turns out we need.
         */
        nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
        result = NULL;   /* will allocate after we're done */
        p = stackbuf;
    }
    else {
        /* Overallocate on the heap, and give the excess back at the end. */
        nallocated = size * 4;
        if (nallocated / 4 != size)  /* overflow! */
            return PyErr_NoMemory();
        result = PyBytes_FromStringAndSize(NULL, nallocated);
        if (result == NULL)
            return NULL;
        p = PyBytes_AS_STRING(result);
    }

    for (i = 0; i < size;) {
        Py_UCS4 ch = s[i++];

        if (ch < 0x80)
            /* Encode ASCII */
            *p++ = (char) ch;

        else if (ch < 0x0800) {
            /* Two-byte sequence: U+0080..U+07FF */
            *p++ = (char)(0xc0 | (ch >> 6));
            *p++ = (char)(0x80 | (ch & 0x3f));
        }
        else {
            /* Encode UCS2 Unicode ordinals */
            if (ch < 0x10000) {
#ifndef Py_UNICODE_WIDE
                /* Special case: check for high surrogate */
                if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
                    Py_UCS4 ch2 = s[i];
                    /* Check for low surrogate and combine the two to
                       form a UCS4 value */
                    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
                        ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
                        i++;
                        goto encodeUCS4;
                    }
                    /* Fall through: handles isolated high surrogates */
                }
#endif
                /* Any surrogate still present here is unpaired (or, on
                   wide builds, stored as a single code point); it is
                   passed to the error handler instead of being encoded. */
                if (ch >= 0xd800 && ch <= 0xdfff) {
                    Py_ssize_t newpos;
                    PyObject *rep;
                    char *prep;
                    int k;
                    rep = unicode_encode_call_errorhandler
                        (errors, &errorHandler, "utf-8", "surrogates not allowed",
                         s, size, &exc, i-1, i, &newpos);
                    if (!rep)
                        goto error;
                    /* Implementation limitations: only error handlers that
                       return bytes are supported, and only up to four
                       replacement bytes.  Four bytes is what the buffer
                       reserves per input character, so the copy below
                       cannot overrun it.
                       NOTE(review): newpos is not consulted afterwards;
                       encoding always resumes at index i. */
                    if (!PyBytes_Check(rep)) {
                        PyErr_SetString(PyExc_TypeError, "error handler should have returned bytes");
                        Py_DECREF(rep);
                        goto error;
                    }
                    if (PyBytes_Size(rep) > 4) {
                        PyErr_SetString(PyExc_TypeError, "error handler returned too many bytes");
                        Py_DECREF(rep);
                        goto error;
                    }
                    prep = PyBytes_AsString(rep);
                    for(k = PyBytes_Size(rep); k > 0; k--)
                        *p++ = *prep++;
                    Py_DECREF(rep);
                    continue;

                }
                /* Three-byte sequence: U+0800..U+FFFF minus surrogates */
                *p++ = (char)(0xe0 | (ch >> 12));
                *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
                *p++ = (char)(0x80 | (ch & 0x3f));
                continue;
            }
#ifndef Py_UNICODE_WIDE
          encodeUCS4:
#endif
            /* Encode UCS4 Unicode ordinals */
            *p++ = (char)(0xf0 | (ch >> 18));
            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
            *p++ = (char)(0x80 | (ch & 0x3f));
        }
    }

    if (result == NULL) {
        /* This was stack allocated. */
        nneeded = p - stackbuf;
        assert(nneeded <= nallocated);
        result = PyBytes_FromStringAndSize(stackbuf, nneeded);
    }
    else {
        /* Cut back to size actually needed. */
        nneeded = p - PyBytes_AS_STRING(result);
        assert(nneeded <= nallocated);
        /* The return value is not checked here; result is returned
           as-is after the call. */
        _PyBytes_Resize(&result, nneeded);
    }
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return result;
 error:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    /* result is NULL when the stack buffer was in use */
    Py_XDECREF(result);
    return NULL;

#undef MAX_SHORT_UNICHARS
}
2628
Guido van Rossumd57fd912000-03-10 22:53:23 +00002629PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2630{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002631 if (!PyUnicode_Check(unicode)) {
2632 PyErr_BadArgument();
2633 return NULL;
2634 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002635 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002636 PyUnicode_GET_SIZE(unicode),
2637 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002638}
2639
Walter Dörwald41980ca2007-08-16 21:55:45 +00002640/* --- UTF-32 Codec ------------------------------------------------------- */
2641
2642PyObject *
2643PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002644 Py_ssize_t size,
2645 const char *errors,
2646 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002647{
2648 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2649}
2650
2651PyObject *
2652PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002653 Py_ssize_t size,
2654 const char *errors,
2655 int *byteorder,
2656 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002657{
2658 const char *starts = s;
2659 Py_ssize_t startinpos;
2660 Py_ssize_t endinpos;
2661 Py_ssize_t outpos;
2662 PyUnicodeObject *unicode;
2663 Py_UNICODE *p;
2664#ifndef Py_UNICODE_WIDE
2665 int i, pairs;
2666#else
2667 const int pairs = 0;
2668#endif
2669 const unsigned char *q, *e;
2670 int bo = 0; /* assume native ordering by default */
2671 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002672 /* Offsets from q for retrieving bytes in the right order. */
2673#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2674 int iorder[] = {0, 1, 2, 3};
2675#else
2676 int iorder[] = {3, 2, 1, 0};
2677#endif
2678 PyObject *errorHandler = NULL;
2679 PyObject *exc = NULL;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002680 /* On narrow builds we split characters outside the BMP into two
2681 codepoints => count how much extra space we need. */
2682#ifndef Py_UNICODE_WIDE
2683 for (i = pairs = 0; i < size/4; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002684 if (((Py_UCS4 *)s)[i] >= 0x10000)
2685 pairs++;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002686#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00002687
2688 /* This might be one to much, because of a BOM */
2689 unicode = _PyUnicode_New((size+3)/4+pairs);
2690 if (!unicode)
2691 return NULL;
2692 if (size == 0)
2693 return (PyObject *)unicode;
2694
2695 /* Unpack UTF-32 encoded data */
2696 p = unicode->str;
2697 q = (unsigned char *)s;
2698 e = q + size;
2699
2700 if (byteorder)
2701 bo = *byteorder;
2702
2703 /* Check for BOM marks (U+FEFF) in the input and adjust current
2704 byte order setting accordingly. In native mode, the leading BOM
2705 mark is skipped, in all other modes, it is copied to the output
2706 stream as-is (giving a ZWNBSP character). */
2707 if (bo == 0) {
2708 if (size >= 4) {
2709 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00002710 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002711#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002712 if (bom == 0x0000FEFF) {
2713 q += 4;
2714 bo = -1;
2715 }
2716 else if (bom == 0xFFFE0000) {
2717 q += 4;
2718 bo = 1;
2719 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002720#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002721 if (bom == 0x0000FEFF) {
2722 q += 4;
2723 bo = 1;
2724 }
2725 else if (bom == 0xFFFE0000) {
2726 q += 4;
2727 bo = -1;
2728 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002729#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002730 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002731 }
2732
2733 if (bo == -1) {
2734 /* force LE */
2735 iorder[0] = 0;
2736 iorder[1] = 1;
2737 iorder[2] = 2;
2738 iorder[3] = 3;
2739 }
2740 else if (bo == 1) {
2741 /* force BE */
2742 iorder[0] = 3;
2743 iorder[1] = 2;
2744 iorder[2] = 1;
2745 iorder[3] = 0;
2746 }
2747
2748 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002749 Py_UCS4 ch;
2750 /* remaining bytes at the end? (size should be divisible by 4) */
2751 if (e-q<4) {
2752 if (consumed)
2753 break;
2754 errmsg = "truncated data";
2755 startinpos = ((const char *)q)-starts;
2756 endinpos = ((const char *)e)-starts;
2757 goto utf32Error;
2758 /* The remaining input chars are ignored if the callback
2759 chooses to skip the input */
2760 }
2761 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2762 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002763
Benjamin Peterson29060642009-01-31 22:14:21 +00002764 if (ch >= 0x110000)
2765 {
2766 errmsg = "codepoint not in range(0x110000)";
2767 startinpos = ((const char *)q)-starts;
2768 endinpos = startinpos+4;
2769 goto utf32Error;
2770 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002771#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002772 if (ch >= 0x10000)
2773 {
2774 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2775 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2776 }
2777 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00002778#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002779 *p++ = ch;
2780 q += 4;
2781 continue;
2782 utf32Error:
2783 outpos = p-PyUnicode_AS_UNICODE(unicode);
2784 if (unicode_decode_call_errorhandler(
2785 errors, &errorHandler,
2786 "utf32", errmsg,
2787 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2788 &unicode, &outpos, &p))
2789 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002790 }
2791
2792 if (byteorder)
2793 *byteorder = bo;
2794
2795 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002796 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002797
2798 /* Adjust length */
2799 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2800 goto onError;
2801
2802 Py_XDECREF(errorHandler);
2803 Py_XDECREF(exc);
2804 return (PyObject *)unicode;
2805
Benjamin Peterson29060642009-01-31 22:14:21 +00002806 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00002807 Py_DECREF(unicode);
2808 Py_XDECREF(errorHandler);
2809 Py_XDECREF(exc);
2810 return NULL;
2811}
2812
2813PyObject *
2814PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002815 Py_ssize_t size,
2816 const char *errors,
2817 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002818{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002819 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002820 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002821 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002822#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002823 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002824#else
2825 const int pairs = 0;
2826#endif
2827 /* Offsets from p for storing byte pairs in the right order. */
2828#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2829 int iorder[] = {0, 1, 2, 3};
2830#else
2831 int iorder[] = {3, 2, 1, 0};
2832#endif
2833
Benjamin Peterson29060642009-01-31 22:14:21 +00002834#define STORECHAR(CH) \
2835 do { \
2836 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2837 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2838 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2839 p[iorder[0]] = (CH) & 0xff; \
2840 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00002841 } while(0)
2842
2843 /* In narrow builds we can output surrogate pairs as one codepoint,
2844 so we need less space. */
2845#ifndef Py_UNICODE_WIDE
2846 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002847 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2848 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2849 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002850#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002851 nsize = (size - pairs + (byteorder == 0));
2852 bytesize = nsize * 4;
2853 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00002854 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002855 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002856 if (v == NULL)
2857 return NULL;
2858
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002859 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002860 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002861 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002862 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002863 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002864
2865 if (byteorder == -1) {
2866 /* force LE */
2867 iorder[0] = 0;
2868 iorder[1] = 1;
2869 iorder[2] = 2;
2870 iorder[3] = 3;
2871 }
2872 else if (byteorder == 1) {
2873 /* force BE */
2874 iorder[0] = 3;
2875 iorder[1] = 2;
2876 iorder[2] = 1;
2877 iorder[3] = 0;
2878 }
2879
2880 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002881 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002882#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002883 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2884 Py_UCS4 ch2 = *s;
2885 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2886 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2887 s++;
2888 size--;
2889 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002890 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002891#endif
2892 STORECHAR(ch);
2893 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002894
2895 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002896 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002897#undef STORECHAR
2898}
2899
2900PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2901{
2902 if (!PyUnicode_Check(unicode)) {
2903 PyErr_BadArgument();
2904 return NULL;
2905 }
2906 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002907 PyUnicode_GET_SIZE(unicode),
2908 NULL,
2909 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002910}
2911
Guido van Rossumd57fd912000-03-10 22:53:23 +00002912/* --- UTF-16 Codec ------------------------------------------------------- */
2913
Tim Peters772747b2001-08-09 22:21:55 +00002914PyObject *
2915PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002916 Py_ssize_t size,
2917 const char *errors,
2918 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002919{
Walter Dörwald69652032004-09-07 20:24:22 +00002920 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2921}
2922
/* Two masks for fast checking of whether a C 'long' may contain
   UTF16-encoded surrogate characters. This is an efficient heuristic,
   assuming that non-surrogate characters with a code point >= 0x8000 are
   rare in most input.
   FAST_CHAR_MASK is used when the input is in native byte ordering,
   SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.

   FAST_CHAR_MASK has bit 15 of every 16-bit unit set;
   SWAPPED_FAST_CHAR_MASK has bit 7 of every 16-bit unit set, i.e. the
   same bit once the two bytes of each unit are exchanged.  A word that
   has none of the masked bits set holds only units below 0x8000 and so
   cannot contain a surrogate (0xD800..0xDFFF).
*/
#if (SIZEOF_LONG == 8)
# define FAST_CHAR_MASK         0x8000800080008000L
# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
#elif (SIZEOF_LONG == 4)
# define FAST_CHAR_MASK         0x80008000L
# define SWAPPED_FAST_CHAR_MASK 0x00800080L
#else
# error C 'long' size should be either 4 or 8!
#endif
2939
Walter Dörwald69652032004-09-07 20:24:22 +00002940PyObject *
2941PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002942 Py_ssize_t size,
2943 const char *errors,
2944 int *byteorder,
2945 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002946{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002947 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002948 Py_ssize_t startinpos;
2949 Py_ssize_t endinpos;
2950 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002951 PyUnicodeObject *unicode;
2952 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00002953 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00002954 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00002955 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002956 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002957 /* Offsets from q for retrieving byte pairs in the right order. */
2958#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2959 int ihi = 1, ilo = 0;
2960#else
2961 int ihi = 0, ilo = 1;
2962#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002963 PyObject *errorHandler = NULL;
2964 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002965
2966 /* Note: size will always be longer than the resulting Unicode
2967 character count */
2968 unicode = _PyUnicode_New(size);
2969 if (!unicode)
2970 return NULL;
2971 if (size == 0)
2972 return (PyObject *)unicode;
2973
2974 /* Unpack UTF-16 encoded data */
2975 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002976 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00002977 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002978
2979 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002980 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002981
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002982 /* Check for BOM marks (U+FEFF) in the input and adjust current
2983 byte order setting accordingly. In native mode, the leading BOM
2984 mark is skipped, in all other modes, it is copied to the output
2985 stream as-is (giving a ZWNBSP character). */
2986 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002987 if (size >= 2) {
2988 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002989#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002990 if (bom == 0xFEFF) {
2991 q += 2;
2992 bo = -1;
2993 }
2994 else if (bom == 0xFFFE) {
2995 q += 2;
2996 bo = 1;
2997 }
Tim Petersced69f82003-09-16 20:30:58 +00002998#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002999 if (bom == 0xFEFF) {
3000 q += 2;
3001 bo = 1;
3002 }
3003 else if (bom == 0xFFFE) {
3004 q += 2;
3005 bo = -1;
3006 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003007#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003008 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003009 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003010
Tim Peters772747b2001-08-09 22:21:55 +00003011 if (bo == -1) {
3012 /* force LE */
3013 ihi = 1;
3014 ilo = 0;
3015 }
3016 else if (bo == 1) {
3017 /* force BE */
3018 ihi = 0;
3019 ilo = 1;
3020 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003021#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3022 native_ordering = ilo < ihi;
3023#else
3024 native_ordering = ilo > ihi;
3025#endif
Tim Peters772747b2001-08-09 22:21:55 +00003026
Antoine Pitrouab868312009-01-10 15:40:25 +00003027 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00003028 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003029 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00003030 /* First check for possible aligned read of a C 'long'. Unaligned
3031 reads are more expensive, better to defer to another iteration. */
3032 if (!((size_t) q & LONG_PTR_MASK)) {
3033 /* Fast path for runs of non-surrogate chars. */
3034 register const unsigned char *_q = q;
3035 Py_UNICODE *_p = p;
3036 if (native_ordering) {
3037 /* Native ordering is simple: as long as the input cannot
3038 possibly contain a surrogate char, do an unrolled copy
3039 of several 16-bit code points to the target object.
3040 The non-surrogate check is done on several input bytes
3041 at a time (as many as a C 'long' can contain). */
3042 while (_q < aligned_end) {
3043 unsigned long data = * (unsigned long *) _q;
3044 if (data & FAST_CHAR_MASK)
3045 break;
3046 _p[0] = ((unsigned short *) _q)[0];
3047 _p[1] = ((unsigned short *) _q)[1];
3048#if (SIZEOF_LONG == 8)
3049 _p[2] = ((unsigned short *) _q)[2];
3050 _p[3] = ((unsigned short *) _q)[3];
3051#endif
3052 _q += SIZEOF_LONG;
3053 _p += SIZEOF_LONG / 2;
3054 }
3055 }
3056 else {
3057 /* Byteswapped ordering is similar, but we must decompose
3058 the copy bytewise, and take care of zero'ing out the
3059 upper bytes if the target object is in 32-bit units
3060 (that is, in UCS-4 builds). */
3061 while (_q < aligned_end) {
3062 unsigned long data = * (unsigned long *) _q;
3063 if (data & SWAPPED_FAST_CHAR_MASK)
3064 break;
3065 /* Zero upper bytes in UCS-4 builds */
3066#if (Py_UNICODE_SIZE > 2)
3067 _p[0] = 0;
3068 _p[1] = 0;
3069#if (SIZEOF_LONG == 8)
3070 _p[2] = 0;
3071 _p[3] = 0;
3072#endif
3073#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003074 /* Issue #4916; UCS-4 builds on big endian machines must
3075 fill the two last bytes of each 4-byte unit. */
3076#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3077# define OFF 2
3078#else
3079# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00003080#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003081 ((unsigned char *) _p)[OFF + 1] = _q[0];
3082 ((unsigned char *) _p)[OFF + 0] = _q[1];
3083 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3084 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3085#if (SIZEOF_LONG == 8)
3086 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3087 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3088 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3089 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3090#endif
3091#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003092 _q += SIZEOF_LONG;
3093 _p += SIZEOF_LONG / 2;
3094 }
3095 }
3096 p = _p;
3097 q = _q;
3098 if (q >= e)
3099 break;
3100 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003101 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003102
Benjamin Peterson14339b62009-01-31 16:36:08 +00003103 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003104
3105 if (ch < 0xD800 || ch > 0xDFFF) {
3106 *p++ = ch;
3107 continue;
3108 }
3109
3110 /* UTF-16 code pair: */
3111 if (q > e) {
3112 errmsg = "unexpected end of data";
3113 startinpos = (((const char *)q) - 2) - starts;
3114 endinpos = ((const char *)e) + 1 - starts;
3115 goto utf16Error;
3116 }
3117 if (0xD800 <= ch && ch <= 0xDBFF) {
3118 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3119 q += 2;
3120 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003121#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003122 *p++ = ch;
3123 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003124#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003125 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003126#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003127 continue;
3128 }
3129 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003130 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003131 startinpos = (((const char *)q)-4)-starts;
3132 endinpos = startinpos+2;
3133 goto utf16Error;
3134 }
3135
Benjamin Peterson14339b62009-01-31 16:36:08 +00003136 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003137 errmsg = "illegal encoding";
3138 startinpos = (((const char *)q)-2)-starts;
3139 endinpos = startinpos+2;
3140 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003141
Benjamin Peterson29060642009-01-31 22:14:21 +00003142 utf16Error:
3143 outpos = p - PyUnicode_AS_UNICODE(unicode);
3144 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003145 errors,
3146 &errorHandler,
3147 "utf16", errmsg,
3148 &starts,
3149 (const char **)&e,
3150 &startinpos,
3151 &endinpos,
3152 &exc,
3153 (const char **)&q,
3154 &unicode,
3155 &outpos,
3156 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003157 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003158 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003159 /* remaining byte at the end? (size should be even) */
3160 if (e == q) {
3161 if (!consumed) {
3162 errmsg = "truncated data";
3163 startinpos = ((const char *)q) - starts;
3164 endinpos = ((const char *)e) + 1 - starts;
3165 outpos = p - PyUnicode_AS_UNICODE(unicode);
3166 if (unicode_decode_call_errorhandler(
3167 errors,
3168 &errorHandler,
3169 "utf16", errmsg,
3170 &starts,
3171 (const char **)&e,
3172 &startinpos,
3173 &endinpos,
3174 &exc,
3175 (const char **)&q,
3176 &unicode,
3177 &outpos,
3178 &p))
3179 goto onError;
3180 /* The remaining input chars are ignored if the callback
3181 chooses to skip the input */
3182 }
3183 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003184
3185 if (byteorder)
3186 *byteorder = bo;
3187
Walter Dörwald69652032004-09-07 20:24:22 +00003188 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003189 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003190
Guido van Rossumd57fd912000-03-10 22:53:23 +00003191 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003192 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003193 goto onError;
3194
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003195 Py_XDECREF(errorHandler);
3196 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003197 return (PyObject *)unicode;
3198
Benjamin Peterson29060642009-01-31 22:14:21 +00003199 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003200 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003201 Py_XDECREF(errorHandler);
3202 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003203 return NULL;
3204}
3205
Antoine Pitrouab868312009-01-10 15:40:25 +00003206#undef FAST_CHAR_MASK
3207#undef SWAPPED_FAST_CHAR_MASK
3208
Tim Peters772747b2001-08-09 22:21:55 +00003209PyObject *
3210PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003211 Py_ssize_t size,
3212 const char *errors,
3213 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003214{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003215 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003216 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003217 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003218#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003219 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003220#else
3221 const int pairs = 0;
3222#endif
Tim Peters772747b2001-08-09 22:21:55 +00003223 /* Offsets from p for storing byte pairs in the right order. */
3224#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3225 int ihi = 1, ilo = 0;
3226#else
3227 int ihi = 0, ilo = 1;
3228#endif
3229
Benjamin Peterson29060642009-01-31 22:14:21 +00003230#define STORECHAR(CH) \
3231 do { \
3232 p[ihi] = ((CH) >> 8) & 0xff; \
3233 p[ilo] = (CH) & 0xff; \
3234 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003235 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003236
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003237#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003238 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003239 if (s[i] >= 0x10000)
3240 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003241#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003242 /* 2 * (size + pairs + (byteorder == 0)) */
3243 if (size > PY_SSIZE_T_MAX ||
3244 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003245 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003246 nsize = size + pairs + (byteorder == 0);
3247 bytesize = nsize * 2;
3248 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003249 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003250 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003251 if (v == NULL)
3252 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003253
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003254 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003255 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003256 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003257 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003258 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003259
3260 if (byteorder == -1) {
3261 /* force LE */
3262 ihi = 1;
3263 ilo = 0;
3264 }
3265 else if (byteorder == 1) {
3266 /* force BE */
3267 ihi = 0;
3268 ilo = 1;
3269 }
3270
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003271 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003272 Py_UNICODE ch = *s++;
3273 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003274#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003275 if (ch >= 0x10000) {
3276 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3277 ch = 0xD800 | ((ch-0x10000) >> 10);
3278 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003279#endif
Tim Peters772747b2001-08-09 22:21:55 +00003280 STORECHAR(ch);
3281 if (ch2)
3282 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003283 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003284
3285 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003286 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003287#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003288}
3289
3290PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3291{
3292 if (!PyUnicode_Check(unicode)) {
3293 PyErr_BadArgument();
3294 return NULL;
3295 }
3296 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003297 PyUnicode_GET_SIZE(unicode),
3298 NULL,
3299 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003300}
3301
3302/* --- Unicode Escape Codec ----------------------------------------------- */
3303
Fredrik Lundh06d12682001-01-24 07:59:11 +00003304static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003305
Guido van Rossumd57fd912000-03-10 22:53:23 +00003306PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003307 Py_ssize_t size,
3308 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003309{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003310 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003311 Py_ssize_t startinpos;
3312 Py_ssize_t endinpos;
3313 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003314 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003315 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003316 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003317 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003318 char* message;
3319 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003320 PyObject *errorHandler = NULL;
3321 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003322
Guido van Rossumd57fd912000-03-10 22:53:23 +00003323 /* Escaped strings will always be longer than the resulting
3324 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003325 length after conversion to the true value.
3326 (but if the error callback returns a long replacement string
3327 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003328 v = _PyUnicode_New(size);
3329 if (v == NULL)
3330 goto onError;
3331 if (size == 0)
3332 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003333
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003334 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003335 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003336
Guido van Rossumd57fd912000-03-10 22:53:23 +00003337 while (s < end) {
3338 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003339 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003340 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003341
3342 /* Non-escape characters are interpreted as Unicode ordinals */
3343 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003344 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003345 continue;
3346 }
3347
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003348 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003349 /* \ - Escapes */
3350 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003351 c = *s++;
3352 if (s > end)
3353 c = '\0'; /* Invalid after \ */
3354 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003355
Benjamin Peterson29060642009-01-31 22:14:21 +00003356 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003357 case '\n': break;
3358 case '\\': *p++ = '\\'; break;
3359 case '\'': *p++ = '\''; break;
3360 case '\"': *p++ = '\"'; break;
3361 case 'b': *p++ = '\b'; break;
3362 case 'f': *p++ = '\014'; break; /* FF */
3363 case 't': *p++ = '\t'; break;
3364 case 'n': *p++ = '\n'; break;
3365 case 'r': *p++ = '\r'; break;
3366 case 'v': *p++ = '\013'; break; /* VT */
3367 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3368
Benjamin Peterson29060642009-01-31 22:14:21 +00003369 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003370 case '0': case '1': case '2': case '3':
3371 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003372 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003373 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003374 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003375 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003376 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003377 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003378 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003379 break;
3380
Benjamin Peterson29060642009-01-31 22:14:21 +00003381 /* hex escapes */
3382 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003383 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003384 digits = 2;
3385 message = "truncated \\xXX escape";
3386 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003387
Benjamin Peterson29060642009-01-31 22:14:21 +00003388 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003389 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003390 digits = 4;
3391 message = "truncated \\uXXXX escape";
3392 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003393
Benjamin Peterson29060642009-01-31 22:14:21 +00003394 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003395 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003396 digits = 8;
3397 message = "truncated \\UXXXXXXXX escape";
3398 hexescape:
3399 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003400 outpos = p-PyUnicode_AS_UNICODE(v);
3401 if (s+digits>end) {
3402 endinpos = size;
3403 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003404 errors, &errorHandler,
3405 "unicodeescape", "end of string in escape sequence",
3406 &starts, &end, &startinpos, &endinpos, &exc, &s,
3407 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003408 goto onError;
3409 goto nextByte;
3410 }
3411 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003412 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003413 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003414 endinpos = (s+i+1)-starts;
3415 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003416 errors, &errorHandler,
3417 "unicodeescape", message,
3418 &starts, &end, &startinpos, &endinpos, &exc, &s,
3419 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003420 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003421 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003422 }
3423 chr = (chr<<4) & ~0xF;
3424 if (c >= '0' && c <= '9')
3425 chr += c - '0';
3426 else if (c >= 'a' && c <= 'f')
3427 chr += 10 + c - 'a';
3428 else
3429 chr += 10 + c - 'A';
3430 }
3431 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003432 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003433 /* _decoding_error will have already written into the
3434 target buffer. */
3435 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003436 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003437 /* when we get here, chr is a 32-bit unicode character */
3438 if (chr <= 0xffff)
3439 /* UCS-2 character */
3440 *p++ = (Py_UNICODE) chr;
3441 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003442 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003443 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003444#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003445 *p++ = chr;
3446#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003447 chr -= 0x10000L;
3448 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003449 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003450#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003451 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003452 endinpos = s-starts;
3453 outpos = p-PyUnicode_AS_UNICODE(v);
3454 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003455 errors, &errorHandler,
3456 "unicodeescape", "illegal Unicode character",
3457 &starts, &end, &startinpos, &endinpos, &exc, &s,
3458 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003459 goto onError;
3460 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003461 break;
3462
Benjamin Peterson29060642009-01-31 22:14:21 +00003463 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003464 case 'N':
3465 message = "malformed \\N character escape";
3466 if (ucnhash_CAPI == NULL) {
3467 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003468 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003469 if (ucnhash_CAPI == NULL)
3470 goto ucnhashError;
3471 }
3472 if (*s == '{') {
3473 const char *start = s+1;
3474 /* look for the closing brace */
3475 while (*s != '}' && s < end)
3476 s++;
3477 if (s > start && s < end && *s == '}') {
3478 /* found a name. look it up in the unicode database */
3479 message = "unknown Unicode character name";
3480 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003481 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003482 goto store;
3483 }
3484 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003485 endinpos = s-starts;
3486 outpos = p-PyUnicode_AS_UNICODE(v);
3487 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003488 errors, &errorHandler,
3489 "unicodeescape", message,
3490 &starts, &end, &startinpos, &endinpos, &exc, &s,
3491 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003492 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003493 break;
3494
3495 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003496 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003497 message = "\\ at end of string";
3498 s--;
3499 endinpos = s-starts;
3500 outpos = p-PyUnicode_AS_UNICODE(v);
3501 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003502 errors, &errorHandler,
3503 "unicodeescape", message,
3504 &starts, &end, &startinpos, &endinpos, &exc, &s,
3505 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003506 goto onError;
3507 }
3508 else {
3509 *p++ = '\\';
3510 *p++ = (unsigned char)s[-1];
3511 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003512 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003513 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003514 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003515 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003516 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003517 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003518 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003519 Py_XDECREF(errorHandler);
3520 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003521 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003522
Benjamin Peterson29060642009-01-31 22:14:21 +00003523 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003524 PyErr_SetString(
3525 PyExc_UnicodeError,
3526 "\\N escapes not supported (can't load unicodedata module)"
3527 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003528 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003529 Py_XDECREF(errorHandler);
3530 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003531 return NULL;
3532
Benjamin Peterson29060642009-01-31 22:14:21 +00003533 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003534 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003535 Py_XDECREF(errorHandler);
3536 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003537 return NULL;
3538}
3539
/* Return a Unicode-Escape string version of the Unicode object.

   The result is not wrapped in quotes: PyUnicode_EncodeUnicodeEscape()
   below takes no 'quotes' argument.

*/
3546
Thomas Wouters477c8d52006-05-27 19:21:47 +00003547Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003548 Py_ssize_t size,
3549 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00003550{
3551 /* like wcschr, but doesn't stop at NULL characters */
3552
3553 while (size-- > 0) {
3554 if (*s == ch)
3555 return s;
3556 s++;
3557 }
3558
3559 return NULL;
3560}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003561
Walter Dörwald79e913e2007-05-12 11:08:06 +00003562static const char *hexdigits = "0123456789abcdef";
3563
3564PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003565 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003566{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003567 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003568 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003569
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003570#ifdef Py_UNICODE_WIDE
3571 const Py_ssize_t expandsize = 10;
3572#else
3573 const Py_ssize_t expandsize = 6;
3574#endif
3575
Thomas Wouters89f507f2006-12-13 04:49:30 +00003576 /* XXX(nnorwitz): rather than over-allocating, it would be
3577 better to choose a different scheme. Perhaps scan the
3578 first N-chars of the string and allocate based on that size.
3579 */
3580 /* Initial allocation is based on the longest-possible unichr
3581 escape.
3582
3583 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3584 unichr, so in this case it's the longest unichr escape. In
3585 narrow (UTF-16) builds this is five chars per source unichr
3586 since there are two unichrs in the surrogate pair, so in narrow
3587 (UTF-16) builds it's not the longest unichr escape.
3588
3589 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3590 so in the narrow (UTF-16) build case it's the longest unichr
3591 escape.
3592 */
3593
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003594 if (size == 0)
3595 return PyBytes_FromStringAndSize(NULL, 0);
3596
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003597 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003598 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003599
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003600 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00003601 2
3602 + expandsize*size
3603 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003604 if (repr == NULL)
3605 return NULL;
3606
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003607 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003608
Guido van Rossumd57fd912000-03-10 22:53:23 +00003609 while (size-- > 0) {
3610 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003611
Walter Dörwald79e913e2007-05-12 11:08:06 +00003612 /* Escape backslashes */
3613 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003614 *p++ = '\\';
3615 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003616 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003617 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003618
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003619#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003620 /* Map 21-bit characters to '\U00xxxxxx' */
3621 else if (ch >= 0x10000) {
3622 *p++ = '\\';
3623 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003624 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3625 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3626 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3627 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3628 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3629 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3630 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3631 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00003632 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003633 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003634#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003635 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3636 else if (ch >= 0xD800 && ch < 0xDC00) {
3637 Py_UNICODE ch2;
3638 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003639
Benjamin Peterson29060642009-01-31 22:14:21 +00003640 ch2 = *s++;
3641 size--;
3642 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3643 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3644 *p++ = '\\';
3645 *p++ = 'U';
3646 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3647 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3648 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3649 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3650 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3651 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3652 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3653 *p++ = hexdigits[ucs & 0x0000000F];
3654 continue;
3655 }
3656 /* Fall through: isolated surrogates are copied as-is */
3657 s--;
3658 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003659 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003660#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003661
Guido van Rossumd57fd912000-03-10 22:53:23 +00003662 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003663 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003664 *p++ = '\\';
3665 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003666 *p++ = hexdigits[(ch >> 12) & 0x000F];
3667 *p++ = hexdigits[(ch >> 8) & 0x000F];
3668 *p++ = hexdigits[(ch >> 4) & 0x000F];
3669 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003670 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003671
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003672 /* Map special whitespace to '\t', \n', '\r' */
3673 else if (ch == '\t') {
3674 *p++ = '\\';
3675 *p++ = 't';
3676 }
3677 else if (ch == '\n') {
3678 *p++ = '\\';
3679 *p++ = 'n';
3680 }
3681 else if (ch == '\r') {
3682 *p++ = '\\';
3683 *p++ = 'r';
3684 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003685
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003686 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003687 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003688 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003689 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003690 *p++ = hexdigits[(ch >> 4) & 0x000F];
3691 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003692 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003693
Guido van Rossumd57fd912000-03-10 22:53:23 +00003694 /* Copy everything else as-is */
3695 else
3696 *p++ = (char) ch;
3697 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003698
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003699 assert(p - PyBytes_AS_STRING(repr) > 0);
3700 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
3701 return NULL;
3702 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003703}
3704
Alexandre Vassalotti2056bed2008-12-27 19:46:35 +00003705PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003706{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003707 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003708 if (!PyUnicode_Check(unicode)) {
3709 PyErr_BadArgument();
3710 return NULL;
3711 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003712 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3713 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003714 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003715}
3716
3717/* --- Raw Unicode Escape Codec ------------------------------------------- */
3718
3719PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003720 Py_ssize_t size,
3721 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003722{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003723 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003724 Py_ssize_t startinpos;
3725 Py_ssize_t endinpos;
3726 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003727 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003728 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003729 const char *end;
3730 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003731 PyObject *errorHandler = NULL;
3732 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003733
Guido van Rossumd57fd912000-03-10 22:53:23 +00003734 /* Escaped strings will always be longer than the resulting
3735 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003736 length after conversion to the true value. (But decoding error
3737 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003738 v = _PyUnicode_New(size);
3739 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003740 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003741 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003742 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003743 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003744 end = s + size;
3745 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003746 unsigned char c;
3747 Py_UCS4 x;
3748 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003749 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003750
Benjamin Peterson29060642009-01-31 22:14:21 +00003751 /* Non-escape characters are interpreted as Unicode ordinals */
3752 if (*s != '\\') {
3753 *p++ = (unsigned char)*s++;
3754 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003755 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003756 startinpos = s-starts;
3757
3758 /* \u-escapes are only interpreted iff the number of leading
3759 backslashes if odd */
3760 bs = s;
3761 for (;s < end;) {
3762 if (*s != '\\')
3763 break;
3764 *p++ = (unsigned char)*s++;
3765 }
3766 if (((s - bs) & 1) == 0 ||
3767 s >= end ||
3768 (*s != 'u' && *s != 'U')) {
3769 continue;
3770 }
3771 p--;
3772 count = *s=='u' ? 4 : 8;
3773 s++;
3774
3775 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3776 outpos = p-PyUnicode_AS_UNICODE(v);
3777 for (x = 0, i = 0; i < count; ++i, ++s) {
3778 c = (unsigned char)*s;
3779 if (!ISXDIGIT(c)) {
3780 endinpos = s-starts;
3781 if (unicode_decode_call_errorhandler(
3782 errors, &errorHandler,
3783 "rawunicodeescape", "truncated \\uXXXX",
3784 &starts, &end, &startinpos, &endinpos, &exc, &s,
3785 &v, &outpos, &p))
3786 goto onError;
3787 goto nextByte;
3788 }
3789 x = (x<<4) & ~0xF;
3790 if (c >= '0' && c <= '9')
3791 x += c - '0';
3792 else if (c >= 'a' && c <= 'f')
3793 x += 10 + c - 'a';
3794 else
3795 x += 10 + c - 'A';
3796 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00003797 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00003798 /* UCS-2 character */
3799 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003800 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003801 /* UCS-4 character. Either store directly, or as
3802 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00003803#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003804 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003805#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003806 x -= 0x10000L;
3807 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3808 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00003809#endif
3810 } else {
3811 endinpos = s-starts;
3812 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003813 if (unicode_decode_call_errorhandler(
3814 errors, &errorHandler,
3815 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00003816 &starts, &end, &startinpos, &endinpos, &exc, &s,
3817 &v, &outpos, &p))
3818 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003819 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003820 nextByte:
3821 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003822 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003823 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003824 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003825 Py_XDECREF(errorHandler);
3826 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003827 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003828
Benjamin Peterson29060642009-01-31 22:14:21 +00003829 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003830 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003831 Py_XDECREF(errorHandler);
3832 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003833 return NULL;
3834}
3835
3836PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003837 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003838{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003839 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003840 char *p;
3841 char *q;
3842
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003843#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003844 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003845#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003846 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003847#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003848
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003849 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003850 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00003851
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003852 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003853 if (repr == NULL)
3854 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003855 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003856 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003857
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003858 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003859 while (size-- > 0) {
3860 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003861#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003862 /* Map 32-bit characters to '\Uxxxxxxxx' */
3863 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003864 *p++ = '\\';
3865 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003866 *p++ = hexdigits[(ch >> 28) & 0xf];
3867 *p++ = hexdigits[(ch >> 24) & 0xf];
3868 *p++ = hexdigits[(ch >> 20) & 0xf];
3869 *p++ = hexdigits[(ch >> 16) & 0xf];
3870 *p++ = hexdigits[(ch >> 12) & 0xf];
3871 *p++ = hexdigits[(ch >> 8) & 0xf];
3872 *p++ = hexdigits[(ch >> 4) & 0xf];
3873 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003874 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003875 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00003876#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003877 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3878 if (ch >= 0xD800 && ch < 0xDC00) {
3879 Py_UNICODE ch2;
3880 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003881
Benjamin Peterson29060642009-01-31 22:14:21 +00003882 ch2 = *s++;
3883 size--;
3884 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3885 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3886 *p++ = '\\';
3887 *p++ = 'U';
3888 *p++ = hexdigits[(ucs >> 28) & 0xf];
3889 *p++ = hexdigits[(ucs >> 24) & 0xf];
3890 *p++ = hexdigits[(ucs >> 20) & 0xf];
3891 *p++ = hexdigits[(ucs >> 16) & 0xf];
3892 *p++ = hexdigits[(ucs >> 12) & 0xf];
3893 *p++ = hexdigits[(ucs >> 8) & 0xf];
3894 *p++ = hexdigits[(ucs >> 4) & 0xf];
3895 *p++ = hexdigits[ucs & 0xf];
3896 continue;
3897 }
3898 /* Fall through: isolated surrogates are copied as-is */
3899 s--;
3900 size++;
3901 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003902#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003903 /* Map 16-bit characters to '\uxxxx' */
3904 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003905 *p++ = '\\';
3906 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003907 *p++ = hexdigits[(ch >> 12) & 0xf];
3908 *p++ = hexdigits[(ch >> 8) & 0xf];
3909 *p++ = hexdigits[(ch >> 4) & 0xf];
3910 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003911 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003912 /* Copy everything else as-is */
3913 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003914 *p++ = (char) ch;
3915 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003916 size = p - q;
3917
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003918 assert(size > 0);
3919 if (_PyBytes_Resize(&repr, size) < 0)
3920 return NULL;
3921 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003922}
3923
3924PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3925{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003926 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003927 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003928 PyErr_BadArgument();
3929 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003930 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003931 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3932 PyUnicode_GET_SIZE(unicode));
3933
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003934 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003935}
3936
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003937/* --- Unicode Internal Codec ------------------------------------------- */
3938
3939PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003940 Py_ssize_t size,
3941 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003942{
3943 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003944 Py_ssize_t startinpos;
3945 Py_ssize_t endinpos;
3946 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003947 PyUnicodeObject *v;
3948 Py_UNICODE *p;
3949 const char *end;
3950 const char *reason;
3951 PyObject *errorHandler = NULL;
3952 PyObject *exc = NULL;
3953
Neal Norwitzd43069c2006-01-08 01:12:10 +00003954#ifdef Py_UNICODE_WIDE
3955 Py_UNICODE unimax = PyUnicode_GetMax();
3956#endif
3957
Thomas Wouters89f507f2006-12-13 04:49:30 +00003958 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003959 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3960 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003961 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003962 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003963 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003964 p = PyUnicode_AS_UNICODE(v);
3965 end = s + size;
3966
3967 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00003968 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003969 /* We have to sanity check the raw data, otherwise doom looms for
3970 some malformed UCS-4 data. */
3971 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00003972#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003973 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00003974#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003975 end-s < Py_UNICODE_SIZE
3976 )
Benjamin Peterson29060642009-01-31 22:14:21 +00003977 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003978 startinpos = s - starts;
3979 if (end-s < Py_UNICODE_SIZE) {
3980 endinpos = end-starts;
3981 reason = "truncated input";
3982 }
3983 else {
3984 endinpos = s - starts + Py_UNICODE_SIZE;
3985 reason = "illegal code point (> 0x10FFFF)";
3986 }
3987 outpos = p - PyUnicode_AS_UNICODE(v);
3988 if (unicode_decode_call_errorhandler(
3989 errors, &errorHandler,
3990 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003991 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00003992 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003993 goto onError;
3994 }
3995 }
3996 else {
3997 p++;
3998 s += Py_UNICODE_SIZE;
3999 }
4000 }
4001
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004002 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004003 goto onError;
4004 Py_XDECREF(errorHandler);
4005 Py_XDECREF(exc);
4006 return (PyObject *)v;
4007
Benjamin Peterson29060642009-01-31 22:14:21 +00004008 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004009 Py_XDECREF(v);
4010 Py_XDECREF(errorHandler);
4011 Py_XDECREF(exc);
4012 return NULL;
4013}
4014
Guido van Rossumd57fd912000-03-10 22:53:23 +00004015/* --- Latin-1 Codec ------------------------------------------------------ */
4016
4017PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004018 Py_ssize_t size,
4019 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004020{
4021 PyUnicodeObject *v;
4022 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004023 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00004024
Guido van Rossumd57fd912000-03-10 22:53:23 +00004025 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004026 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004027 Py_UNICODE r = *(unsigned char*)s;
4028 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004029 }
4030
Guido van Rossumd57fd912000-03-10 22:53:23 +00004031 v = _PyUnicode_New(size);
4032 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004033 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004034 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004035 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004036 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00004037 e = s + size;
4038 /* Unrolling the copy makes it much faster by reducing the looping
4039 overhead. This is similar to what many memcpy() implementations do. */
4040 unrolled_end = e - 4;
4041 while (s < unrolled_end) {
4042 p[0] = (unsigned char) s[0];
4043 p[1] = (unsigned char) s[1];
4044 p[2] = (unsigned char) s[2];
4045 p[3] = (unsigned char) s[3];
4046 s += 4;
4047 p += 4;
4048 }
4049 while (s < e)
4050 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004051 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004052
Benjamin Peterson29060642009-01-31 22:14:21 +00004053 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004054 Py_XDECREF(v);
4055 return NULL;
4056}
4057
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004058/* create or adjust a UnicodeEncodeError */
4059static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004060 const char *encoding,
4061 const Py_UNICODE *unicode, Py_ssize_t size,
4062 Py_ssize_t startpos, Py_ssize_t endpos,
4063 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004064{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004065 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004066 *exceptionObject = PyUnicodeEncodeError_Create(
4067 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004068 }
4069 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004070 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4071 goto onError;
4072 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4073 goto onError;
4074 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4075 goto onError;
4076 return;
4077 onError:
4078 Py_DECREF(*exceptionObject);
4079 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004080 }
4081}
4082
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004083/* raises a UnicodeEncodeError */
4084static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004085 const char *encoding,
4086 const Py_UNICODE *unicode, Py_ssize_t size,
4087 Py_ssize_t startpos, Py_ssize_t endpos,
4088 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004089{
4090 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004091 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004092 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004093 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004094}
4095
4096/* error handling callback helper:
4097 build arguments, call the callback and check the arguments,
4098 put the result into newpos and return the replacement string, which
4099 has to be freed by the caller */
4100static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00004101 PyObject **errorHandler,
4102 const char *encoding, const char *reason,
4103 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4104 Py_ssize_t startpos, Py_ssize_t endpos,
4105 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004106{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004107 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004108
4109 PyObject *restuple;
4110 PyObject *resunicode;
4111
4112 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004113 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004114 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004115 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004116 }
4117
4118 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004119 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004120 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004121 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004122
4123 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004124 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004125 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004126 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004127 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004128 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004129 Py_DECREF(restuple);
4130 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004131 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004132 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004133 &resunicode, newpos)) {
4134 Py_DECREF(restuple);
4135 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004136 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004137 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4138 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4139 Py_DECREF(restuple);
4140 return NULL;
4141 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004142 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004143 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004144 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004145 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4146 Py_DECREF(restuple);
4147 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004148 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004149 Py_INCREF(resunicode);
4150 Py_DECREF(restuple);
4151 return resunicode;
4152}
4153
/* Shared encoder behind the Latin-1 and ASCII codecs.

   Encodes the `size` Py_UNICODE characters starting at `p` into a new
   bytes object, one byte per character, for every character whose
   ordinal is below `limit`.  A limit of 256 selects the "latin-1"
   encoding name and reason text for error reports, any other value
   selects the "ascii" ones.

   Runs of characters with ordinal >= limit are handled according to
   `errors`:
     NULL / "strict"      raise_encode_exception() is called and NULL is
                          returned
     "replace"            one '?' is written per unencodable character
     "ignore"             the run is skipped
     "xmlcharrefreplace"  each character is written as "&#<decimal>;"
     anything else        the handler is invoked through
                          unicode_encode_call_errorhandler(); a bytes
                          result is copied as-is, a str result is copied
                          only if all of its characters are below `limit`
                          (otherwise raise_encode_exception() is called
                          and NULL is returned)

   Returns the bytes object, or NULL on failure. */
4154static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004155 Py_ssize_t size,
4156 const char *errors,
4157 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004158{
4159 /* output object */
4160 PyObject *res;
4161 /* pointers to the beginning and end+1 of input */
4162 const Py_UNICODE *startp = p;
4163 const Py_UNICODE *endp = p + size;
4164 /* pointer to the beginning of the unencodable characters */
4165 /* const Py_UNICODE *badp = NULL; */
4166 /* pointer into the output */
4167 char *str;
4168 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004169 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004170 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4171 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004172 PyObject *errorHandler = NULL;
4173 PyObject *exc = NULL;
4174 /* the following variable is used for caching string comparisons
4175 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4176 int known_errorHandler = -1;
4177
4178 /* allocate enough for a simple encoding without
4179 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004180 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004181 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004182 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004183 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004184 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004185 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004186 ressize = size;
4187
4188 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004189 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004190
Benjamin Peterson29060642009-01-31 22:14:21 +00004191 /* can we encode this? */
4192 if (c<limit) {
4193 /* no overflow check, because we know that the space is enough */
4194 *str++ = (char)c;
4195 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004196 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004197 else {
/* index of the first unencodable character of this run; used when a
   str replacement itself turns out to be unencodable */
4198 Py_ssize_t unicodepos = p-startp;
4199 Py_ssize_t requiredsize;
4200 PyObject *repunicode;
4201 Py_ssize_t repsize;
4202 Py_ssize_t newpos;
4203 Py_ssize_t respos;
4204 Py_UNICODE *uni2;
4205 /* startpos for collecting unencodable chars */
4206 const Py_UNICODE *collstart = p;
4207 const Py_UNICODE *collend = p;
4208 /* find all unencodable characters */
4209 while ((collend < endp) && ((*collend)>=limit))
4210 ++collend;
4211 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4212 if (known_errorHandler==-1) {
4213 if ((errors==NULL) || (!strcmp(errors, "strict")))
4214 known_errorHandler = 1;
4215 else if (!strcmp(errors, "replace"))
4216 known_errorHandler = 2;
4217 else if (!strcmp(errors, "ignore"))
4218 known_errorHandler = 3;
4219 else if (!strcmp(errors, "xmlcharrefreplace"))
4220 known_errorHandler = 4;
4221 else
4222 known_errorHandler = 0;
4223 }
4224 switch (known_errorHandler) {
4225 case 1: /* strict */
4226 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4227 goto onError;
4228 case 2: /* replace */
4229 while (collstart++<collend)
4230 *str++ = '?'; /* fall through */
4231 case 3: /* ignore */
4232 p = collend;
4233 break;
4234 case 4: /* xmlcharrefreplace */
4235 respos = str - PyBytes_AS_STRING(res);
4236 /* determine replacement size (temporarily (mis)uses p) */
/* each reference needs 2 bytes for "&#", the decimal digits, and 1 byte
   for ";" */
4237 for (p = collstart, repsize = 0; p < collend; ++p) {
4238 if (*p<10)
4239 repsize += 2+1+1;
4240 else if (*p<100)
4241 repsize += 2+2+1;
4242 else if (*p<1000)
4243 repsize += 2+3+1;
4244 else if (*p<10000)
4245 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004246#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004247 else
4248 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004249#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004250 else if (*p<100000)
4251 repsize += 2+5+1;
4252 else if (*p<1000000)
4253 repsize += 2+6+1;
4254 else
4255 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004256#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004257 }
/* bytes already written + the replacement + one byte for every input
   character after the run */
4258 requiredsize = respos+repsize+(endp-collend);
4259 if (requiredsize > ressize) {
4260 if (requiredsize<2*ressize)
4261 requiredsize = 2*ressize;
4262 if (_PyBytes_Resize(&res, requiredsize))
4263 goto onError;
/* the resize may have moved the buffer: recompute the write pointer */
4264 str = PyBytes_AS_STRING(res) + respos;
4265 ressize = requiredsize;
4266 }
4267 /* generate replacement (temporarily (mis)uses p) */
4268 for (p = collstart; p < collend; ++p) {
4269 str += sprintf(str, "&#%d;", (int)*p);
4270 }
4271 p = collend;
4272 break;
4273 default:
4274 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4275 encoding, reason, startp, size, &exc,
4276 collstart-startp, collend-startp, &newpos);
4277 if (repunicode == NULL)
4278 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004279 if (PyBytes_Check(repunicode)) {
4280 /* Directly copy bytes result to output. */
4281 repsize = PyBytes_Size(repunicode);
/* the buffer is grown by repsize-1 bytes, i.e. the replacement is
   budgeted against one byte of the existing allocation */
4282 if (repsize > 1) {
4283 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004284 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004285 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4286 Py_DECREF(repunicode);
4287 goto onError;
4288 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004289 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004290 ressize += repsize-1;
4291 }
4292 memcpy(str, PyBytes_AsString(repunicode), repsize);
4293 str += repsize;
/* continue at the position requested by the error handler */
4294 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004295 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004296 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004297 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004298 /* need more space? (at least enough for what we
4299 have+the replacement+the rest of the string, so
4300 we won't have to check space for encodable characters) */
4301 respos = str - PyBytes_AS_STRING(res);
4302 repsize = PyUnicode_GET_SIZE(repunicode);
4303 requiredsize = respos+repsize+(endp-collend);
4304 if (requiredsize > ressize) {
4305 if (requiredsize<2*ressize)
4306 requiredsize = 2*ressize;
4307 if (_PyBytes_Resize(&res, requiredsize)) {
4308 Py_DECREF(repunicode);
4309 goto onError;
4310 }
4311 str = PyBytes_AS_STRING(res) + respos;
4312 ressize = requiredsize;
4313 }
4314 /* check if there is anything unencodable in the replacement
4315 and copy it to the output */
4316 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4317 c = *uni2;
4318 if (c >= limit) {
/* the error is reported for the single character at unicodepos in the
   original input, not for a position in the replacement */
4319 raise_encode_exception(&exc, encoding, startp, size,
4320 unicodepos, unicodepos+1, reason);
4321 Py_DECREF(repunicode);
4322 goto onError;
4323 }
4324 *str = (char)c;
4325 }
/* continue at the position requested by the error handler */
4326 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004327 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004328 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004329 }
4330 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004331 /* Resize if we allocated too much */
4332 size = str - PyBytes_AS_STRING(res);
4333 if (size < ressize) { /* If this fails res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004334 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004335 if (_PyBytes_Resize(&res, size) < 0)
4336 goto onError;
4337 }
4338
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004339 Py_XDECREF(errorHandler);
4340 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004341 return res;
4342
4343 onError:
4344 Py_XDECREF(res);
4345 Py_XDECREF(errorHandler);
4346 Py_XDECREF(exc);
4347 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004348}
4349
Guido van Rossumd57fd912000-03-10 22:53:23 +00004350PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004351 Py_ssize_t size,
4352 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004353{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004354 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004355}
4356
4357PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4358{
4359 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004360 PyErr_BadArgument();
4361 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004362 }
4363 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004364 PyUnicode_GET_SIZE(unicode),
4365 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004366}
4367
4368/* --- 7-bit ASCII Codec -------------------------------------------------- */
4369
Guido van Rossumd57fd912000-03-10 22:53:23 +00004370PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004371 Py_ssize_t size,
4372 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004373{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004374 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004375 PyUnicodeObject *v;
4376 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004377 Py_ssize_t startinpos;
4378 Py_ssize_t endinpos;
4379 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004380 const char *e;
4381 PyObject *errorHandler = NULL;
4382 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004383
Guido van Rossumd57fd912000-03-10 22:53:23 +00004384 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004385 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004386 Py_UNICODE r = *(unsigned char*)s;
4387 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004388 }
Tim Petersced69f82003-09-16 20:30:58 +00004389
Guido van Rossumd57fd912000-03-10 22:53:23 +00004390 v = _PyUnicode_New(size);
4391 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004392 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004393 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004394 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004395 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004396 e = s + size;
4397 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004398 register unsigned char c = (unsigned char)*s;
4399 if (c < 128) {
4400 *p++ = c;
4401 ++s;
4402 }
4403 else {
4404 startinpos = s-starts;
4405 endinpos = startinpos + 1;
4406 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4407 if (unicode_decode_call_errorhandler(
4408 errors, &errorHandler,
4409 "ascii", "ordinal not in range(128)",
4410 &starts, &e, &startinpos, &endinpos, &exc, &s,
4411 &v, &outpos, &p))
4412 goto onError;
4413 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004414 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004415 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004416 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4417 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004418 Py_XDECREF(errorHandler);
4419 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004420 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004421
Benjamin Peterson29060642009-01-31 22:14:21 +00004422 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004423 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004424 Py_XDECREF(errorHandler);
4425 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004426 return NULL;
4427}
4428
Guido van Rossumd57fd912000-03-10 22:53:23 +00004429PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004430 Py_ssize_t size,
4431 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004432{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004433 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004434}
4435
4436PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4437{
4438 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004439 PyErr_BadArgument();
4440 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004441 }
4442 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004443 PyUnicode_GET_SIZE(unicode),
4444 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004445}
4446
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004447#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004448
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004449/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004450
/* The MBCS helpers below take their lengths as int, while the public
   entry points take Py_ssize_t.  When size_t is wider than int the
   entry points feed the helpers INT_MAX-sized pieces in a retry loop. */
#if SIZEOF_SIZE_T > SIZEOF_INT
#define NEED_RETRY
#endif
4454
4455/* XXX This code is limited to "true" double-byte encodings, as
4456 a) it assumes an incomplete character consists of a single byte, and
4457 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004458 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004459
/* Return non-zero when the byte at s[offset] should be treated as a DBCS
 * lead byte, 0 otherwise.
 *
 * A byte that IsDBCSLeadByte() rejects is never a lead byte.  Otherwise
 * the character before it is located with CharPrev(), and the byte
 * counts as a lead byte when any of the following holds:
 *   - CharPrev() returned the same position,
 *   - the previous character does not start with a lead byte,
 *   - the previous character starts exactly two bytes earlier.
 */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return (curr - prev == 2);
}
4470
4471/*
4472 * Decode MBCS string into unicode object. If 'final' is set, converts
4473 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4474 */
4475static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson29060642009-01-31 22:14:21 +00004476 const char *s, /* MBCS string */
4477 int size, /* sizeof MBCS string */
4478 int final)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004479{
4480 Py_UNICODE *p;
4481 Py_ssize_t n = 0;
4482 int usize = 0;
4483
4484 assert(size >= 0);
4485
4486 /* Skip trailing lead-byte unless 'final' is set */
4487 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004488 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004489
4490 /* First get the size of the result */
4491 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004492 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
4493 if (usize == 0) {
4494 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4495 return -1;
4496 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004497 }
4498
4499 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004500 /* Create unicode object */
4501 *v = _PyUnicode_New(usize);
4502 if (*v == NULL)
4503 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004504 }
4505 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004506 /* Extend unicode object */
4507 n = PyUnicode_GET_SIZE(*v);
4508 if (_PyUnicode_Resize(v, n + usize) < 0)
4509 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004510 }
4511
4512 /* Do the conversion */
4513 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004514 p = PyUnicode_AS_UNICODE(*v) + n;
4515 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
4516 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4517 return -1;
4518 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004519 }
4520
4521 return size;
4522}
4523
4524PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004525 Py_ssize_t size,
4526 const char *errors,
4527 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004528{
4529 PyUnicodeObject *v = NULL;
4530 int done;
4531
4532 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004533 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004534
4535#ifdef NEED_RETRY
4536 retry:
4537 if (size > INT_MAX)
Benjamin Peterson29060642009-01-31 22:14:21 +00004538 done = decode_mbcs(&v, s, INT_MAX, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004539 else
4540#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004541 done = decode_mbcs(&v, s, (int)size, !consumed);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004542
4543 if (done < 0) {
4544 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00004545 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004546 }
4547
4548 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004549 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004550
4551#ifdef NEED_RETRY
4552 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004553 s += done;
4554 size -= done;
4555 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004556 }
4557#endif
4558
4559 return (PyObject *)v;
4560}
4561
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004562PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004563 Py_ssize_t size,
4564 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004565{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004566 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4567}
4568
4569/*
4570 * Convert unicode into string object (MBCS).
4571 * Returns 0 if succeed, -1 otherwise.
4572 */
4573static int encode_mbcs(PyObject **repr,
Benjamin Peterson29060642009-01-31 22:14:21 +00004574 const Py_UNICODE *p, /* unicode */
4575 int size) /* size of unicode */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004576{
4577 int mbcssize = 0;
4578 Py_ssize_t n = 0;
4579
4580 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004581
4582 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004583 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004584 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4585 if (mbcssize == 0) {
4586 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4587 return -1;
4588 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004589 }
4590
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004591 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004592 /* Create string object */
4593 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
4594 if (*repr == NULL)
4595 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004596 }
4597 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004598 /* Extend string object */
4599 n = PyBytes_Size(*repr);
4600 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
4601 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004602 }
4603
4604 /* Do the conversion */
4605 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004606 char *s = PyBytes_AS_STRING(*repr) + n;
4607 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4608 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4609 return -1;
4610 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004611 }
4612
4613 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004614}
4615
4616PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004617 Py_ssize_t size,
4618 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004619{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004620 PyObject *repr = NULL;
4621 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004622
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004623#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00004624 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004625 if (size > INT_MAX)
Benjamin Peterson29060642009-01-31 22:14:21 +00004626 ret = encode_mbcs(&repr, p, INT_MAX);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004627 else
4628#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004629 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004630
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004631 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004632 Py_XDECREF(repr);
4633 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004634 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004635
4636#ifdef NEED_RETRY
4637 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004638 p += INT_MAX;
4639 size -= INT_MAX;
4640 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004641 }
4642#endif
4643
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004644 return repr;
4645}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004646
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004647PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4648{
4649 if (!PyUnicode_Check(unicode)) {
4650 PyErr_BadArgument();
4651 return NULL;
4652 }
4653 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004654 PyUnicode_GET_SIZE(unicode),
4655 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004656}
4657
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004658#undef NEED_RETRY
4659
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004660#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004661
Guido van Rossumd57fd912000-03-10 22:53:23 +00004662/* --- Character Mapping Codec -------------------------------------------- */
4663
Guido van Rossumd57fd912000-03-10 22:53:23 +00004664PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004665 Py_ssize_t size,
4666 PyObject *mapping,
4667 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004668{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004669 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004670 Py_ssize_t startinpos;
4671 Py_ssize_t endinpos;
4672 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004673 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004674 PyUnicodeObject *v;
4675 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004676 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004677 PyObject *errorHandler = NULL;
4678 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004679 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004680 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004681
Guido van Rossumd57fd912000-03-10 22:53:23 +00004682 /* Default to Latin-1 */
4683 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004684 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004685
4686 v = _PyUnicode_New(size);
4687 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004688 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004689 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004690 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004691 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004692 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004693 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004694 mapstring = PyUnicode_AS_UNICODE(mapping);
4695 maplen = PyUnicode_GET_SIZE(mapping);
4696 while (s < e) {
4697 unsigned char ch = *s;
4698 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004699
Benjamin Peterson29060642009-01-31 22:14:21 +00004700 if (ch < maplen)
4701 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004702
Benjamin Peterson29060642009-01-31 22:14:21 +00004703 if (x == 0xfffe) {
4704 /* undefined mapping */
4705 outpos = p-PyUnicode_AS_UNICODE(v);
4706 startinpos = s-starts;
4707 endinpos = startinpos+1;
4708 if (unicode_decode_call_errorhandler(
4709 errors, &errorHandler,
4710 "charmap", "character maps to <undefined>",
4711 &starts, &e, &startinpos, &endinpos, &exc, &s,
4712 &v, &outpos, &p)) {
4713 goto onError;
4714 }
4715 continue;
4716 }
4717 *p++ = x;
4718 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004719 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004720 }
4721 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004722 while (s < e) {
4723 unsigned char ch = *s;
4724 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004725
Benjamin Peterson29060642009-01-31 22:14:21 +00004726 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4727 w = PyLong_FromLong((long)ch);
4728 if (w == NULL)
4729 goto onError;
4730 x = PyObject_GetItem(mapping, w);
4731 Py_DECREF(w);
4732 if (x == NULL) {
4733 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4734 /* No mapping found means: mapping is undefined. */
4735 PyErr_Clear();
4736 x = Py_None;
4737 Py_INCREF(x);
4738 } else
4739 goto onError;
4740 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004741
Benjamin Peterson29060642009-01-31 22:14:21 +00004742 /* Apply mapping */
4743 if (PyLong_Check(x)) {
4744 long value = PyLong_AS_LONG(x);
4745 if (value < 0 || value > 65535) {
4746 PyErr_SetString(PyExc_TypeError,
4747 "character mapping must be in range(65536)");
4748 Py_DECREF(x);
4749 goto onError;
4750 }
4751 *p++ = (Py_UNICODE)value;
4752 }
4753 else if (x == Py_None) {
4754 /* undefined mapping */
4755 outpos = p-PyUnicode_AS_UNICODE(v);
4756 startinpos = s-starts;
4757 endinpos = startinpos+1;
4758 if (unicode_decode_call_errorhandler(
4759 errors, &errorHandler,
4760 "charmap", "character maps to <undefined>",
4761 &starts, &e, &startinpos, &endinpos, &exc, &s,
4762 &v, &outpos, &p)) {
4763 Py_DECREF(x);
4764 goto onError;
4765 }
4766 Py_DECREF(x);
4767 continue;
4768 }
4769 else if (PyUnicode_Check(x)) {
4770 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004771
Benjamin Peterson29060642009-01-31 22:14:21 +00004772 if (targetsize == 1)
4773 /* 1-1 mapping */
4774 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004775
Benjamin Peterson29060642009-01-31 22:14:21 +00004776 else if (targetsize > 1) {
4777 /* 1-n mapping */
4778 if (targetsize > extrachars) {
4779 /* resize first */
4780 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4781 Py_ssize_t needed = (targetsize - extrachars) + \
4782 (targetsize << 2);
4783 extrachars += needed;
4784 /* XXX overflow detection missing */
4785 if (_PyUnicode_Resize(&v,
4786 PyUnicode_GET_SIZE(v) + needed) < 0) {
4787 Py_DECREF(x);
4788 goto onError;
4789 }
4790 p = PyUnicode_AS_UNICODE(v) + oldpos;
4791 }
4792 Py_UNICODE_COPY(p,
4793 PyUnicode_AS_UNICODE(x),
4794 targetsize);
4795 p += targetsize;
4796 extrachars -= targetsize;
4797 }
4798 /* 1-0 mapping: skip the character */
4799 }
4800 else {
4801 /* wrong return value */
4802 PyErr_SetString(PyExc_TypeError,
4803 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00004804 Py_DECREF(x);
4805 goto onError;
4806 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004807 Py_DECREF(x);
4808 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004809 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004810 }
4811 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004812 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4813 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004814 Py_XDECREF(errorHandler);
4815 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004816 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004817
Benjamin Peterson29060642009-01-31 22:14:21 +00004818 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004819 Py_XDECREF(errorHandler);
4820 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004821 Py_XDECREF(v);
4822 return NULL;
4823}
4824
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004825/* Charmap encoding: the lookup table */
4826
4827struct encoding_map{
Benjamin Peterson29060642009-01-31 22:14:21 +00004828 PyObject_HEAD
4829 unsigned char level1[32];
4830 int count2, count3;
4831 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004832};
4833
4834static PyObject*
4835encoding_map_size(PyObject *obj, PyObject* args)
4836{
4837 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004838 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00004839 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004840}
4841
4842static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00004843 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00004844 PyDoc_STR("Return the size (in bytes) of this object") },
4845 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004846};
4847
4848static void
4849encoding_map_dealloc(PyObject* o)
4850{
Benjamin Peterson14339b62009-01-31 16:36:08 +00004851 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004852}
4853
4854static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00004855 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004856 "EncodingMap", /*tp_name*/
4857 sizeof(struct encoding_map), /*tp_basicsize*/
4858 0, /*tp_itemsize*/
4859 /* methods */
4860 encoding_map_dealloc, /*tp_dealloc*/
4861 0, /*tp_print*/
4862 0, /*tp_getattr*/
4863 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00004864 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00004865 0, /*tp_repr*/
4866 0, /*tp_as_number*/
4867 0, /*tp_as_sequence*/
4868 0, /*tp_as_mapping*/
4869 0, /*tp_hash*/
4870 0, /*tp_call*/
4871 0, /*tp_str*/
4872 0, /*tp_getattro*/
4873 0, /*tp_setattro*/
4874 0, /*tp_as_buffer*/
4875 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4876 0, /*tp_doc*/
4877 0, /*tp_traverse*/
4878 0, /*tp_clear*/
4879 0, /*tp_richcompare*/
4880 0, /*tp_weaklistoffset*/
4881 0, /*tp_iter*/
4882 0, /*tp_iternext*/
4883 encoding_map_methods, /*tp_methods*/
4884 0, /*tp_members*/
4885 0, /*tp_getset*/
4886 0, /*tp_base*/
4887 0, /*tp_dict*/
4888 0, /*tp_descr_get*/
4889 0, /*tp_descr_set*/
4890 0, /*tp_dictoffset*/
4891 0, /*tp_init*/
4892 0, /*tp_alloc*/
4893 0, /*tp_new*/
4894 0, /*tp_free*/
4895 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004896};
4897
4898PyObject*
4899PyUnicode_BuildEncodingMap(PyObject* string)
4900{
4901 Py_UNICODE *decode;
4902 PyObject *result;
4903 struct encoding_map *mresult;
4904 int i;
4905 int need_dict = 0;
4906 unsigned char level1[32];
4907 unsigned char level2[512];
4908 unsigned char *mlevel1, *mlevel2, *mlevel3;
4909 int count2 = 0, count3 = 0;
4910
4911 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4912 PyErr_BadArgument();
4913 return NULL;
4914 }
4915 decode = PyUnicode_AS_UNICODE(string);
4916 memset(level1, 0xFF, sizeof level1);
4917 memset(level2, 0xFF, sizeof level2);
4918
4919 /* If there isn't a one-to-one mapping of NULL to \0,
4920 or if there are non-BMP characters, we need to use
4921 a mapping dictionary. */
4922 if (decode[0] != 0)
4923 need_dict = 1;
4924 for (i = 1; i < 256; i++) {
4925 int l1, l2;
4926 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00004927#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004928 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00004929#endif
4930 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004931 need_dict = 1;
4932 break;
4933 }
4934 if (decode[i] == 0xFFFE)
4935 /* unmapped character */
4936 continue;
4937 l1 = decode[i] >> 11;
4938 l2 = decode[i] >> 7;
4939 if (level1[l1] == 0xFF)
4940 level1[l1] = count2++;
4941 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00004942 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004943 }
4944
4945 if (count2 >= 0xFF || count3 >= 0xFF)
4946 need_dict = 1;
4947
4948 if (need_dict) {
4949 PyObject *result = PyDict_New();
4950 PyObject *key, *value;
4951 if (!result)
4952 return NULL;
4953 for (i = 0; i < 256; i++) {
4954 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00004955 key = PyLong_FromLong(decode[i]);
4956 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004957 if (!key || !value)
4958 goto failed1;
4959 if (PyDict_SetItem(result, key, value) == -1)
4960 goto failed1;
4961 Py_DECREF(key);
4962 Py_DECREF(value);
4963 }
4964 return result;
4965 failed1:
4966 Py_XDECREF(key);
4967 Py_XDECREF(value);
4968 Py_DECREF(result);
4969 return NULL;
4970 }
4971
4972 /* Create a three-level trie */
4973 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4974 16*count2 + 128*count3 - 1);
4975 if (!result)
4976 return PyErr_NoMemory();
4977 PyObject_Init(result, &EncodingMapType);
4978 mresult = (struct encoding_map*)result;
4979 mresult->count2 = count2;
4980 mresult->count3 = count3;
4981 mlevel1 = mresult->level1;
4982 mlevel2 = mresult->level23;
4983 mlevel3 = mresult->level23 + 16*count2;
4984 memcpy(mlevel1, level1, 32);
4985 memset(mlevel2, 0xFF, 16*count2);
4986 memset(mlevel3, 0, 128*count3);
4987 count3 = 0;
4988 for (i = 1; i < 256; i++) {
4989 int o1, o2, o3, i2, i3;
4990 if (decode[i] == 0xFFFE)
4991 /* unmapped character */
4992 continue;
4993 o1 = decode[i]>>11;
4994 o2 = (decode[i]>>7) & 0xF;
4995 i2 = 16*mlevel1[o1] + o2;
4996 if (mlevel2[i2] == 0xFF)
4997 mlevel2[i2] = count3++;
4998 o3 = decode[i] & 0x7F;
4999 i3 = 128*mlevel2[i2] + o3;
5000 mlevel3[i3] = i;
5001 }
5002 return result;
5003}
5004
5005static int
5006encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
5007{
5008 struct encoding_map *map = (struct encoding_map*)mapping;
5009 int l1 = c>>11;
5010 int l2 = (c>>7) & 0xF;
5011 int l3 = c & 0x7F;
5012 int i;
5013
5014#ifdef Py_UNICODE_WIDE
5015 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005016 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005017 }
5018#endif
5019 if (c == 0)
5020 return 0;
5021 /* level 1*/
5022 i = map->level1[l1];
5023 if (i == 0xFF) {
5024 return -1;
5025 }
5026 /* level 2*/
5027 i = map->level23[16*i+l2];
5028 if (i == 0xFF) {
5029 return -1;
5030 }
5031 /* level 3 */
5032 i = map->level23[16*map->count2 + 128*i + l3];
5033 if (i == 0) {
5034 return -1;
5035 }
5036 return i;
5037}
5038
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005039/* Lookup the character ch in the mapping. If the character
5040 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00005041 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005042static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005043{
Christian Heimes217cfd12007-12-02 14:31:20 +00005044 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005045 PyObject *x;
5046
5047 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005048 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005049 x = PyObject_GetItem(mapping, w);
5050 Py_DECREF(w);
5051 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005052 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5053 /* No mapping found means: mapping is undefined. */
5054 PyErr_Clear();
5055 x = Py_None;
5056 Py_INCREF(x);
5057 return x;
5058 } else
5059 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005060 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00005061 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005062 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00005063 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005064 long value = PyLong_AS_LONG(x);
5065 if (value < 0 || value > 255) {
5066 PyErr_SetString(PyExc_TypeError,
5067 "character mapping must be in range(256)");
5068 Py_DECREF(x);
5069 return NULL;
5070 }
5071 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005072 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005073 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00005074 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005075 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005076 /* wrong return value */
5077 PyErr_Format(PyExc_TypeError,
5078 "character mapping must return integer, bytes or None, not %.400s",
5079 x->ob_type->tp_name);
5080 Py_DECREF(x);
5081 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005082 }
5083}
5084
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005085static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005086charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005087{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005088 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5089 /* exponentially overallocate to minimize reallocations */
5090 if (requiredsize < 2*outsize)
5091 requiredsize = 2*outsize;
5092 if (_PyBytes_Resize(outobj, requiredsize))
5093 return -1;
5094 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005095}
5096
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS = 0,    /* the character was encoded and written */
    enc_FAILED = 1,     /* the mapping has no entry for the character */
    enc_EXCEPTION = 2   /* a lookup or resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005100/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005101 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005102 space is available. Return a new reference to the object that
5103 was put in the output buffer, or Py_None, if the mapping was undefined
5104 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005105 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005106static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005107charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00005108 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005109{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005110 PyObject *rep;
5111 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005112 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005113
Christian Heimes90aa7642007-12-19 02:45:37 +00005114 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005115 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005116 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005117 if (res == -1)
5118 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005119 if (outsize<requiredsize)
5120 if (charmapencode_resize(outobj, outpos, requiredsize))
5121 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005122 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005123 outstart[(*outpos)++] = (char)res;
5124 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005125 }
5126
5127 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005128 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005129 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005130 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005131 Py_DECREF(rep);
5132 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005133 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005134 if (PyLong_Check(rep)) {
5135 Py_ssize_t requiredsize = *outpos+1;
5136 if (outsize<requiredsize)
5137 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5138 Py_DECREF(rep);
5139 return enc_EXCEPTION;
5140 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005141 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005142 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005143 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005144 else {
5145 const char *repchars = PyBytes_AS_STRING(rep);
5146 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5147 Py_ssize_t requiredsize = *outpos+repsize;
5148 if (outsize<requiredsize)
5149 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5150 Py_DECREF(rep);
5151 return enc_EXCEPTION;
5152 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005153 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005154 memcpy(outstart + *outpos, repchars, repsize);
5155 *outpos += repsize;
5156 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005157 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005158 Py_DECREF(rep);
5159 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005160}
5161
5162/* handle an error in PyUnicode_EncodeCharmap
5163 Return 0 on success, -1 on error */
5164static
5165int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005166 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005167 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005168 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005169 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005170{
5171 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005172 Py_ssize_t repsize;
5173 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005174 Py_UNICODE *uni2;
5175 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005176 Py_ssize_t collstartpos = *inpos;
5177 Py_ssize_t collendpos = *inpos+1;
5178 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005179 char *encoding = "charmap";
5180 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005181 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005182
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005183 /* find all unencodable characters */
5184 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005185 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005186 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005187 int res = encoding_map_lookup(p[collendpos], mapping);
5188 if (res != -1)
5189 break;
5190 ++collendpos;
5191 continue;
5192 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005193
Benjamin Peterson29060642009-01-31 22:14:21 +00005194 rep = charmapencode_lookup(p[collendpos], mapping);
5195 if (rep==NULL)
5196 return -1;
5197 else if (rep!=Py_None) {
5198 Py_DECREF(rep);
5199 break;
5200 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005201 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005202 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005203 }
5204 /* cache callback name lookup
5205 * (if not done yet, i.e. it's the first error) */
5206 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005207 if ((errors==NULL) || (!strcmp(errors, "strict")))
5208 *known_errorHandler = 1;
5209 else if (!strcmp(errors, "replace"))
5210 *known_errorHandler = 2;
5211 else if (!strcmp(errors, "ignore"))
5212 *known_errorHandler = 3;
5213 else if (!strcmp(errors, "xmlcharrefreplace"))
5214 *known_errorHandler = 4;
5215 else
5216 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005217 }
5218 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005219 case 1: /* strict */
5220 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5221 return -1;
5222 case 2: /* replace */
5223 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005224 x = charmapencode_output('?', mapping, res, respos);
5225 if (x==enc_EXCEPTION) {
5226 return -1;
5227 }
5228 else if (x==enc_FAILED) {
5229 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5230 return -1;
5231 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005232 }
5233 /* fall through */
5234 case 3: /* ignore */
5235 *inpos = collendpos;
5236 break;
5237 case 4: /* xmlcharrefreplace */
5238 /* generate replacement (temporarily (mis)uses p) */
5239 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005240 char buffer[2+29+1+1];
5241 char *cp;
5242 sprintf(buffer, "&#%d;", (int)p[collpos]);
5243 for (cp = buffer; *cp; ++cp) {
5244 x = charmapencode_output(*cp, mapping, res, respos);
5245 if (x==enc_EXCEPTION)
5246 return -1;
5247 else if (x==enc_FAILED) {
5248 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5249 return -1;
5250 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005251 }
5252 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005253 *inpos = collendpos;
5254 break;
5255 default:
5256 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005257 encoding, reason, p, size, exceptionObject,
5258 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005259 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005260 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005261 if (PyBytes_Check(repunicode)) {
5262 /* Directly copy bytes result to output. */
5263 Py_ssize_t outsize = PyBytes_Size(*res);
5264 Py_ssize_t requiredsize;
5265 repsize = PyBytes_Size(repunicode);
5266 requiredsize = *respos + repsize;
5267 if (requiredsize > outsize)
5268 /* Make room for all additional bytes. */
5269 if (charmapencode_resize(res, respos, requiredsize)) {
5270 Py_DECREF(repunicode);
5271 return -1;
5272 }
5273 memcpy(PyBytes_AsString(*res) + *respos,
5274 PyBytes_AsString(repunicode), repsize);
5275 *respos += repsize;
5276 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005277 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005278 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005279 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005280 /* generate replacement */
5281 repsize = PyUnicode_GET_SIZE(repunicode);
5282 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005283 x = charmapencode_output(*uni2, mapping, res, respos);
5284 if (x==enc_EXCEPTION) {
5285 return -1;
5286 }
5287 else if (x==enc_FAILED) {
5288 Py_DECREF(repunicode);
5289 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5290 return -1;
5291 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005292 }
5293 *inpos = newpos;
5294 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005295 }
5296 return 0;
5297}
5298
Guido van Rossumd57fd912000-03-10 22:53:23 +00005299PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005300 Py_ssize_t size,
5301 PyObject *mapping,
5302 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005303{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005304 /* output object */
5305 PyObject *res = NULL;
5306 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005307 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005308 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005309 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005310 PyObject *errorHandler = NULL;
5311 PyObject *exc = NULL;
5312 /* the following variable is used for caching string comparisons
5313 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5314 * 3=ignore, 4=xmlcharrefreplace */
5315 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005316
5317 /* Default to Latin-1 */
5318 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005319 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005320
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005321 /* allocate enough for a simple encoding without
5322 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005323 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005324 if (res == NULL)
5325 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005326 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005327 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005328
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005329 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005330 /* try to encode it */
5331 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5332 if (x==enc_EXCEPTION) /* error */
5333 goto onError;
5334 if (x==enc_FAILED) { /* unencodable character */
5335 if (charmap_encoding_error(p, size, &inpos, mapping,
5336 &exc,
5337 &known_errorHandler, &errorHandler, errors,
5338 &res, &respos)) {
5339 goto onError;
5340 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005341 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005342 else
5343 /* done with this character => adjust input position */
5344 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005345 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005346
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005347 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005348 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005349 if (_PyBytes_Resize(&res, respos) < 0)
5350 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005351
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005352 Py_XDECREF(exc);
5353 Py_XDECREF(errorHandler);
5354 return res;
5355
Benjamin Peterson29060642009-01-31 22:14:21 +00005356 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005357 Py_XDECREF(res);
5358 Py_XDECREF(exc);
5359 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005360 return NULL;
5361}
5362
5363PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00005364 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005365{
5366 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005367 PyErr_BadArgument();
5368 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005369 }
5370 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005371 PyUnicode_GET_SIZE(unicode),
5372 mapping,
5373 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005374}
5375
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005376/* create or adjust a UnicodeTranslateError */
5377static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005378 const Py_UNICODE *unicode, Py_ssize_t size,
5379 Py_ssize_t startpos, Py_ssize_t endpos,
5380 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005381{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005382 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005383 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005384 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005385 }
5386 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005387 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5388 goto onError;
5389 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5390 goto onError;
5391 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5392 goto onError;
5393 return;
5394 onError:
5395 Py_DECREF(*exceptionObject);
5396 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005397 }
5398}
5399
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005400/* raises a UnicodeTranslateError */
5401static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005402 const Py_UNICODE *unicode, Py_ssize_t size,
5403 Py_ssize_t startpos, Py_ssize_t endpos,
5404 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005405{
5406 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005407 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005408 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005409 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005410}
5411
5412/* error handling callback helper:
5413 build arguments, call the callback and check the arguments,
5414 put the result into newpos and return the replacement string, which
5415 has to be freed by the caller */
5416static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00005417 PyObject **errorHandler,
5418 const char *reason,
5419 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5420 Py_ssize_t startpos, Py_ssize_t endpos,
5421 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005422{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005423 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005424
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005425 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005426 PyObject *restuple;
5427 PyObject *resunicode;
5428
5429 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005430 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005431 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005432 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005433 }
5434
5435 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005436 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005437 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005438 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005439
5440 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005441 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005442 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005443 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005444 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005445 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005446 Py_DECREF(restuple);
5447 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005448 }
5449 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00005450 &resunicode, &i_newpos)) {
5451 Py_DECREF(restuple);
5452 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005453 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005454 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005455 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005456 else
5457 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005458 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005459 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5460 Py_DECREF(restuple);
5461 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005462 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005463 Py_INCREF(resunicode);
5464 Py_DECREF(restuple);
5465 return resunicode;
5466}
5467
5468/* Lookup the character ch in the mapping and put the result in result,
5469 which must be decrefed by the caller.
5470 Return 0 on success, -1 on error */
5471static
5472int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5473{
Christian Heimes217cfd12007-12-02 14:31:20 +00005474 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005475 PyObject *x;
5476
5477 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005478 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005479 x = PyObject_GetItem(mapping, w);
5480 Py_DECREF(w);
5481 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005482 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5483 /* No mapping found means: use 1:1 mapping. */
5484 PyErr_Clear();
5485 *result = NULL;
5486 return 0;
5487 } else
5488 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005489 }
5490 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005491 *result = x;
5492 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005493 }
Christian Heimes217cfd12007-12-02 14:31:20 +00005494 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005495 long value = PyLong_AS_LONG(x);
5496 long max = PyUnicode_GetMax();
5497 if (value < 0 || value > max) {
5498 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00005499 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00005500 Py_DECREF(x);
5501 return -1;
5502 }
5503 *result = x;
5504 return 0;
5505 }
5506 else if (PyUnicode_Check(x)) {
5507 *result = x;
5508 return 0;
5509 }
5510 else {
5511 /* wrong return value */
5512 PyErr_SetString(PyExc_TypeError,
5513 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005514 Py_DECREF(x);
5515 return -1;
5516 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005517}
5518/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00005519 if not reallocate and adjust various state variables.
5520 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005521static
Walter Dörwald4894c302003-10-24 14:25:28 +00005522int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005523 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005524{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005525 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00005526 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005527 /* remember old output position */
5528 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
5529 /* exponentially overallocate to minimize reallocations */
5530 if (requiredsize < 2 * oldsize)
5531 requiredsize = 2 * oldsize;
5532 if (PyUnicode_Resize(outobj, requiredsize) < 0)
5533 return -1;
5534 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005535 }
5536 return 0;
5537}
5538/* lookup the character, put the result in the output string and adjust
5539 various state variables. Return a new reference to the object that
5540 was put in the output buffer in *result, or Py_None, if the mapping was
5541 undefined (in which case no character was written).
5542 The called must decref result.
5543 Return 0 on success, -1 on error. */
5544static
Walter Dörwald4894c302003-10-24 14:25:28 +00005545int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005546 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5547 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005548{
Walter Dörwald4894c302003-10-24 14:25:28 +00005549 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00005550 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005551 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005552 /* not found => default to 1:1 mapping */
5553 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005554 }
5555 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005556 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00005557 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005558 /* no overflow check, because we know that the space is enough */
5559 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005560 }
5561 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005562 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5563 if (repsize==1) {
5564 /* no overflow check, because we know that the space is enough */
5565 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5566 }
5567 else if (repsize!=0) {
5568 /* more than one character */
5569 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5570 (insize - (curinp-startinp)) +
5571 repsize - 1;
5572 if (charmaptranslate_makespace(outobj, outp, requiredsize))
5573 return -1;
5574 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5575 *outp += repsize;
5576 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005577 }
5578 else
Benjamin Peterson29060642009-01-31 22:14:21 +00005579 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005580 return 0;
5581}
5582
5583PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005584 Py_ssize_t size,
5585 PyObject *mapping,
5586 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005587{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005588 /* output object */
5589 PyObject *res = NULL;
5590 /* pointers to the beginning and end+1 of input */
5591 const Py_UNICODE *startp = p;
5592 const Py_UNICODE *endp = p + size;
5593 /* pointer into the output */
5594 Py_UNICODE *str;
5595 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005596 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005597 char *reason = "character maps to <undefined>";
5598 PyObject *errorHandler = NULL;
5599 PyObject *exc = NULL;
5600 /* the following variable is used for caching string comparisons
5601 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5602 * 3=ignore, 4=xmlcharrefreplace */
5603 int known_errorHandler = -1;
5604
Guido van Rossumd57fd912000-03-10 22:53:23 +00005605 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005606 PyErr_BadArgument();
5607 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005608 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005609
5610 /* allocate enough for a simple 1:1 translation without
5611 replacements, if we need more, we'll resize */
5612 res = PyUnicode_FromUnicode(NULL, size);
5613 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005614 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005615 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005616 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005617 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005618
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005619 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005620 /* try to encode it */
5621 PyObject *x = NULL;
5622 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5623 Py_XDECREF(x);
5624 goto onError;
5625 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005626 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00005627 if (x!=Py_None) /* it worked => adjust input pointer */
5628 ++p;
5629 else { /* untranslatable character */
5630 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5631 Py_ssize_t repsize;
5632 Py_ssize_t newpos;
5633 Py_UNICODE *uni2;
5634 /* startpos for collecting untranslatable chars */
5635 const Py_UNICODE *collstart = p;
5636 const Py_UNICODE *collend = p+1;
5637 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005638
Benjamin Peterson29060642009-01-31 22:14:21 +00005639 /* find all untranslatable characters */
5640 while (collend < endp) {
5641 if (charmaptranslate_lookup(*collend, mapping, &x))
5642 goto onError;
5643 Py_XDECREF(x);
5644 if (x!=Py_None)
5645 break;
5646 ++collend;
5647 }
5648 /* cache callback name lookup
5649 * (if not done yet, i.e. it's the first error) */
5650 if (known_errorHandler==-1) {
5651 if ((errors==NULL) || (!strcmp(errors, "strict")))
5652 known_errorHandler = 1;
5653 else if (!strcmp(errors, "replace"))
5654 known_errorHandler = 2;
5655 else if (!strcmp(errors, "ignore"))
5656 known_errorHandler = 3;
5657 else if (!strcmp(errors, "xmlcharrefreplace"))
5658 known_errorHandler = 4;
5659 else
5660 known_errorHandler = 0;
5661 }
5662 switch (known_errorHandler) {
5663 case 1: /* strict */
5664 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005665 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005666 case 2: /* replace */
5667 /* No need to check for space, this is a 1:1 replacement */
5668 for (coll = collstart; coll<collend; ++coll)
5669 *str++ = '?';
5670 /* fall through */
5671 case 3: /* ignore */
5672 p = collend;
5673 break;
5674 case 4: /* xmlcharrefreplace */
5675 /* generate replacement (temporarily (mis)uses p) */
5676 for (p = collstart; p < collend; ++p) {
5677 char buffer[2+29+1+1];
5678 char *cp;
5679 sprintf(buffer, "&#%d;", (int)*p);
5680 if (charmaptranslate_makespace(&res, &str,
5681 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5682 goto onError;
5683 for (cp = buffer; *cp; ++cp)
5684 *str++ = *cp;
5685 }
5686 p = collend;
5687 break;
5688 default:
5689 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5690 reason, startp, size, &exc,
5691 collstart-startp, collend-startp, &newpos);
5692 if (repunicode == NULL)
5693 goto onError;
5694 /* generate replacement */
5695 repsize = PyUnicode_GET_SIZE(repunicode);
5696 if (charmaptranslate_makespace(&res, &str,
5697 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5698 Py_DECREF(repunicode);
5699 goto onError;
5700 }
5701 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5702 *str++ = *uni2;
5703 p = startp + newpos;
5704 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005705 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005706 }
5707 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005708 /* Resize if we allocated to much */
5709 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005710 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005711 if (PyUnicode_Resize(&res, respos) < 0)
5712 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005713 }
5714 Py_XDECREF(exc);
5715 Py_XDECREF(errorHandler);
5716 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005717
Benjamin Peterson29060642009-01-31 22:14:21 +00005718 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005719 Py_XDECREF(res);
5720 Py_XDECREF(exc);
5721 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005722 return NULL;
5723}
5724
5725PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005726 PyObject *mapping,
5727 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005728{
5729 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005730
Guido van Rossumd57fd912000-03-10 22:53:23 +00005731 str = PyUnicode_FromObject(str);
5732 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005733 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005734 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00005735 PyUnicode_GET_SIZE(str),
5736 mapping,
5737 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005738 Py_DECREF(str);
5739 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005740
Benjamin Peterson29060642009-01-31 22:14:21 +00005741 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005742 Py_XDECREF(str);
5743 return NULL;
5744}
Tim Petersced69f82003-09-16 20:30:58 +00005745
Guido van Rossum9e896b32000-04-05 20:11:21 +00005746/* --- Decimal Encoder ---------------------------------------------------- */
5747
5748int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005749 Py_ssize_t length,
5750 char *output,
5751 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005752{
5753 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005754 PyObject *errorHandler = NULL;
5755 PyObject *exc = NULL;
5756 const char *encoding = "decimal";
5757 const char *reason = "invalid decimal Unicode string";
5758 /* the following variable is used for caching string comparisons
5759 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5760 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005761
5762 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005763 PyErr_BadArgument();
5764 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005765 }
5766
5767 p = s;
5768 end = s + length;
5769 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005770 register Py_UNICODE ch = *p;
5771 int decimal;
5772 PyObject *repunicode;
5773 Py_ssize_t repsize;
5774 Py_ssize_t newpos;
5775 Py_UNICODE *uni2;
5776 Py_UNICODE *collstart;
5777 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005778
Benjamin Peterson29060642009-01-31 22:14:21 +00005779 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005780 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00005781 ++p;
5782 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005783 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005784 decimal = Py_UNICODE_TODECIMAL(ch);
5785 if (decimal >= 0) {
5786 *output++ = '0' + decimal;
5787 ++p;
5788 continue;
5789 }
5790 if (0 < ch && ch < 256) {
5791 *output++ = (char)ch;
5792 ++p;
5793 continue;
5794 }
5795 /* All other characters are considered unencodable */
5796 collstart = p;
5797 collend = p+1;
5798 while (collend < end) {
5799 if ((0 < *collend && *collend < 256) ||
5800 !Py_UNICODE_ISSPACE(*collend) ||
5801 Py_UNICODE_TODECIMAL(*collend))
5802 break;
5803 }
5804 /* cache callback name lookup
5805 * (if not done yet, i.e. it's the first error) */
5806 if (known_errorHandler==-1) {
5807 if ((errors==NULL) || (!strcmp(errors, "strict")))
5808 known_errorHandler = 1;
5809 else if (!strcmp(errors, "replace"))
5810 known_errorHandler = 2;
5811 else if (!strcmp(errors, "ignore"))
5812 known_errorHandler = 3;
5813 else if (!strcmp(errors, "xmlcharrefreplace"))
5814 known_errorHandler = 4;
5815 else
5816 known_errorHandler = 0;
5817 }
5818 switch (known_errorHandler) {
5819 case 1: /* strict */
5820 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5821 goto onError;
5822 case 2: /* replace */
5823 for (p = collstart; p < collend; ++p)
5824 *output++ = '?';
5825 /* fall through */
5826 case 3: /* ignore */
5827 p = collend;
5828 break;
5829 case 4: /* xmlcharrefreplace */
5830 /* generate replacement (temporarily (mis)uses p) */
5831 for (p = collstart; p < collend; ++p)
5832 output += sprintf(output, "&#%d;", (int)*p);
5833 p = collend;
5834 break;
5835 default:
5836 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5837 encoding, reason, s, length, &exc,
5838 collstart-s, collend-s, &newpos);
5839 if (repunicode == NULL)
5840 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005841 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00005842 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005843 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
5844 Py_DECREF(repunicode);
5845 goto onError;
5846 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005847 /* generate replacement */
5848 repsize = PyUnicode_GET_SIZE(repunicode);
5849 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5850 Py_UNICODE ch = *uni2;
5851 if (Py_UNICODE_ISSPACE(ch))
5852 *output++ = ' ';
5853 else {
5854 decimal = Py_UNICODE_TODECIMAL(ch);
5855 if (decimal >= 0)
5856 *output++ = '0' + decimal;
5857 else if (0 < ch && ch < 256)
5858 *output++ = (char)ch;
5859 else {
5860 Py_DECREF(repunicode);
5861 raise_encode_exception(&exc, encoding,
5862 s, length, collstart-s, collend-s, reason);
5863 goto onError;
5864 }
5865 }
5866 }
5867 p = s + newpos;
5868 Py_DECREF(repunicode);
5869 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005870 }
5871 /* 0-terminate the output string */
5872 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005873 Py_XDECREF(exc);
5874 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005875 return 0;
5876
Benjamin Peterson29060642009-01-31 22:14:21 +00005877 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005878 Py_XDECREF(exc);
5879 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005880 return -1;
5881}
5882
Guido van Rossumd57fd912000-03-10 22:53:23 +00005883/* --- Helpers ------------------------------------------------------------ */
5884
Eric Smith8c663262007-08-25 02:26:07 +00005885#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005886#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00005887
Thomas Wouters477c8d52006-05-27 19:21:47 +00005888#include "stringlib/count.h"
5889#include "stringlib/find.h"
5890#include "stringlib/partition.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00005891#include "stringlib/split.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005892
Eric Smith5807c412008-05-11 21:00:57 +00005893#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00005894#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00005895#include "stringlib/localeutil.h"
5896
/* Helper macro to fix up start/end slice values, in place, against a
   sequence of length len:

   - an end greater than len is clamped to len;
   - a negative end or start is taken relative to len and, if still
     negative, clamped to 0.

   start is not clamped to len, so start may end up greater than end;
   callers have to cope with an empty (or negative-length) slice.

   start and end must be modifiable lvalues.  All three arguments may be
   evaluated more than once, so they must not have side effects.

   The body is wrapped in do { ... } while (0) so that the macro expands to
   exactly one statement and can be used safely in an unbraced if/else.

   NOTE(review): the #undef is defensive, in case one of the stringlib
   headers included above already provides a macro of the same name --
   confirm and drop it if that is not the case. */
#undef ADJUST_INDICES
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00005911
Martin v. Löwis18e16552006-02-15 17:27:45 +00005912Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005913 PyObject *substr,
5914 Py_ssize_t start,
5915 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005917 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005918 PyUnicodeObject* str_obj;
5919 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005920
Thomas Wouters477c8d52006-05-27 19:21:47 +00005921 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5922 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00005923 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005924 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5925 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005926 Py_DECREF(str_obj);
5927 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005928 }
Tim Petersced69f82003-09-16 20:30:58 +00005929
Antoine Pitrouf2c54842010-01-13 08:07:53 +00005930 ADJUST_INDICES(start, end, str_obj->length);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005931 result = stringlib_count(
Antoine Pitrouf2c54842010-01-13 08:07:53 +00005932 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
5933 PY_SSIZE_T_MAX
Thomas Wouters477c8d52006-05-27 19:21:47 +00005934 );
5935
5936 Py_DECREF(sub_obj);
5937 Py_DECREF(str_obj);
5938
Guido van Rossumd57fd912000-03-10 22:53:23 +00005939 return result;
5940}
5941
Martin v. Löwis18e16552006-02-15 17:27:45 +00005942Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005943 PyObject *sub,
5944 Py_ssize_t start,
5945 Py_ssize_t end,
5946 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005947{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005948 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005949
Guido van Rossumd57fd912000-03-10 22:53:23 +00005950 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005951 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00005952 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005953 sub = PyUnicode_FromObject(sub);
5954 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005955 Py_DECREF(str);
5956 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005957 }
Tim Petersced69f82003-09-16 20:30:58 +00005958
Thomas Wouters477c8d52006-05-27 19:21:47 +00005959 if (direction > 0)
5960 result = stringlib_find_slice(
5961 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5962 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5963 start, end
5964 );
5965 else
5966 result = stringlib_rfind_slice(
5967 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5968 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5969 start, end
5970 );
5971
Guido van Rossumd57fd912000-03-10 22:53:23 +00005972 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005973 Py_DECREF(sub);
5974
Guido van Rossumd57fd912000-03-10 22:53:23 +00005975 return result;
5976}
5977
Tim Petersced69f82003-09-16 20:30:58 +00005978static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005979int tailmatch(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00005980 PyUnicodeObject *substring,
5981 Py_ssize_t start,
5982 Py_ssize_t end,
5983 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005984{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005985 if (substring->length == 0)
5986 return 1;
5987
Antoine Pitrouf2c54842010-01-13 08:07:53 +00005988 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005989 end -= substring->length;
5990 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00005991 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005992
5993 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005994 if (Py_UNICODE_MATCH(self, end, substring))
5995 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005996 } else {
5997 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00005998 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005999 }
6000
6001 return 0;
6002}
6003
Martin v. Löwis18e16552006-02-15 17:27:45 +00006004Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006005 PyObject *substr,
6006 Py_ssize_t start,
6007 Py_ssize_t end,
6008 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006009{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006010 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006011
Guido van Rossumd57fd912000-03-10 22:53:23 +00006012 str = PyUnicode_FromObject(str);
6013 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006014 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006015 substr = PyUnicode_FromObject(substr);
6016 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006017 Py_DECREF(str);
6018 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006019 }
Tim Petersced69f82003-09-16 20:30:58 +00006020
Guido van Rossumd57fd912000-03-10 22:53:23 +00006021 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006022 (PyUnicodeObject *)substr,
6023 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006024 Py_DECREF(str);
6025 Py_DECREF(substr);
6026 return result;
6027}
6028
Guido van Rossumd57fd912000-03-10 22:53:23 +00006029/* Apply fixfct filter to the Unicode object self and return a
6030 reference to the modified object */
6031
Tim Petersced69f82003-09-16 20:30:58 +00006032static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006033PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006034 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006035{
6036
6037 PyUnicodeObject *u;
6038
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006039 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006040 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006041 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006042
6043 Py_UNICODE_COPY(u->str, self->str, self->length);
6044
Tim Peters7a29bd52001-09-12 03:03:31 +00006045 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006046 /* fixfct should return TRUE if it modified the buffer. If
6047 FALSE, return a reference to the original buffer instead
6048 (to save space, not time) */
6049 Py_INCREF(self);
6050 Py_DECREF(u);
6051 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006052 }
6053 return (PyObject*) u;
6054}
6055
Tim Petersced69f82003-09-16 20:30:58 +00006056static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006057int fixupper(PyUnicodeObject *self)
6058{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006059 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006060 Py_UNICODE *s = self->str;
6061 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006062
Guido van Rossumd57fd912000-03-10 22:53:23 +00006063 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006064 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006065
Benjamin Peterson29060642009-01-31 22:14:21 +00006066 ch = Py_UNICODE_TOUPPER(*s);
6067 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006068 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006069 *s = ch;
6070 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006071 s++;
6072 }
6073
6074 return status;
6075}
6076
Tim Petersced69f82003-09-16 20:30:58 +00006077static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006078int fixlower(PyUnicodeObject *self)
6079{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006080 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006081 Py_UNICODE *s = self->str;
6082 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006083
Guido van Rossumd57fd912000-03-10 22:53:23 +00006084 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006085 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006086
Benjamin Peterson29060642009-01-31 22:14:21 +00006087 ch = Py_UNICODE_TOLOWER(*s);
6088 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006089 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006090 *s = ch;
6091 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006092 s++;
6093 }
6094
6095 return status;
6096}
6097
Tim Petersced69f82003-09-16 20:30:58 +00006098static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006099int fixswapcase(PyUnicodeObject *self)
6100{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006101 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006102 Py_UNICODE *s = self->str;
6103 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006104
Guido van Rossumd57fd912000-03-10 22:53:23 +00006105 while (len-- > 0) {
6106 if (Py_UNICODE_ISUPPER(*s)) {
6107 *s = Py_UNICODE_TOLOWER(*s);
6108 status = 1;
6109 } else if (Py_UNICODE_ISLOWER(*s)) {
6110 *s = Py_UNICODE_TOUPPER(*s);
6111 status = 1;
6112 }
6113 s++;
6114 }
6115
6116 return status;
6117}
6118
Tim Petersced69f82003-09-16 20:30:58 +00006119static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006120int fixcapitalize(PyUnicodeObject *self)
6121{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006122 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006123 Py_UNICODE *s = self->str;
6124 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006125
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006126 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006127 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006128 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006129 *s = Py_UNICODE_TOUPPER(*s);
6130 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006131 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006132 s++;
6133 while (--len > 0) {
6134 if (Py_UNICODE_ISUPPER(*s)) {
6135 *s = Py_UNICODE_TOLOWER(*s);
6136 status = 1;
6137 }
6138 s++;
6139 }
6140 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006141}
6142
6143static
6144int fixtitle(PyUnicodeObject *self)
6145{
6146 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6147 register Py_UNICODE *e;
6148 int previous_is_cased;
6149
6150 /* Shortcut for single character strings */
6151 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006152 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6153 if (*p != ch) {
6154 *p = ch;
6155 return 1;
6156 }
6157 else
6158 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006159 }
Tim Petersced69f82003-09-16 20:30:58 +00006160
Guido van Rossumd57fd912000-03-10 22:53:23 +00006161 e = p + PyUnicode_GET_SIZE(self);
6162 previous_is_cased = 0;
6163 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006164 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006165
Benjamin Peterson29060642009-01-31 22:14:21 +00006166 if (previous_is_cased)
6167 *p = Py_UNICODE_TOLOWER(ch);
6168 else
6169 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006170
Benjamin Peterson29060642009-01-31 22:14:21 +00006171 if (Py_UNICODE_ISLOWER(ch) ||
6172 Py_UNICODE_ISUPPER(ch) ||
6173 Py_UNICODE_ISTITLE(ch))
6174 previous_is_cased = 1;
6175 else
6176 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006177 }
6178 return 1;
6179}
6180
Tim Peters8ce9f162004-08-27 01:49:32 +00006181PyObject *
6182PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006183{
Skip Montanaro6543b452004-09-16 03:28:13 +00006184 const Py_UNICODE blank = ' ';
6185 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006186 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006187 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006188 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6189 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006190 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6191 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006192 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006193 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006194
Tim Peters05eba1f2004-08-27 21:32:02 +00006195 fseq = PySequence_Fast(seq, "");
6196 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006197 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006198 }
6199
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006200 /* NOTE: the following code can't call back into Python code,
6201 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006202 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006203
Tim Peters05eba1f2004-08-27 21:32:02 +00006204 seqlen = PySequence_Fast_GET_SIZE(fseq);
6205 /* If empty sequence, return u"". */
6206 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006207 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6208 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006209 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006210 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006211 /* If singleton sequence with an exact Unicode, return that. */
6212 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006213 item = items[0];
6214 if (PyUnicode_CheckExact(item)) {
6215 Py_INCREF(item);
6216 res = (PyUnicodeObject *)item;
6217 goto Done;
6218 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006219 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006220 else {
6221 /* Set up sep and seplen */
6222 if (separator == NULL) {
6223 sep = &blank;
6224 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006225 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006226 else {
6227 if (!PyUnicode_Check(separator)) {
6228 PyErr_Format(PyExc_TypeError,
6229 "separator: expected str instance,"
6230 " %.80s found",
6231 Py_TYPE(separator)->tp_name);
6232 goto onError;
6233 }
6234 sep = PyUnicode_AS_UNICODE(separator);
6235 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006236 }
6237 }
6238
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006239 /* There are at least two things to join, or else we have a subclass
6240 * of str in the sequence.
6241 * Do a pre-pass to figure out the total amount of space we'll
6242 * need (sz), and see whether all argument are strings.
6243 */
6244 sz = 0;
6245 for (i = 0; i < seqlen; i++) {
6246 const Py_ssize_t old_sz = sz;
6247 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006248 if (!PyUnicode_Check(item)) {
6249 PyErr_Format(PyExc_TypeError,
6250 "sequence item %zd: expected str instance,"
6251 " %.80s found",
6252 i, Py_TYPE(item)->tp_name);
6253 goto onError;
6254 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006255 sz += PyUnicode_GET_SIZE(item);
6256 if (i != 0)
6257 sz += seplen;
6258 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6259 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006260 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006261 goto onError;
6262 }
6263 }
Tim Petersced69f82003-09-16 20:30:58 +00006264
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006265 res = _PyUnicode_New(sz);
6266 if (res == NULL)
6267 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006268
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006269 /* Catenate everything. */
6270 res_p = PyUnicode_AS_UNICODE(res);
6271 for (i = 0; i < seqlen; ++i) {
6272 Py_ssize_t itemlen;
6273 item = items[i];
6274 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006275 /* Copy item, and maybe the separator. */
6276 if (i) {
6277 Py_UNICODE_COPY(res_p, sep, seplen);
6278 res_p += seplen;
6279 }
6280 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6281 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006282 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006283
Benjamin Peterson29060642009-01-31 22:14:21 +00006284 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006285 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006286 return (PyObject *)res;
6287
Benjamin Peterson29060642009-01-31 22:14:21 +00006288 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006289 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006290 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006291 return NULL;
6292}
6293
Tim Petersced69f82003-09-16 20:30:58 +00006294static
6295PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006296 Py_ssize_t left,
6297 Py_ssize_t right,
6298 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006299{
6300 PyUnicodeObject *u;
6301
6302 if (left < 0)
6303 left = 0;
6304 if (right < 0)
6305 right = 0;
6306
Tim Peters7a29bd52001-09-12 03:03:31 +00006307 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006308 Py_INCREF(self);
6309 return self;
6310 }
6311
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006312 if (left > PY_SSIZE_T_MAX - self->length ||
6313 right > PY_SSIZE_T_MAX - (left + self->length)) {
6314 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6315 return NULL;
6316 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006317 u = _PyUnicode_New(left + self->length + right);
6318 if (u) {
6319 if (left)
6320 Py_UNICODE_FILL(u->str, fill, left);
6321 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6322 if (right)
6323 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6324 }
6325
6326 return u;
6327}
6328
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006329PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006330{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006331 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006332
6333 string = PyUnicode_FromObject(string);
6334 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006335 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006336
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006337 list = stringlib_splitlines(
6338 (PyObject*) string, PyUnicode_AS_UNICODE(string),
6339 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006340
6341 Py_DECREF(string);
6342 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006343}
6344
Tim Petersced69f82003-09-16 20:30:58 +00006345static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006346PyObject *split(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006347 PyUnicodeObject *substring,
6348 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006349{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006350 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006351 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006352
Guido van Rossumd57fd912000-03-10 22:53:23 +00006353 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006354 return stringlib_split_whitespace(
6355 (PyObject*) self, self->str, self->length, maxcount
6356 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006357
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006358 return stringlib_split(
6359 (PyObject*) self, self->str, self->length,
6360 substring->str, substring->length,
6361 maxcount
6362 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006363}
6364
Tim Petersced69f82003-09-16 20:30:58 +00006365static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006366PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006367 PyUnicodeObject *substring,
6368 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006369{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006370 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006371 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006372
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006373 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006374 return stringlib_rsplit_whitespace(
6375 (PyObject*) self, self->str, self->length, maxcount
6376 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006377
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006378 return stringlib_rsplit(
6379 (PyObject*) self, self->str, self->length,
6380 substring->str, substring->length,
6381 maxcount
6382 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006383}
6384
6385static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006386PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006387 PyUnicodeObject *str1,
6388 PyUnicodeObject *str2,
6389 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006390{
6391 PyUnicodeObject *u;
6392
6393 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006394 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006395 else if (maxcount == 0 || self->length == 0)
6396 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006397
Thomas Wouters477c8d52006-05-27 19:21:47 +00006398 if (str1->length == str2->length) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00006399 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006400 /* same length */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006401 if (str1->length == 0)
6402 goto nothing;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006403 if (str1->length == 1) {
6404 /* replace characters */
6405 Py_UNICODE u1, u2;
6406 if (!findchar(self->str, self->length, str1->str[0]))
6407 goto nothing;
6408 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6409 if (!u)
6410 return NULL;
6411 Py_UNICODE_COPY(u->str, self->str, self->length);
6412 u1 = str1->str[0];
6413 u2 = str2->str[0];
6414 for (i = 0; i < u->length; i++)
6415 if (u->str[i] == u1) {
6416 if (--maxcount < 0)
6417 break;
6418 u->str[i] = u2;
6419 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006420 } else {
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006421 i = stringlib_find(
6422 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00006423 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006424 if (i < 0)
6425 goto nothing;
6426 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6427 if (!u)
6428 return NULL;
6429 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006430
6431 /* change everything in-place, starting with this one */
6432 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6433 i += str1->length;
6434
6435 while ( --maxcount > 0) {
6436 i = stringlib_find(self->str+i, self->length-i,
6437 str1->str, str1->length,
6438 i);
6439 if (i == -1)
6440 break;
6441 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6442 i += str1->length;
6443 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006444 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006445 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006446
6447 Py_ssize_t n, i, j, e;
6448 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006449 Py_UNICODE *p;
6450
6451 /* replace strings */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006452 n = stringlib_count(self->str, self->length, str1->str, str1->length,
6453 maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006454 if (n == 0)
6455 goto nothing;
6456 /* new_size = self->length + n * (str2->length - str1->length)); */
6457 delta = (str2->length - str1->length);
6458 if (delta == 0) {
6459 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006460 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006461 product = n * (str2->length - str1->length);
6462 if ((product / (str2->length - str1->length)) != n) {
6463 PyErr_SetString(PyExc_OverflowError,
6464 "replace string is too long");
6465 return NULL;
6466 }
6467 new_size = self->length + product;
6468 if (new_size < 0) {
6469 PyErr_SetString(PyExc_OverflowError,
6470 "replace string is too long");
6471 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006472 }
6473 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006474 u = _PyUnicode_New(new_size);
6475 if (!u)
6476 return NULL;
6477 i = 0;
6478 p = u->str;
6479 e = self->length - str1->length;
6480 if (str1->length > 0) {
6481 while (n-- > 0) {
6482 /* look for next match */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006483 j = stringlib_find(self->str+i, self->length-i,
6484 str1->str, str1->length,
6485 i);
6486 if (j == -1)
6487 break;
6488 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006489 /* copy unchanged part [i:j] */
6490 Py_UNICODE_COPY(p, self->str+i, j-i);
6491 p += j - i;
6492 }
6493 /* copy substitution string */
6494 if (str2->length > 0) {
6495 Py_UNICODE_COPY(p, str2->str, str2->length);
6496 p += str2->length;
6497 }
6498 i = j + str1->length;
6499 }
6500 if (i < self->length)
6501 /* copy tail [i:] */
6502 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6503 } else {
6504 /* interleave */
6505 while (n > 0) {
6506 Py_UNICODE_COPY(p, str2->str, str2->length);
6507 p += str2->length;
6508 if (--n <= 0)
6509 break;
6510 *p++ = self->str[i++];
6511 }
6512 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6513 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006514 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006515 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006516
Benjamin Peterson29060642009-01-31 22:14:21 +00006517 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00006518 /* nothing to replace; return original string (when possible) */
6519 if (PyUnicode_CheckExact(self)) {
6520 Py_INCREF(self);
6521 return (PyObject *) self;
6522 }
6523 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006524}
6525
6526/* --- Unicode Object Methods --------------------------------------------- */
6527
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006528PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006529 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006530\n\
6531Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006532characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006533
6534static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006535unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006536{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006537 return fixup(self, fixtitle);
6538}
6539
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006540PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006541 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006542\n\
6543Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006544have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006545
6546static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006547unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006548{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006549 return fixup(self, fixcapitalize);
6550}
6551
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out).  Splits self on whitespace, capitalizes every
   word with fixup(..., fixcapitalize) and joins the words again with the
   default separator of PyUnicode_Join().  Returns NULL on failure. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entries in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                            fixcapitalize);
        if (capitalized == NULL)
            goto done;      /* result is still NULL */
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

  done:
    Py_DECREF(words);
    return result;
}
#endif
6589
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006590/* Argument converter. Coerces to a single unicode character */
6591
6592static int
6593convert_uc(PyObject *obj, void *addr)
6594{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006595 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6596 PyObject *uniobj;
6597 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006598
Benjamin Peterson14339b62009-01-31 16:36:08 +00006599 uniobj = PyUnicode_FromObject(obj);
6600 if (uniobj == NULL) {
6601 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006602 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006603 return 0;
6604 }
6605 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6606 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006607 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006608 Py_DECREF(uniobj);
6609 return 0;
6610 }
6611 unistr = PyUnicode_AS_UNICODE(uniobj);
6612 *fillcharloc = unistr[0];
6613 Py_DECREF(uniobj);
6614 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006615}
6616
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006617PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006618 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006619\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006620Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006621done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006622
6623static PyObject *
6624unicode_center(PyUnicodeObject *self, PyObject *args)
6625{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006626 Py_ssize_t marg, left;
6627 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006628 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006629
Thomas Woutersde017742006-02-16 19:34:37 +00006630 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006631 return NULL;
6632
Tim Peters7a29bd52001-09-12 03:03:31 +00006633 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006634 Py_INCREF(self);
6635 return (PyObject*) self;
6636 }
6637
6638 marg = width - self->length;
6639 left = marg / 2 + (marg & width & 1);
6640
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006641 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006642}
6643
Marc-André Lemburge5034372000-08-08 08:04:29 +00006644#if 0
6645
6646/* This code should go into some future Unicode collation support
6647 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00006648 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006649
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006650/* speedy UTF-16 code point order comparison */
6651/* gleaned from: */
6652/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6653
/* Adjustment table indexed by (code unit >> 11); added to a code unit by
   the disabled unicode_compare() below so that the resulting values are in
   UTF-32-compatible order. */
static short utf16Fixup[32] =
{
    /* 0x00 - 0x07 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0f */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10 - 0x17 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1f */ 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
};
6661
Guido van Rossumd57fd912000-03-10 22:53:23 +00006662static int
6663unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6664{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006665 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006666
Guido van Rossumd57fd912000-03-10 22:53:23 +00006667 Py_UNICODE *s1 = str1->str;
6668 Py_UNICODE *s2 = str2->str;
6669
6670 len1 = str1->length;
6671 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006672
Guido van Rossumd57fd912000-03-10 22:53:23 +00006673 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006674 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006675
6676 c1 = *s1++;
6677 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006678
Benjamin Peterson29060642009-01-31 22:14:21 +00006679 if (c1 > (1<<11) * 26)
6680 c1 += utf16Fixup[c1>>11];
6681 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006682 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006683 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006684
6685 if (c1 != c2)
6686 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006687
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006688 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006689 }
6690
6691 return (len1 < len2) ? -1 : (len1 != len2);
6692}
6693
Marc-André Lemburge5034372000-08-08 08:04:29 +00006694#else
6695
6696static int
6697unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6698{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006699 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006700
6701 Py_UNICODE *s1 = str1->str;
6702 Py_UNICODE *s2 = str2->str;
6703
6704 len1 = str1->length;
6705 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006706
Marc-André Lemburge5034372000-08-08 08:04:29 +00006707 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006708 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006709
Fredrik Lundh45714e92001-06-26 16:39:36 +00006710 c1 = *s1++;
6711 c2 = *s2++;
6712
6713 if (c1 != c2)
6714 return (c1 < c2) ? -1 : 1;
6715
Marc-André Lemburge5034372000-08-08 08:04:29 +00006716 len1--; len2--;
6717 }
6718
6719 return (len1 < len2) ? -1 : (len1 != len2);
6720}
6721
6722#endif
6723
Guido van Rossumd57fd912000-03-10 22:53:23 +00006724int PyUnicode_Compare(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00006725 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006726{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006727 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6728 return unicode_compare((PyUnicodeObject *)left,
6729 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006730 PyErr_Format(PyExc_TypeError,
6731 "Can't compare %.100s and %.100s",
6732 left->ob_type->tp_name,
6733 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006734 return -1;
6735}
6736
Martin v. Löwis5b222132007-06-10 09:51:05 +00006737int
6738PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6739{
6740 int i;
6741 Py_UNICODE *id;
6742 assert(PyUnicode_Check(uni));
6743 id = PyUnicode_AS_UNICODE(uni);
6744 /* Compare Unicode string and source character set string */
6745 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00006746 if (id[i] != str[i])
6747 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00006748 /* This check keeps Python strings that end in '\0' from comparing equal
6749 to C strings identical up to that point. */
6750 if (PyUnicode_GET_SIZE(uni) != i)
6751 /* We'll say the Python string is longer. */
6752 return 1;
Martin v. Löwis5b222132007-06-10 09:51:05 +00006753 if (id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006754 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006755 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006756 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006757 return 0;
6758}
6759
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006760
Benjamin Peterson29060642009-01-31 22:14:21 +00006761#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00006762 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006763
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006764PyObject *PyUnicode_RichCompare(PyObject *left,
6765 PyObject *right,
6766 int op)
6767{
6768 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006769
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006770 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
6771 PyObject *v;
6772 if (((PyUnicodeObject *) left)->length !=
6773 ((PyUnicodeObject *) right)->length) {
6774 if (op == Py_EQ) {
6775 Py_INCREF(Py_False);
6776 return Py_False;
6777 }
6778 if (op == Py_NE) {
6779 Py_INCREF(Py_True);
6780 return Py_True;
6781 }
6782 }
6783 if (left == right)
6784 result = 0;
6785 else
6786 result = unicode_compare((PyUnicodeObject *)left,
6787 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006788
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006789 /* Convert the return value to a Boolean */
6790 switch (op) {
6791 case Py_EQ:
6792 v = TEST_COND(result == 0);
6793 break;
6794 case Py_NE:
6795 v = TEST_COND(result != 0);
6796 break;
6797 case Py_LE:
6798 v = TEST_COND(result <= 0);
6799 break;
6800 case Py_GE:
6801 v = TEST_COND(result >= 0);
6802 break;
6803 case Py_LT:
6804 v = TEST_COND(result == -1);
6805 break;
6806 case Py_GT:
6807 v = TEST_COND(result == 1);
6808 break;
6809 default:
6810 PyErr_BadArgument();
6811 return NULL;
6812 }
6813 Py_INCREF(v);
6814 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006815 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006816
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006817 Py_INCREF(Py_NotImplemented);
6818 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006819}
6820
Guido van Rossum403d68b2000-03-13 15:55:09 +00006821int PyUnicode_Contains(PyObject *container,
Benjamin Peterson29060642009-01-31 22:14:21 +00006822 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00006823{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006824 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006825 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006826
6827 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006828 sub = PyUnicode_FromObject(element);
6829 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006830 PyErr_Format(PyExc_TypeError,
6831 "'in <string>' requires string as left operand, not %s",
6832 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006833 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006834 }
6835
Thomas Wouters477c8d52006-05-27 19:21:47 +00006836 str = PyUnicode_FromObject(container);
6837 if (!str) {
6838 Py_DECREF(sub);
6839 return -1;
6840 }
6841
6842 result = stringlib_contains_obj(str, sub);
6843
6844 Py_DECREF(str);
6845 Py_DECREF(sub);
6846
Guido van Rossum403d68b2000-03-13 15:55:09 +00006847 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006848}
6849
Guido van Rossumd57fd912000-03-10 22:53:23 +00006850/* Concat to string or Unicode object giving a new Unicode object. */
6851
6852PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00006853 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006854{
6855 PyUnicodeObject *u = NULL, *v = NULL, *w;
6856
6857 /* Coerce the two arguments */
6858 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6859 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006860 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006861 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6862 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006863 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006864
6865 /* Shortcuts */
6866 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006867 Py_DECREF(v);
6868 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006869 }
6870 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006871 Py_DECREF(u);
6872 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006873 }
6874
6875 /* Concat the two Unicode strings */
6876 w = _PyUnicode_New(u->length + v->length);
6877 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006878 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006879 Py_UNICODE_COPY(w->str, u->str, u->length);
6880 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6881
6882 Py_DECREF(u);
6883 Py_DECREF(v);
6884 return (PyObject *)w;
6885
Benjamin Peterson29060642009-01-31 22:14:21 +00006886 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006887 Py_XDECREF(u);
6888 Py_XDECREF(v);
6889 return NULL;
6890}
6891
Walter Dörwald1ab83302007-05-18 17:15:44 +00006892void
6893PyUnicode_Append(PyObject **pleft, PyObject *right)
6894{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006895 PyObject *new;
6896 if (*pleft == NULL)
6897 return;
6898 if (right == NULL || !PyUnicode_Check(*pleft)) {
6899 Py_DECREF(*pleft);
6900 *pleft = NULL;
6901 return;
6902 }
6903 new = PyUnicode_Concat(*pleft, right);
6904 Py_DECREF(*pleft);
6905 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00006906}
6907
6908void
6909PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6910{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006911 PyUnicode_Append(pleft, right);
6912 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00006913}
6914
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006915PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006916 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006917\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006918Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006919string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006920interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006921
6922static PyObject *
6923unicode_count(PyUnicodeObject *self, PyObject *args)
6924{
6925 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006926 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006927 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006928 PyObject *result;
6929
Guido van Rossumb8872e62000-05-09 14:14:27 +00006930 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson29060642009-01-31 22:14:21 +00006931 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006932 return NULL;
6933
6934 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006935 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006936 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006937 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006938
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006939 ADJUST_INDICES(start, end, self->length);
Christian Heimes217cfd12007-12-02 14:31:20 +00006940 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006941 stringlib_count(self->str + start, end - start,
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006942 substring->str, substring->length,
6943 PY_SSIZE_T_MAX)
Thomas Wouters477c8d52006-05-27 19:21:47 +00006944 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006945
6946 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006947
Guido van Rossumd57fd912000-03-10 22:53:23 +00006948 return result;
6949}
6950
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006951PyDoc_STRVAR(encode__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006952 "S.encode([encoding[, errors]]) -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006953\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00006954Encode S using the codec registered for encoding. encoding defaults\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006955to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006956handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006957a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6958'xmlcharrefreplace' as well as any other name registered with\n\
6959codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006960
6961static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00006962unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006963{
Benjamin Peterson308d6372009-09-18 21:42:35 +00006964 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00006965 char *encoding = NULL;
6966 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006967 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00006968
Benjamin Peterson308d6372009-09-18 21:42:35 +00006969 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
6970 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006971 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00006972 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006973 if (v == NULL)
6974 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00006975 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006976 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006977 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006978 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00006979 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006980 Py_DECREF(v);
6981 return NULL;
6982 }
6983 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006984
Benjamin Peterson29060642009-01-31 22:14:21 +00006985 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006986 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006987}
6988
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006989PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006990 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006991\n\
6992Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006993If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006994
6995static PyObject*
6996unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6997{
6998 Py_UNICODE *e;
6999 Py_UNICODE *p;
7000 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007001 Py_UNICODE *qe;
7002 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007003 PyUnicodeObject *u;
7004 int tabsize = 8;
7005
7006 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007007 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007008
Thomas Wouters7e474022000-07-16 12:04:32 +00007009 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007010 i = 0; /* chars up to and including most recent \n or \r */
7011 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7012 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007013 for (p = self->str; p < e; p++)
7014 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007015 if (tabsize > 0) {
7016 incr = tabsize - (j % tabsize); /* cannot overflow */
7017 if (j > PY_SSIZE_T_MAX - incr)
7018 goto overflow1;
7019 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007020 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007021 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007022 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007023 if (j > PY_SSIZE_T_MAX - 1)
7024 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007025 j++;
7026 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007027 if (i > PY_SSIZE_T_MAX - j)
7028 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007029 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007030 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007031 }
7032 }
7033
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007034 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007035 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007036
Guido van Rossumd57fd912000-03-10 22:53:23 +00007037 /* Second pass: create output string and fill it */
7038 u = _PyUnicode_New(i + j);
7039 if (!u)
7040 return NULL;
7041
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007042 j = 0; /* same as in first pass */
7043 q = u->str; /* next output char */
7044 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007045
7046 for (p = self->str; p < e; p++)
7047 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007048 if (tabsize > 0) {
7049 i = tabsize - (j % tabsize);
7050 j += i;
7051 while (i--) {
7052 if (q >= qe)
7053 goto overflow2;
7054 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007055 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007056 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007057 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007058 else {
7059 if (q >= qe)
7060 goto overflow2;
7061 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007062 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007063 if (*p == '\n' || *p == '\r')
7064 j = 0;
7065 }
7066
7067 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007068
7069 overflow2:
7070 Py_DECREF(u);
7071 overflow1:
7072 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7073 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007074}
7075
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007076PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007077 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007078\n\
7079Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007080such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007081arguments start and end are interpreted as in slice notation.\n\
7082\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007083Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007084
7085static PyObject *
7086unicode_find(PyUnicodeObject *self, PyObject *args)
7087{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007088 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007089 Py_ssize_t start;
7090 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007091 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007092
Christian Heimes9cd17752007-11-18 19:35:23 +00007093 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007094 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007095
Thomas Wouters477c8d52006-05-27 19:21:47 +00007096 result = stringlib_find_slice(
7097 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7098 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7099 start, end
7100 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007101
7102 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007103
Christian Heimes217cfd12007-12-02 14:31:20 +00007104 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007105}
7106
7107static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007108unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007109{
7110 if (index < 0 || index >= self->length) {
7111 PyErr_SetString(PyExc_IndexError, "string index out of range");
7112 return NULL;
7113 }
7114
7115 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7116}
7117
Guido van Rossumc2504932007-09-18 19:42:40 +00007118/* Believe it or not, this produces the same value for ASCII strings
7119 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007120static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007121unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007122{
Guido van Rossumc2504932007-09-18 19:42:40 +00007123 Py_ssize_t len;
7124 Py_UNICODE *p;
7125 long x;
7126
7127 if (self->hash != -1)
7128 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007129 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007130 p = self->str;
7131 x = *p << 7;
7132 while (--len >= 0)
7133 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007134 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007135 if (x == -1)
7136 x = -2;
7137 self->hash = x;
7138 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007139}
7140
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007141PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007142 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007143\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007144Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007145
7146static PyObject *
7147unicode_index(PyUnicodeObject *self, PyObject *args)
7148{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007149 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007150 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007151 Py_ssize_t start;
7152 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007153
Christian Heimes9cd17752007-11-18 19:35:23 +00007154 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007155 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007156
Thomas Wouters477c8d52006-05-27 19:21:47 +00007157 result = stringlib_find_slice(
7158 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7159 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7160 start, end
7161 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007162
7163 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007164
Guido van Rossumd57fd912000-03-10 22:53:23 +00007165 if (result < 0) {
7166 PyErr_SetString(PyExc_ValueError, "substring not found");
7167 return NULL;
7168 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007169
Christian Heimes217cfd12007-12-02 14:31:20 +00007170 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007171}
7172
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007173PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007174 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007175\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007176Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007177at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007178
7179static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007180unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007181{
7182 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7183 register const Py_UNICODE *e;
7184 int cased;
7185
Guido van Rossumd57fd912000-03-10 22:53:23 +00007186 /* Shortcut for single character strings */
7187 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007188 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007189
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007190 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007191 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007192 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007193
Guido van Rossumd57fd912000-03-10 22:53:23 +00007194 e = p + PyUnicode_GET_SIZE(self);
7195 cased = 0;
7196 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007197 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007198
Benjamin Peterson29060642009-01-31 22:14:21 +00007199 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7200 return PyBool_FromLong(0);
7201 else if (!cased && Py_UNICODE_ISLOWER(ch))
7202 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007203 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007204 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007205}
7206
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007207PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007208 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007209\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007210Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007211at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007212
7213static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007214unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007215{
7216 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7217 register const Py_UNICODE *e;
7218 int cased;
7219
Guido van Rossumd57fd912000-03-10 22:53:23 +00007220 /* Shortcut for single character strings */
7221 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007222 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007223
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007224 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007225 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007226 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007227
Guido van Rossumd57fd912000-03-10 22:53:23 +00007228 e = p + PyUnicode_GET_SIZE(self);
7229 cased = 0;
7230 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007231 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007232
Benjamin Peterson29060642009-01-31 22:14:21 +00007233 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7234 return PyBool_FromLong(0);
7235 else if (!cased && Py_UNICODE_ISUPPER(ch))
7236 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007237 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007238 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007239}
7240
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007241PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007242 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007243\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007244Return True if S is a titlecased string and there is at least one\n\
7245character in S, i.e. upper- and titlecase characters may only\n\
7246follow uncased characters and lowercase characters only cased ones.\n\
7247Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007248
7249static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007250unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007251{
7252 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7253 register const Py_UNICODE *e;
7254 int cased, previous_is_cased;
7255
Guido van Rossumd57fd912000-03-10 22:53:23 +00007256 /* Shortcut for single character strings */
7257 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007258 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7259 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007260
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007261 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007262 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007263 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007264
Guido van Rossumd57fd912000-03-10 22:53:23 +00007265 e = p + PyUnicode_GET_SIZE(self);
7266 cased = 0;
7267 previous_is_cased = 0;
7268 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007269 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007270
Benjamin Peterson29060642009-01-31 22:14:21 +00007271 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7272 if (previous_is_cased)
7273 return PyBool_FromLong(0);
7274 previous_is_cased = 1;
7275 cased = 1;
7276 }
7277 else if (Py_UNICODE_ISLOWER(ch)) {
7278 if (!previous_is_cased)
7279 return PyBool_FromLong(0);
7280 previous_is_cased = 1;
7281 cased = 1;
7282 }
7283 else
7284 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007285 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007286 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007287}
7288
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007289PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007290 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007291\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007292Return True if all characters in S are whitespace\n\
7293and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007294
7295static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007296unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007297{
7298 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7299 register const Py_UNICODE *e;
7300
Guido van Rossumd57fd912000-03-10 22:53:23 +00007301 /* Shortcut for single character strings */
7302 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007303 Py_UNICODE_ISSPACE(*p))
7304 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007305
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007306 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007307 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007308 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007309
Guido van Rossumd57fd912000-03-10 22:53:23 +00007310 e = p + PyUnicode_GET_SIZE(self);
7311 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007312 if (!Py_UNICODE_ISSPACE(*p))
7313 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007314 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007315 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007316}
7317
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007318PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007319 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007320\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007321Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007322and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007323
7324static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007325unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007326{
7327 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7328 register const Py_UNICODE *e;
7329
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007330 /* Shortcut for single character strings */
7331 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007332 Py_UNICODE_ISALPHA(*p))
7333 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007334
7335 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007336 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007337 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007338
7339 e = p + PyUnicode_GET_SIZE(self);
7340 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007341 if (!Py_UNICODE_ISALPHA(*p))
7342 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007343 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007344 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007345}
7346
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007347PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007348 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007349\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007350Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007351and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007352
7353static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007354unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007355{
7356 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7357 register const Py_UNICODE *e;
7358
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007359 /* Shortcut for single character strings */
7360 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007361 Py_UNICODE_ISALNUM(*p))
7362 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007363
7364 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007365 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007366 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007367
7368 e = p + PyUnicode_GET_SIZE(self);
7369 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007370 if (!Py_UNICODE_ISALNUM(*p))
7371 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007372 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007373 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007374}
7375
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007376PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007377 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007378\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007379Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007380False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007381
7382static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007383unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007384{
7385 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7386 register const Py_UNICODE *e;
7387
Guido van Rossumd57fd912000-03-10 22:53:23 +00007388 /* Shortcut for single character strings */
7389 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007390 Py_UNICODE_ISDECIMAL(*p))
7391 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007392
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007393 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007394 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007395 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007396
Guido van Rossumd57fd912000-03-10 22:53:23 +00007397 e = p + PyUnicode_GET_SIZE(self);
7398 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007399 if (!Py_UNICODE_ISDECIMAL(*p))
7400 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007401 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007402 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007403}
7404
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007405PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007406 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007407\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007408Return True if all characters in S are digits\n\
7409and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007410
7411static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007412unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007413{
7414 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7415 register const Py_UNICODE *e;
7416
Guido van Rossumd57fd912000-03-10 22:53:23 +00007417 /* Shortcut for single character strings */
7418 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007419 Py_UNICODE_ISDIGIT(*p))
7420 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007421
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007422 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007423 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007424 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007425
Guido van Rossumd57fd912000-03-10 22:53:23 +00007426 e = p + PyUnicode_GET_SIZE(self);
7427 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007428 if (!Py_UNICODE_ISDIGIT(*p))
7429 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007430 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007431 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007432}
7433
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007434PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007435 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007436\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007437Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007438False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007439
7440static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007441unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007442{
7443 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7444 register const Py_UNICODE *e;
7445
Guido van Rossumd57fd912000-03-10 22:53:23 +00007446 /* Shortcut for single character strings */
7447 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007448 Py_UNICODE_ISNUMERIC(*p))
7449 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007450
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007451 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007452 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007453 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007454
Guido van Rossumd57fd912000-03-10 22:53:23 +00007455 e = p + PyUnicode_GET_SIZE(self);
7456 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007457 if (!Py_UNICODE_ISNUMERIC(*p))
7458 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007459 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007460 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007461}
7462
Martin v. Löwis47383402007-08-15 07:32:56 +00007463int
7464PyUnicode_IsIdentifier(PyObject *self)
7465{
7466 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7467 register const Py_UNICODE *e;
7468
7469 /* Special case for empty strings */
7470 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007471 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007472
7473 /* PEP 3131 says that the first character must be in
7474 XID_Start and subsequent characters in XID_Continue,
7475 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00007476 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00007477 letters, digits, underscore). However, given the current
7478 definition of XID_Start and XID_Continue, it is sufficient
7479 to check just for these, except that _ must be allowed
7480 as starting an identifier. */
7481 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7482 return 0;
7483
7484 e = p + PyUnicode_GET_SIZE(self);
7485 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007486 if (!_PyUnicode_IsXidContinue(*p))
7487 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007488 }
7489 return 1;
7490}
7491
7492PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007493 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00007494\n\
7495Return True if S is a valid identifier according\n\
7496to the language definition.");
7497
7498static PyObject*
7499unicode_isidentifier(PyObject *self)
7500{
7501 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7502}
7503
Georg Brandl559e5d72008-06-11 18:37:52 +00007504PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007505 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00007506\n\
7507Return True if all characters in S are considered\n\
7508printable in repr() or S is empty, False otherwise.");
7509
7510static PyObject*
7511unicode_isprintable(PyObject *self)
7512{
7513 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7514 register const Py_UNICODE *e;
7515
7516 /* Shortcut for single character strings */
7517 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7518 Py_RETURN_TRUE;
7519 }
7520
7521 e = p + PyUnicode_GET_SIZE(self);
7522 for (; p < e; p++) {
7523 if (!Py_UNICODE_ISPRINTABLE(*p)) {
7524 Py_RETURN_FALSE;
7525 }
7526 }
7527 Py_RETURN_TRUE;
7528}
7529
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007530PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +00007531 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007532\n\
7533Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +00007534iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007535
7536static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007537unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007538{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007539 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007540}
7541
Martin v. Löwis18e16552006-02-15 17:27:45 +00007542static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007543unicode_length(PyUnicodeObject *self)
7544{
7545 return self->length;
7546}
7547
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007548PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007549 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007550\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007551Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007552done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007553
7554static PyObject *
7555unicode_ljust(PyUnicodeObject *self, PyObject *args)
7556{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007557 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007558 Py_UNICODE fillchar = ' ';
7559
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007560 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007561 return NULL;
7562
Tim Peters7a29bd52001-09-12 03:03:31 +00007563 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007564 Py_INCREF(self);
7565 return (PyObject*) self;
7566 }
7567
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007568 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007569}
7570
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007571PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007572 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007573\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007574Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007575
7576static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007577unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007578{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007579 return fixup(self, fixlower);
7580}
7581
/* Selector values telling the strip helpers which end(s) to trim. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the selector values above. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for selector i: the format string minus its "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
7590
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007591/* externally visible for str.strip(unicode) */
7592PyObject *
7593_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7594{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007595 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7596 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7597 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7598 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7599 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007600
Benjamin Peterson29060642009-01-31 22:14:21 +00007601 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007602
Benjamin Peterson14339b62009-01-31 16:36:08 +00007603 i = 0;
7604 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007605 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7606 i++;
7607 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007608 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007609
Benjamin Peterson14339b62009-01-31 16:36:08 +00007610 j = len;
7611 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007612 do {
7613 j--;
7614 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7615 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007616 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007617
Benjamin Peterson14339b62009-01-31 16:36:08 +00007618 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007619 Py_INCREF(self);
7620 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007621 }
7622 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007623 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007624}
7625
Guido van Rossumd57fd912000-03-10 22:53:23 +00007626
7627static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007628do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007629{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007630 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7631 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007632
Benjamin Peterson14339b62009-01-31 16:36:08 +00007633 i = 0;
7634 if (striptype != RIGHTSTRIP) {
7635 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7636 i++;
7637 }
7638 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007639
Benjamin Peterson14339b62009-01-31 16:36:08 +00007640 j = len;
7641 if (striptype != LEFTSTRIP) {
7642 do {
7643 j--;
7644 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7645 j++;
7646 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007647
Benjamin Peterson14339b62009-01-31 16:36:08 +00007648 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7649 Py_INCREF(self);
7650 return (PyObject*)self;
7651 }
7652 else
7653 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007654}
7655
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007656
7657static PyObject *
7658do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7659{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007660 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007661
Benjamin Peterson14339b62009-01-31 16:36:08 +00007662 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7663 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007664
Benjamin Peterson14339b62009-01-31 16:36:08 +00007665 if (sep != NULL && sep != Py_None) {
7666 if (PyUnicode_Check(sep))
7667 return _PyUnicode_XStrip(self, striptype, sep);
7668 else {
7669 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007670 "%s arg must be None or str",
7671 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00007672 return NULL;
7673 }
7674 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007675
Benjamin Peterson14339b62009-01-31 16:36:08 +00007676 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007677}
7678
7679
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007680PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007681 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007682\n\
7683Return a copy of the string S with leading and trailing\n\
7684whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007685If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007686
7687static PyObject *
7688unicode_strip(PyUnicodeObject *self, PyObject *args)
7689{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007690 if (PyTuple_GET_SIZE(args) == 0)
7691 return do_strip(self, BOTHSTRIP); /* Common case */
7692 else
7693 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007694}
7695
7696
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007697PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007698 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007699\n\
7700Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007701If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007702
7703static PyObject *
7704unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7705{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007706 if (PyTuple_GET_SIZE(args) == 0)
7707 return do_strip(self, LEFTSTRIP); /* Common case */
7708 else
7709 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007710}
7711
7712
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007713PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007714 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007715\n\
7716Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007717If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007718
7719static PyObject *
7720unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7721{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007722 if (PyTuple_GET_SIZE(args) == 0)
7723 return do_strip(self, RIGHTSTRIP); /* Common case */
7724 else
7725 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007726}
7727
7728
Guido van Rossumd57fd912000-03-10 22:53:23 +00007729static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007730unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007731{
7732 PyUnicodeObject *u;
7733 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007734 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007735 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007736
Georg Brandl222de0f2009-04-12 12:01:50 +00007737 if (len < 1) {
7738 Py_INCREF(unicode_empty);
7739 return (PyObject *)unicode_empty;
7740 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007741
Tim Peters7a29bd52001-09-12 03:03:31 +00007742 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007743 /* no repeat, return original string */
7744 Py_INCREF(str);
7745 return (PyObject*) str;
7746 }
Tim Peters8f422462000-09-09 06:13:41 +00007747
7748 /* ensure # of chars needed doesn't overflow int and # of bytes
7749 * needed doesn't overflow size_t
7750 */
7751 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00007752 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00007753 PyErr_SetString(PyExc_OverflowError,
7754 "repeated string is too long");
7755 return NULL;
7756 }
7757 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7758 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7759 PyErr_SetString(PyExc_OverflowError,
7760 "repeated string is too long");
7761 return NULL;
7762 }
7763 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007764 if (!u)
7765 return NULL;
7766
7767 p = u->str;
7768
Georg Brandl222de0f2009-04-12 12:01:50 +00007769 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007770 Py_UNICODE_FILL(p, str->str[0], len);
7771 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00007772 Py_ssize_t done = str->length; /* number of characters copied this far */
7773 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00007774 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00007775 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007776 Py_UNICODE_COPY(p+done, p, n);
7777 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00007778 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007779 }
7780
7781 return (PyObject*) u;
7782}
7783
7784PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007785 PyObject *subobj,
7786 PyObject *replobj,
7787 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007788{
7789 PyObject *self;
7790 PyObject *str1;
7791 PyObject *str2;
7792 PyObject *result;
7793
7794 self = PyUnicode_FromObject(obj);
7795 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007796 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007797 str1 = PyUnicode_FromObject(subobj);
7798 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007799 Py_DECREF(self);
7800 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007801 }
7802 str2 = PyUnicode_FromObject(replobj);
7803 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007804 Py_DECREF(self);
7805 Py_DECREF(str1);
7806 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007807 }
Tim Petersced69f82003-09-16 20:30:58 +00007808 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00007809 (PyUnicodeObject *)str1,
7810 (PyUnicodeObject *)str2,
7811 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007812 Py_DECREF(self);
7813 Py_DECREF(str1);
7814 Py_DECREF(str2);
7815 return result;
7816}
7817
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007818PyDoc_STRVAR(replace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007819 "S.replace (old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007820\n\
7821Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00007822old replaced by new. If the optional argument count is\n\
7823given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007824
7825static PyObject*
7826unicode_replace(PyUnicodeObject *self, PyObject *args)
7827{
7828 PyUnicodeObject *str1;
7829 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007830 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007831 PyObject *result;
7832
Martin v. Löwis18e16552006-02-15 17:27:45 +00007833 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007834 return NULL;
7835 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7836 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007837 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007838 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007839 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007840 Py_DECREF(str1);
7841 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007842 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007843
7844 result = replace(self, str1, str2, maxcount);
7845
7846 Py_DECREF(str1);
7847 Py_DECREF(str2);
7848 return result;
7849}
7850
7851static
7852PyObject *unicode_repr(PyObject *unicode)
7853{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007854 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007855 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007856 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7857 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7858
7859 /* XXX(nnorwitz): rather than over-allocating, it would be
7860 better to choose a different scheme. Perhaps scan the
7861 first N-chars of the string and allocate based on that size.
7862 */
7863 /* Initial allocation is based on the longest-possible unichr
7864 escape.
7865
7866 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7867 unichr, so in this case it's the longest unichr escape. In
7868 narrow (UTF-16) builds this is five chars per source unichr
7869 since there are two unichrs in the surrogate pair, so in narrow
7870 (UTF-16) builds it's not the longest unichr escape.
7871
7872 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7873 so in the narrow (UTF-16) build case it's the longest unichr
7874 escape.
7875 */
7876
Walter Dörwald1ab83302007-05-18 17:15:44 +00007877 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00007878 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00007879#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00007880 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00007881#else
Benjamin Peterson29060642009-01-31 22:14:21 +00007882 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00007883#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00007884 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007885 if (repr == NULL)
7886 return NULL;
7887
Walter Dörwald1ab83302007-05-18 17:15:44 +00007888 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007889
7890 /* Add quote */
7891 *p++ = (findchar(s, size, '\'') &&
7892 !findchar(s, size, '"')) ? '"' : '\'';
7893 while (size-- > 0) {
7894 Py_UNICODE ch = *s++;
7895
7896 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007897 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007898 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007899 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007900 continue;
7901 }
7902
Benjamin Peterson29060642009-01-31 22:14:21 +00007903 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00007904 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007905 *p++ = '\\';
7906 *p++ = 't';
7907 }
7908 else if (ch == '\n') {
7909 *p++ = '\\';
7910 *p++ = 'n';
7911 }
7912 else if (ch == '\r') {
7913 *p++ = '\\';
7914 *p++ = 'r';
7915 }
7916
7917 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00007918 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007919 *p++ = '\\';
7920 *p++ = 'x';
7921 *p++ = hexdigits[(ch >> 4) & 0x000F];
7922 *p++ = hexdigits[ch & 0x000F];
7923 }
7924
Georg Brandl559e5d72008-06-11 18:37:52 +00007925 /* Copy ASCII characters as-is */
7926 else if (ch < 0x7F) {
7927 *p++ = ch;
7928 }
7929
Benjamin Peterson29060642009-01-31 22:14:21 +00007930 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00007931 else {
7932 Py_UCS4 ucs = ch;
7933
7934#ifndef Py_UNICODE_WIDE
7935 Py_UNICODE ch2 = 0;
7936 /* Get code point from surrogate pair */
7937 if (size > 0) {
7938 ch2 = *s;
7939 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00007940 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007941 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00007942 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007943 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00007944 size--;
7945 }
7946 }
7947#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00007948 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00007949 (categories Z* and C* except ASCII space)
7950 */
7951 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
7952 /* Map 8-bit characters to '\xhh' */
7953 if (ucs <= 0xff) {
7954 *p++ = '\\';
7955 *p++ = 'x';
7956 *p++ = hexdigits[(ch >> 4) & 0x000F];
7957 *p++ = hexdigits[ch & 0x000F];
7958 }
7959 /* Map 21-bit characters to '\U00xxxxxx' */
7960 else if (ucs >= 0x10000) {
7961 *p++ = '\\';
7962 *p++ = 'U';
7963 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7964 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7965 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7966 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7967 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7968 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7969 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7970 *p++ = hexdigits[ucs & 0x0000000F];
7971 }
7972 /* Map 16-bit characters to '\uxxxx' */
7973 else {
7974 *p++ = '\\';
7975 *p++ = 'u';
7976 *p++ = hexdigits[(ucs >> 12) & 0x000F];
7977 *p++ = hexdigits[(ucs >> 8) & 0x000F];
7978 *p++ = hexdigits[(ucs >> 4) & 0x000F];
7979 *p++ = hexdigits[ucs & 0x000F];
7980 }
7981 }
7982 /* Copy characters as-is */
7983 else {
7984 *p++ = ch;
7985#ifndef Py_UNICODE_WIDE
7986 if (ucs >= 0x10000)
7987 *p++ = ch2;
7988#endif
7989 }
7990 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00007991 }
7992 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007993 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00007994
7995 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00007996 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00007997 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007998}
7999
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008000PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008001 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008002\n\
8003Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00008004such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008005arguments start and end are interpreted as in slice notation.\n\
8006\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008007Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008008
8009static PyObject *
8010unicode_rfind(PyUnicodeObject *self, PyObject *args)
8011{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008012 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008013 Py_ssize_t start;
8014 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008015 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008016
Christian Heimes9cd17752007-11-18 19:35:23 +00008017 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008018 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008019
Thomas Wouters477c8d52006-05-27 19:21:47 +00008020 result = stringlib_rfind_slice(
8021 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8022 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8023 start, end
8024 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008025
8026 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008027
Christian Heimes217cfd12007-12-02 14:31:20 +00008028 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008029}
8030
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008031PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008032 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008033\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008034Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008035
8036static PyObject *
8037unicode_rindex(PyUnicodeObject *self, PyObject *args)
8038{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008039 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008040 Py_ssize_t start;
8041 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008042 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008043
Christian Heimes9cd17752007-11-18 19:35:23 +00008044 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008045 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008046
Thomas Wouters477c8d52006-05-27 19:21:47 +00008047 result = stringlib_rfind_slice(
8048 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8049 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8050 start, end
8051 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008052
8053 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008054
Guido van Rossumd57fd912000-03-10 22:53:23 +00008055 if (result < 0) {
8056 PyErr_SetString(PyExc_ValueError, "substring not found");
8057 return NULL;
8058 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008059 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008060}
8061
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008062PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008063 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008064\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008065Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008066done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008067
8068static PyObject *
8069unicode_rjust(PyUnicodeObject *self, PyObject *args)
8070{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008071 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008072 Py_UNICODE fillchar = ' ';
8073
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008074 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008075 return NULL;
8076
Tim Peters7a29bd52001-09-12 03:03:31 +00008077 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008078 Py_INCREF(self);
8079 return (PyObject*) self;
8080 }
8081
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008082 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008083}
8084
Guido van Rossumd57fd912000-03-10 22:53:23 +00008085PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008086 PyObject *sep,
8087 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008088{
8089 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008090
Guido van Rossumd57fd912000-03-10 22:53:23 +00008091 s = PyUnicode_FromObject(s);
8092 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008093 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008094 if (sep != NULL) {
8095 sep = PyUnicode_FromObject(sep);
8096 if (sep == NULL) {
8097 Py_DECREF(s);
8098 return NULL;
8099 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008100 }
8101
8102 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8103
8104 Py_DECREF(s);
8105 Py_XDECREF(sep);
8106 return result;
8107}
8108
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008109PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008110 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008111\n\
8112Return a list of the words in S, using sep as the\n\
8113delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008114splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008115whitespace string is a separator and empty strings are\n\
8116removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008117
8118static PyObject*
8119unicode_split(PyUnicodeObject *self, PyObject *args)
8120{
8121 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008122 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008123
Martin v. Löwis18e16552006-02-15 17:27:45 +00008124 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008125 return NULL;
8126
8127 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008128 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008129 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008130 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008131 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008132 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008133}
8134
Thomas Wouters477c8d52006-05-27 19:21:47 +00008135PyObject *
8136PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8137{
8138 PyObject* str_obj;
8139 PyObject* sep_obj;
8140 PyObject* out;
8141
8142 str_obj = PyUnicode_FromObject(str_in);
8143 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008144 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008145 sep_obj = PyUnicode_FromObject(sep_in);
8146 if (!sep_obj) {
8147 Py_DECREF(str_obj);
8148 return NULL;
8149 }
8150
8151 out = stringlib_partition(
8152 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8153 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8154 );
8155
8156 Py_DECREF(sep_obj);
8157 Py_DECREF(str_obj);
8158
8159 return out;
8160}
8161
8162
8163PyObject *
8164PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8165{
8166 PyObject* str_obj;
8167 PyObject* sep_obj;
8168 PyObject* out;
8169
8170 str_obj = PyUnicode_FromObject(str_in);
8171 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008172 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008173 sep_obj = PyUnicode_FromObject(sep_in);
8174 if (!sep_obj) {
8175 Py_DECREF(str_obj);
8176 return NULL;
8177 }
8178
8179 out = stringlib_rpartition(
8180 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8181 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8182 );
8183
8184 Py_DECREF(sep_obj);
8185 Py_DECREF(str_obj);
8186
8187 return out;
8188}
8189
8190PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008191 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008192\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008193Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008194the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008195found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008196
8197static PyObject*
8198unicode_partition(PyUnicodeObject *self, PyObject *separator)
8199{
8200 return PyUnicode_Partition((PyObject *)self, separator);
8201}
8202
8203PyDoc_STRVAR(rpartition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008204 "S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008205\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008206Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008207the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008208separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008209
8210static PyObject*
8211unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8212{
8213 return PyUnicode_RPartition((PyObject *)self, separator);
8214}
8215
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008216PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008217 PyObject *sep,
8218 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008219{
8220 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008221
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008222 s = PyUnicode_FromObject(s);
8223 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008224 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008225 if (sep != NULL) {
8226 sep = PyUnicode_FromObject(sep);
8227 if (sep == NULL) {
8228 Py_DECREF(s);
8229 return NULL;
8230 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008231 }
8232
8233 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8234
8235 Py_DECREF(s);
8236 Py_XDECREF(sep);
8237 return result;
8238}
8239
8240PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008241 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008242\n\
8243Return a list of the words in S, using sep as the\n\
8244delimiter string, starting at the end of the string and\n\
8245working to the front. If maxsplit is given, at most maxsplit\n\
8246splits are done. If sep is not specified, any whitespace string\n\
8247is a separator.");
8248
8249static PyObject*
8250unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8251{
8252 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008253 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008254
Martin v. Löwis18e16552006-02-15 17:27:45 +00008255 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008256 return NULL;
8257
8258 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008259 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008260 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008261 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008262 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008263 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008264}
8265
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008266PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008267 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008268\n\
8269Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008270Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008271is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008272
8273static PyObject*
8274unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8275{
Guido van Rossum86662912000-04-11 15:38:46 +00008276 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008277
Guido van Rossum86662912000-04-11 15:38:46 +00008278 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008279 return NULL;
8280
Guido van Rossum86662912000-04-11 15:38:46 +00008281 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008282}
8283
8284static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008285PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008286{
Walter Dörwald346737f2007-05-31 10:44:43 +00008287 if (PyUnicode_CheckExact(self)) {
8288 Py_INCREF(self);
8289 return self;
8290 } else
8291 /* Subtype -- return genuine unicode string with the same value. */
8292 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8293 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008294}
8295
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008296PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008297 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008298\n\
8299Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008300and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008301
8302static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008303unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008304{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008305 return fixup(self, fixswapcase);
8306}
8307
Georg Brandlceee0772007-11-27 23:48:05 +00008308PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008309 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008310\n\
8311Return a translation table usable for str.translate().\n\
8312If there is only one argument, it must be a dictionary mapping Unicode\n\
8313ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008314Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008315If there are two arguments, they must be strings of equal length, and\n\
8316in the resulting dictionary, each character in x will be mapped to the\n\
8317character at the same position in y. If there is a third argument, it\n\
8318must be a string, whose characters will be mapped to None in the result.");
8319
8320static PyObject*
8321unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8322{
8323 PyObject *x, *y = NULL, *z = NULL;
8324 PyObject *new = NULL, *key, *value;
8325 Py_ssize_t i = 0;
8326 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008327
Georg Brandlceee0772007-11-27 23:48:05 +00008328 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8329 return NULL;
8330 new = PyDict_New();
8331 if (!new)
8332 return NULL;
8333 if (y != NULL) {
8334 /* x must be a string too, of equal length */
8335 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8336 if (!PyUnicode_Check(x)) {
8337 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8338 "be a string if there is a second argument");
8339 goto err;
8340 }
8341 if (PyUnicode_GET_SIZE(x) != ylen) {
8342 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8343 "arguments must have equal length");
8344 goto err;
8345 }
8346 /* create entries for translating chars in x to those in y */
8347 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008348 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8349 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008350 if (!key || !value)
8351 goto err;
8352 res = PyDict_SetItem(new, key, value);
8353 Py_DECREF(key);
8354 Py_DECREF(value);
8355 if (res < 0)
8356 goto err;
8357 }
8358 /* create entries for deleting chars in z */
8359 if (z != NULL) {
8360 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008361 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008362 if (!key)
8363 goto err;
8364 res = PyDict_SetItem(new, key, Py_None);
8365 Py_DECREF(key);
8366 if (res < 0)
8367 goto err;
8368 }
8369 }
8370 } else {
8371 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008372 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008373 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8374 "to maketrans it must be a dict");
8375 goto err;
8376 }
8377 /* copy entries into the new dict, converting string keys to int keys */
8378 while (PyDict_Next(x, &i, &key, &value)) {
8379 if (PyUnicode_Check(key)) {
8380 /* convert string keys to integer keys */
8381 PyObject *newkey;
8382 if (PyUnicode_GET_SIZE(key) != 1) {
8383 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8384 "table must be of length 1");
8385 goto err;
8386 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008387 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008388 if (!newkey)
8389 goto err;
8390 res = PyDict_SetItem(new, newkey, value);
8391 Py_DECREF(newkey);
8392 if (res < 0)
8393 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008394 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008395 /* just keep integer keys */
8396 if (PyDict_SetItem(new, key, value) < 0)
8397 goto err;
8398 } else {
8399 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8400 "be strings or integers");
8401 goto err;
8402 }
8403 }
8404 }
8405 return new;
8406 err:
8407 Py_DECREF(new);
8408 return NULL;
8409}
8410
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008411PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008412 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008413\n\
8414Return a copy of the string S, where all characters have been mapped\n\
8415through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008416Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008417Unmapped characters are left untouched. Characters mapped to None\n\
8418are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008419
8420static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008421unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008422{
Georg Brandlceee0772007-11-27 23:48:05 +00008423 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008424}
8425
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008426PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008427 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008428\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008429Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008430
8431static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008432unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008433{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008434 return fixup(self, fixupper);
8435}
8436
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008437PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008438 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008439\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008440Pad a numeric string S with zeros on the left, to fill a field\n\
8441of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008442
8443static PyObject *
8444unicode_zfill(PyUnicodeObject *self, PyObject *args)
8445{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008446 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008447 PyUnicodeObject *u;
8448
Martin v. Löwis18e16552006-02-15 17:27:45 +00008449 Py_ssize_t width;
8450 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008451 return NULL;
8452
8453 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008454 if (PyUnicode_CheckExact(self)) {
8455 Py_INCREF(self);
8456 return (PyObject*) self;
8457 }
8458 else
8459 return PyUnicode_FromUnicode(
8460 PyUnicode_AS_UNICODE(self),
8461 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00008462 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008463 }
8464
8465 fill = width - self->length;
8466
8467 u = pad(self, fill, 0, '0');
8468
Walter Dörwald068325e2002-04-15 13:36:47 +00008469 if (u == NULL)
8470 return NULL;
8471
Guido van Rossumd57fd912000-03-10 22:53:23 +00008472 if (u->str[fill] == '+' || u->str[fill] == '-') {
8473 /* move sign to beginning of string */
8474 u->str[0] = u->str[fill];
8475 u->str[fill] = '0';
8476 }
8477
8478 return (PyObject*) u;
8479}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008480
#if 0
/* Compiled out.  Returns the current value of numfree as a Python int. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    return PyLong_FromLong(numfree);
}
#endif
8488
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008489PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008490 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008491\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008492Return True if S starts with the specified prefix, False otherwise.\n\
8493With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008494With optional end, stop comparing S at that position.\n\
8495prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008496
8497static PyObject *
8498unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008499 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008500{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008501 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008502 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008503 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008504 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008505 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008506
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008507 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008508 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8509 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008510 if (PyTuple_Check(subobj)) {
8511 Py_ssize_t i;
8512 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8513 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008514 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008515 if (substring == NULL)
8516 return NULL;
8517 result = tailmatch(self, substring, start, end, -1);
8518 Py_DECREF(substring);
8519 if (result) {
8520 Py_RETURN_TRUE;
8521 }
8522 }
8523 /* nothing matched */
8524 Py_RETURN_FALSE;
8525 }
8526 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008527 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008528 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008529 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008530 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008531 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008532}
8533
8534
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008535PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008536 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008537\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008538Return True if S ends with the specified suffix, False otherwise.\n\
8539With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008540With optional end, stop comparing S at that position.\n\
8541suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008542
8543static PyObject *
8544unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008545 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008546{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008547 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008548 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008549 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008550 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008551 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008552
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008553 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008554 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8555 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008556 if (PyTuple_Check(subobj)) {
8557 Py_ssize_t i;
8558 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8559 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008560 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008561 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008562 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008563 result = tailmatch(self, substring, start, end, +1);
8564 Py_DECREF(substring);
8565 if (result) {
8566 Py_RETURN_TRUE;
8567 }
8568 }
8569 Py_RETURN_FALSE;
8570 }
8571 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008572 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008573 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008574
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008575 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008576 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008577 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008578}
8579
Eric Smith8c663262007-08-25 02:26:07 +00008580#include "stringlib/string_format.h"
8581
8582PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008583 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008584\n\
8585");
8586
Eric Smith4a7d76d2008-05-30 18:10:19 +00008587static PyObject *
8588unicode__format__(PyObject* self, PyObject* args)
8589{
8590 PyObject *format_spec;
8591
8592 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8593 return NULL;
8594
8595 return _PyUnicode_FormatAdvanced(self,
8596 PyUnicode_AS_UNICODE(format_spec),
8597 PyUnicode_GET_SIZE(format_spec));
8598}
8599
Eric Smith8c663262007-08-25 02:26:07 +00008600PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008601 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008602\n\
8603");
8604
8605static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008606unicode__sizeof__(PyUnicodeObject *v)
8607{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00008608 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8609 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008610}
8611
8612PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008613 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008614
8615static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008616unicode_getnewargs(PyUnicodeObject *v)
8617{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008618 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008619}
8620
8621
Guido van Rossumd57fd912000-03-10 22:53:23 +00008622static PyMethodDef unicode_methods[] = {
8623
8624 /* Order is according to common usage: often used methods should
8625 appear first, since lookup is done sequentially. */
8626
Benjamin Peterson308d6372009-09-18 21:42:35 +00008627 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008628 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8629 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008630 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008631 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8632 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8633 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8634 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8635 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8636 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8637 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008638 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008639 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8640 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8641 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008642 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008643 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8644 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8645 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008646 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008647 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008648 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008649 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008650 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8651 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8652 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8653 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8654 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8655 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8656 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8657 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8658 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8659 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8660 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8661 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8662 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8663 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008664 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00008665 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008666 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008667 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00008668 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008669 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8670 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Georg Brandlceee0772007-11-27 23:48:05 +00008671 {"maketrans", (PyCFunction) unicode_maketrans,
8672 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008673 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008674#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008675 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008676#endif
8677
8678#if 0
8679 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008680 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008681#endif
8682
Benjamin Peterson14339b62009-01-31 16:36:08 +00008683 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008684 {NULL, NULL}
8685};
8686
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008687static PyObject *
8688unicode_mod(PyObject *v, PyObject *w)
8689{
Benjamin Peterson29060642009-01-31 22:14:21 +00008690 if (!PyUnicode_Check(v)) {
8691 Py_INCREF(Py_NotImplemented);
8692 return Py_NotImplemented;
8693 }
8694 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008695}
8696
8697static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008698 0, /*nb_add*/
8699 0, /*nb_subtract*/
8700 0, /*nb_multiply*/
8701 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008702};
8703
Guido van Rossumd57fd912000-03-10 22:53:23 +00008704static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008705 (lenfunc) unicode_length, /* sq_length */
8706 PyUnicode_Concat, /* sq_concat */
8707 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8708 (ssizeargfunc) unicode_getitem, /* sq_item */
8709 0, /* sq_slice */
8710 0, /* sq_ass_item */
8711 0, /* sq_ass_slice */
8712 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008713};
8714
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008715static PyObject*
8716unicode_subscript(PyUnicodeObject* self, PyObject* item)
8717{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008718 if (PyIndex_Check(item)) {
8719 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008720 if (i == -1 && PyErr_Occurred())
8721 return NULL;
8722 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008723 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008724 return unicode_getitem(self, i);
8725 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008726 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008727 Py_UNICODE* source_buf;
8728 Py_UNICODE* result_buf;
8729 PyObject* result;
8730
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008731 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00008732 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008733 return NULL;
8734 }
8735
8736 if (slicelength <= 0) {
8737 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008738 } else if (start == 0 && step == 1 && slicelength == self->length &&
8739 PyUnicode_CheckExact(self)) {
8740 Py_INCREF(self);
8741 return (PyObject *)self;
8742 } else if (step == 1) {
8743 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008744 } else {
8745 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00008746 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8747 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008748
Benjamin Peterson29060642009-01-31 22:14:21 +00008749 if (result_buf == NULL)
8750 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008751
8752 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8753 result_buf[i] = source_buf[cur];
8754 }
Tim Petersced69f82003-09-16 20:30:58 +00008755
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008756 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00008757 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008758 return result;
8759 }
8760 } else {
8761 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8762 return NULL;
8763 }
8764}
8765
8766static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008767 (lenfunc)unicode_length, /* mp_length */
8768 (binaryfunc)unicode_subscript, /* mp_subscript */
8769 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008770};
8771
Guido van Rossumd57fd912000-03-10 22:53:23 +00008772
Guido van Rossumd57fd912000-03-10 22:53:23 +00008773/* Helpers for PyUnicode_Format() */
8774
8775static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008776getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008777{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008778 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008779 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008780 (*p_argidx)++;
8781 if (arglen < 0)
8782 return args;
8783 else
8784 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008785 }
8786 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008787 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008788 return NULL;
8789}
8790
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008791/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008792
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008793static PyObject *
8794formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008795{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008796 char *p;
8797 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008798 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008799
Guido van Rossumd57fd912000-03-10 22:53:23 +00008800 x = PyFloat_AsDouble(v);
8801 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008802 return NULL;
8803
Guido van Rossumd57fd912000-03-10 22:53:23 +00008804 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008805 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00008806
Eric Smith0923d1d2009-04-16 20:16:10 +00008807 p = PyOS_double_to_string(x, type, prec,
8808 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008809 if (p == NULL)
8810 return NULL;
8811 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00008812 PyMem_Free(p);
8813 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008814}
8815
Tim Peters38fd5b62000-09-21 05:43:11 +00008816static PyObject*
8817formatlong(PyObject *val, int flags, int prec, int type)
8818{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008819 char *buf;
8820 int len;
8821 PyObject *str; /* temporary string object. */
8822 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008823
Benjamin Peterson14339b62009-01-31 16:36:08 +00008824 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
8825 if (!str)
8826 return NULL;
8827 result = PyUnicode_FromStringAndSize(buf, len);
8828 Py_DECREF(str);
8829 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008830}
8831
Guido van Rossumd57fd912000-03-10 22:53:23 +00008832static int
8833formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008834 size_t buflen,
8835 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008836{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008837 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008838 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008839 if (PyUnicode_GET_SIZE(v) == 1) {
8840 buf[0] = PyUnicode_AS_UNICODE(v)[0];
8841 buf[1] = '\0';
8842 return 1;
8843 }
8844#ifndef Py_UNICODE_WIDE
8845 if (PyUnicode_GET_SIZE(v) == 2) {
8846 /* Decode a valid surrogate pair */
8847 int c0 = PyUnicode_AS_UNICODE(v)[0];
8848 int c1 = PyUnicode_AS_UNICODE(v)[1];
8849 if (0xD800 <= c0 && c0 <= 0xDBFF &&
8850 0xDC00 <= c1 && c1 <= 0xDFFF) {
8851 buf[0] = c0;
8852 buf[1] = c1;
8853 buf[2] = '\0';
8854 return 2;
8855 }
8856 }
8857#endif
8858 goto onError;
8859 }
8860 else {
8861 /* Integer input truncated to a character */
8862 long x;
8863 x = PyLong_AsLong(v);
8864 if (x == -1 && PyErr_Occurred())
8865 goto onError;
8866
8867 if (x < 0 || x > 0x10ffff) {
8868 PyErr_SetString(PyExc_OverflowError,
8869 "%c arg not in range(0x110000)");
8870 return -1;
8871 }
8872
8873#ifndef Py_UNICODE_WIDE
8874 if (x > 0xffff) {
8875 x -= 0x10000;
8876 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
8877 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
8878 return 2;
8879 }
8880#endif
8881 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008882 buf[1] = '\0';
8883 return 1;
8884 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008885
Benjamin Peterson29060642009-01-31 22:14:21 +00008886 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008887 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008888 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008889 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008890}
8891
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008892/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008893 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008894*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008895#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008896
Guido van Rossumd57fd912000-03-10 22:53:23 +00008897PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson29060642009-01-31 22:14:21 +00008898 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008899{
8900 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008901 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008902 int args_owned = 0;
8903 PyUnicodeObject *result = NULL;
8904 PyObject *dict = NULL;
8905 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008906
Guido van Rossumd57fd912000-03-10 22:53:23 +00008907 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008908 PyErr_BadInternalCall();
8909 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008910 }
8911 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008912 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008913 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008914 fmt = PyUnicode_AS_UNICODE(uformat);
8915 fmtcnt = PyUnicode_GET_SIZE(uformat);
8916
8917 reslen = rescnt = fmtcnt + 100;
8918 result = _PyUnicode_New(reslen);
8919 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008920 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008921 res = PyUnicode_AS_UNICODE(result);
8922
8923 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008924 arglen = PyTuple_Size(args);
8925 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008926 }
8927 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008928 arglen = -1;
8929 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008930 }
Christian Heimes90aa7642007-12-19 02:45:37 +00008931 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00008932 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00008933 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008934
8935 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008936 if (*fmt != '%') {
8937 if (--rescnt < 0) {
8938 rescnt = fmtcnt + 100;
8939 reslen += rescnt;
8940 if (_PyUnicode_Resize(&result, reslen) < 0)
8941 goto onError;
8942 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8943 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008944 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008945 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008946 }
8947 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008948 /* Got a format specifier */
8949 int flags = 0;
8950 Py_ssize_t width = -1;
8951 int prec = -1;
8952 Py_UNICODE c = '\0';
8953 Py_UNICODE fill;
8954 int isnumok;
8955 PyObject *v = NULL;
8956 PyObject *temp = NULL;
8957 Py_UNICODE *pbuf;
8958 Py_UNICODE sign;
8959 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008960 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008961
Benjamin Peterson29060642009-01-31 22:14:21 +00008962 fmt++;
8963 if (*fmt == '(') {
8964 Py_UNICODE *keystart;
8965 Py_ssize_t keylen;
8966 PyObject *key;
8967 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00008968
Benjamin Peterson29060642009-01-31 22:14:21 +00008969 if (dict == NULL) {
8970 PyErr_SetString(PyExc_TypeError,
8971 "format requires a mapping");
8972 goto onError;
8973 }
8974 ++fmt;
8975 --fmtcnt;
8976 keystart = fmt;
8977 /* Skip over balanced parentheses */
8978 while (pcount > 0 && --fmtcnt >= 0) {
8979 if (*fmt == ')')
8980 --pcount;
8981 else if (*fmt == '(')
8982 ++pcount;
8983 fmt++;
8984 }
8985 keylen = fmt - keystart - 1;
8986 if (fmtcnt < 0 || pcount > 0) {
8987 PyErr_SetString(PyExc_ValueError,
8988 "incomplete format key");
8989 goto onError;
8990 }
8991#if 0
8992 /* keys are converted to strings using UTF-8 and
8993 then looked up since Python uses strings to hold
8994 variables names etc. in its namespaces and we
8995 wouldn't want to break common idioms. */
8996 key = PyUnicode_EncodeUTF8(keystart,
8997 keylen,
8998 NULL);
8999#else
9000 key = PyUnicode_FromUnicode(keystart, keylen);
9001#endif
9002 if (key == NULL)
9003 goto onError;
9004 if (args_owned) {
9005 Py_DECREF(args);
9006 args_owned = 0;
9007 }
9008 args = PyObject_GetItem(dict, key);
9009 Py_DECREF(key);
9010 if (args == NULL) {
9011 goto onError;
9012 }
9013 args_owned = 1;
9014 arglen = -1;
9015 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009016 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009017 while (--fmtcnt >= 0) {
9018 switch (c = *fmt++) {
9019 case '-': flags |= F_LJUST; continue;
9020 case '+': flags |= F_SIGN; continue;
9021 case ' ': flags |= F_BLANK; continue;
9022 case '#': flags |= F_ALT; continue;
9023 case '0': flags |= F_ZERO; continue;
9024 }
9025 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009026 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009027 if (c == '*') {
9028 v = getnextarg(args, arglen, &argidx);
9029 if (v == NULL)
9030 goto onError;
9031 if (!PyLong_Check(v)) {
9032 PyErr_SetString(PyExc_TypeError,
9033 "* wants int");
9034 goto onError;
9035 }
9036 width = PyLong_AsLong(v);
9037 if (width == -1 && PyErr_Occurred())
9038 goto onError;
9039 if (width < 0) {
9040 flags |= F_LJUST;
9041 width = -width;
9042 }
9043 if (--fmtcnt >= 0)
9044 c = *fmt++;
9045 }
9046 else if (c >= '0' && c <= '9') {
9047 width = c - '0';
9048 while (--fmtcnt >= 0) {
9049 c = *fmt++;
9050 if (c < '0' || c > '9')
9051 break;
9052 if ((width*10) / 10 != width) {
9053 PyErr_SetString(PyExc_ValueError,
9054 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009055 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009056 }
9057 width = width*10 + (c - '0');
9058 }
9059 }
9060 if (c == '.') {
9061 prec = 0;
9062 if (--fmtcnt >= 0)
9063 c = *fmt++;
9064 if (c == '*') {
9065 v = getnextarg(args, arglen, &argidx);
9066 if (v == NULL)
9067 goto onError;
9068 if (!PyLong_Check(v)) {
9069 PyErr_SetString(PyExc_TypeError,
9070 "* wants int");
9071 goto onError;
9072 }
9073 prec = PyLong_AsLong(v);
9074 if (prec == -1 && PyErr_Occurred())
9075 goto onError;
9076 if (prec < 0)
9077 prec = 0;
9078 if (--fmtcnt >= 0)
9079 c = *fmt++;
9080 }
9081 else if (c >= '0' && c <= '9') {
9082 prec = c - '0';
9083 while (--fmtcnt >= 0) {
9084 c = Py_CHARMASK(*fmt++);
9085 if (c < '0' || c > '9')
9086 break;
9087 if ((prec*10) / 10 != prec) {
9088 PyErr_SetString(PyExc_ValueError,
9089 "prec too big");
9090 goto onError;
9091 }
9092 prec = prec*10 + (c - '0');
9093 }
9094 }
9095 } /* prec */
9096 if (fmtcnt >= 0) {
9097 if (c == 'h' || c == 'l' || c == 'L') {
9098 if (--fmtcnt >= 0)
9099 c = *fmt++;
9100 }
9101 }
9102 if (fmtcnt < 0) {
9103 PyErr_SetString(PyExc_ValueError,
9104 "incomplete format");
9105 goto onError;
9106 }
9107 if (c != '%') {
9108 v = getnextarg(args, arglen, &argidx);
9109 if (v == NULL)
9110 goto onError;
9111 }
9112 sign = 0;
9113 fill = ' ';
9114 switch (c) {
9115
9116 case '%':
9117 pbuf = formatbuf;
9118 /* presume that buffer length is at least 1 */
9119 pbuf[0] = '%';
9120 len = 1;
9121 break;
9122
9123 case 's':
9124 case 'r':
9125 case 'a':
9126 if (PyUnicode_Check(v) && c == 's') {
9127 temp = v;
9128 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009129 }
9130 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009131 if (c == 's')
9132 temp = PyObject_Str(v);
9133 else if (c == 'r')
9134 temp = PyObject_Repr(v);
9135 else
9136 temp = PyObject_ASCII(v);
9137 if (temp == NULL)
9138 goto onError;
9139 if (PyUnicode_Check(temp))
9140 /* nothing to do */;
9141 else {
9142 Py_DECREF(temp);
9143 PyErr_SetString(PyExc_TypeError,
9144 "%s argument has non-string str()");
9145 goto onError;
9146 }
9147 }
9148 pbuf = PyUnicode_AS_UNICODE(temp);
9149 len = PyUnicode_GET_SIZE(temp);
9150 if (prec >= 0 && len > prec)
9151 len = prec;
9152 break;
9153
9154 case 'i':
9155 case 'd':
9156 case 'u':
9157 case 'o':
9158 case 'x':
9159 case 'X':
9160 if (c == 'i')
9161 c = 'd';
9162 isnumok = 0;
9163 if (PyNumber_Check(v)) {
9164 PyObject *iobj=NULL;
9165
9166 if (PyLong_Check(v)) {
9167 iobj = v;
9168 Py_INCREF(iobj);
9169 }
9170 else {
9171 iobj = PyNumber_Long(v);
9172 }
9173 if (iobj!=NULL) {
9174 if (PyLong_Check(iobj)) {
9175 isnumok = 1;
9176 temp = formatlong(iobj, flags, prec, c);
9177 Py_DECREF(iobj);
9178 if (!temp)
9179 goto onError;
9180 pbuf = PyUnicode_AS_UNICODE(temp);
9181 len = PyUnicode_GET_SIZE(temp);
9182 sign = 1;
9183 }
9184 else {
9185 Py_DECREF(iobj);
9186 }
9187 }
9188 }
9189 if (!isnumok) {
9190 PyErr_Format(PyExc_TypeError,
9191 "%%%c format: a number is required, "
9192 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9193 goto onError;
9194 }
9195 if (flags & F_ZERO)
9196 fill = '0';
9197 break;
9198
9199 case 'e':
9200 case 'E':
9201 case 'f':
9202 case 'F':
9203 case 'g':
9204 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009205 temp = formatfloat(v, flags, prec, c);
9206 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009207 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009208 pbuf = PyUnicode_AS_UNICODE(temp);
9209 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009210 sign = 1;
9211 if (flags & F_ZERO)
9212 fill = '0';
9213 break;
9214
9215 case 'c':
9216 pbuf = formatbuf;
9217 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9218 if (len < 0)
9219 goto onError;
9220 break;
9221
9222 default:
9223 PyErr_Format(PyExc_ValueError,
9224 "unsupported format character '%c' (0x%x) "
9225 "at index %zd",
9226 (31<=c && c<=126) ? (char)c : '?',
9227 (int)c,
9228 (Py_ssize_t)(fmt - 1 -
9229 PyUnicode_AS_UNICODE(uformat)));
9230 goto onError;
9231 }
9232 if (sign) {
9233 if (*pbuf == '-' || *pbuf == '+') {
9234 sign = *pbuf++;
9235 len--;
9236 }
9237 else if (flags & F_SIGN)
9238 sign = '+';
9239 else if (flags & F_BLANK)
9240 sign = ' ';
9241 else
9242 sign = 0;
9243 }
9244 if (width < len)
9245 width = len;
9246 if (rescnt - (sign != 0) < width) {
9247 reslen -= rescnt;
9248 rescnt = width + fmtcnt + 100;
9249 reslen += rescnt;
9250 if (reslen < 0) {
9251 Py_XDECREF(temp);
9252 PyErr_NoMemory();
9253 goto onError;
9254 }
9255 if (_PyUnicode_Resize(&result, reslen) < 0) {
9256 Py_XDECREF(temp);
9257 goto onError;
9258 }
9259 res = PyUnicode_AS_UNICODE(result)
9260 + reslen - rescnt;
9261 }
9262 if (sign) {
9263 if (fill != ' ')
9264 *res++ = sign;
9265 rescnt--;
9266 if (width > len)
9267 width--;
9268 }
9269 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9270 assert(pbuf[0] == '0');
9271 assert(pbuf[1] == c);
9272 if (fill != ' ') {
9273 *res++ = *pbuf++;
9274 *res++ = *pbuf++;
9275 }
9276 rescnt -= 2;
9277 width -= 2;
9278 if (width < 0)
9279 width = 0;
9280 len -= 2;
9281 }
9282 if (width > len && !(flags & F_LJUST)) {
9283 do {
9284 --rescnt;
9285 *res++ = fill;
9286 } while (--width > len);
9287 }
9288 if (fill == ' ') {
9289 if (sign)
9290 *res++ = sign;
9291 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9292 assert(pbuf[0] == '0');
9293 assert(pbuf[1] == c);
9294 *res++ = *pbuf++;
9295 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009296 }
9297 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009298 Py_UNICODE_COPY(res, pbuf, len);
9299 res += len;
9300 rescnt -= len;
9301 while (--width >= len) {
9302 --rescnt;
9303 *res++ = ' ';
9304 }
9305 if (dict && (argidx < arglen) && c != '%') {
9306 PyErr_SetString(PyExc_TypeError,
9307 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009308 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009309 goto onError;
9310 }
9311 Py_XDECREF(temp);
9312 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009313 } /* until end */
9314 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009315 PyErr_SetString(PyExc_TypeError,
9316 "not all arguments converted during string formatting");
9317 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009318 }
9319
Thomas Woutersa96affe2006-03-12 00:29:36 +00009320 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009321 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009322 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009323 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009324 }
9325 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009326 return (PyObject *)result;
9327
Benjamin Peterson29060642009-01-31 22:14:21 +00009328 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009329 Py_XDECREF(result);
9330 Py_DECREF(uformat);
9331 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009332 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009333 }
9334 return NULL;
9335}
9336
Jeremy Hylton938ace62002-07-17 16:30:39 +00009337static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009338unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9339
Tim Peters6d6c1a32001-08-02 04:15:00 +00009340static PyObject *
9341unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9342{
Benjamin Peterson29060642009-01-31 22:14:21 +00009343 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009344 static char *kwlist[] = {"object", "encoding", "errors", 0};
9345 char *encoding = NULL;
9346 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009347
Benjamin Peterson14339b62009-01-31 16:36:08 +00009348 if (type != &PyUnicode_Type)
9349 return unicode_subtype_new(type, args, kwds);
9350 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009351 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009352 return NULL;
9353 if (x == NULL)
9354 return (PyObject *)_PyUnicode_New(0);
9355 if (encoding == NULL && errors == NULL)
9356 return PyObject_Str(x);
9357 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009358 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009359}
9360
Guido van Rossume023fe02001-08-30 03:12:59 +00009361static PyObject *
9362unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9363{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009364 PyUnicodeObject *tmp, *pnew;
9365 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009366
Benjamin Peterson14339b62009-01-31 16:36:08 +00009367 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9368 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9369 if (tmp == NULL)
9370 return NULL;
9371 assert(PyUnicode_Check(tmp));
9372 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9373 if (pnew == NULL) {
9374 Py_DECREF(tmp);
9375 return NULL;
9376 }
9377 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9378 if (pnew->str == NULL) {
9379 _Py_ForgetReference((PyObject *)pnew);
9380 PyObject_Del(pnew);
9381 Py_DECREF(tmp);
9382 return PyErr_NoMemory();
9383 }
9384 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9385 pnew->length = n;
9386 pnew->hash = tmp->hash;
9387 Py_DECREF(tmp);
9388 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009389}
9390
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009391PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +00009392 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009393\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009394Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009395encoding defaults to the current default string encoding.\n\
9396errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009397
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009398static PyObject *unicode_iter(PyObject *seq);
9399
Guido van Rossumd57fd912000-03-10 22:53:23 +00009400PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009401 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +00009402 "str", /* tp_name */
9403 sizeof(PyUnicodeObject), /* tp_size */
9404 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009405 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009406 (destructor)unicode_dealloc, /* tp_dealloc */
9407 0, /* tp_print */
9408 0, /* tp_getattr */
9409 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009410 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009411 unicode_repr, /* tp_repr */
9412 &unicode_as_number, /* tp_as_number */
9413 &unicode_as_sequence, /* tp_as_sequence */
9414 &unicode_as_mapping, /* tp_as_mapping */
9415 (hashfunc) unicode_hash, /* tp_hash*/
9416 0, /* tp_call*/
9417 (reprfunc) unicode_str, /* tp_str */
9418 PyObject_GenericGetAttr, /* tp_getattro */
9419 0, /* tp_setattro */
9420 0, /* tp_as_buffer */
9421 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +00009422 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009423 unicode_doc, /* tp_doc */
9424 0, /* tp_traverse */
9425 0, /* tp_clear */
9426 PyUnicode_RichCompare, /* tp_richcompare */
9427 0, /* tp_weaklistoffset */
9428 unicode_iter, /* tp_iter */
9429 0, /* tp_iternext */
9430 unicode_methods, /* tp_methods */
9431 0, /* tp_members */
9432 0, /* tp_getset */
9433 &PyBaseObject_Type, /* tp_base */
9434 0, /* tp_dict */
9435 0, /* tp_descr_get */
9436 0, /* tp_descr_set */
9437 0, /* tp_dictoffset */
9438 0, /* tp_init */
9439 0, /* tp_alloc */
9440 unicode_new, /* tp_new */
9441 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009442};
9443
9444/* Initialize the Unicode implementation */
9445
Thomas Wouters78890102000-07-22 19:25:51 +00009446void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009447{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009448 int i;
9449
Thomas Wouters477c8d52006-05-27 19:21:47 +00009450 /* XXX - move this array to unicodectype.c ? */
9451 Py_UNICODE linebreak[] = {
9452 0x000A, /* LINE FEED */
9453 0x000D, /* CARRIAGE RETURN */
9454 0x001C, /* FILE SEPARATOR */
9455 0x001D, /* GROUP SEPARATOR */
9456 0x001E, /* RECORD SEPARATOR */
9457 0x0085, /* NEXT LINE */
9458 0x2028, /* LINE SEPARATOR */
9459 0x2029, /* PARAGRAPH SEPARATOR */
9460 };
9461
Fred Drakee4315f52000-05-09 19:53:39 +00009462 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009463 free_list = NULL;
9464 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009465 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009466 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00009467 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009468
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009469 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00009470 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009471 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009472 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009473
9474 /* initialize the linebreak bloom filter */
9475 bloom_linebreak = make_bloom_mask(
9476 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9477 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009478
9479 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009480}
9481
9482/* Finalize the Unicode implementation */
9483
Christian Heimesa156e092008-02-16 07:38:31 +00009484int
9485PyUnicode_ClearFreeList(void)
9486{
9487 int freelist_size = numfree;
9488 PyUnicodeObject *u;
9489
9490 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009491 PyUnicodeObject *v = u;
9492 u = *(PyUnicodeObject **)u;
9493 if (v->str)
9494 PyObject_DEL(v->str);
9495 Py_XDECREF(v->defenc);
9496 PyObject_Del(v);
9497 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +00009498 }
9499 free_list = NULL;
9500 assert(numfree == 0);
9501 return freelist_size;
9502}
9503
Guido van Rossumd57fd912000-03-10 22:53:23 +00009504void
Thomas Wouters78890102000-07-22 19:25:51 +00009505_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009506{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009507 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009508
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009509 Py_XDECREF(unicode_empty);
9510 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009511
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009512 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009513 if (unicode_latin1[i]) {
9514 Py_DECREF(unicode_latin1[i]);
9515 unicode_latin1[i] = NULL;
9516 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009517 }
Christian Heimesa156e092008-02-16 07:38:31 +00009518 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009519}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009520
Walter Dörwald16807132007-05-25 13:52:07 +00009521void
9522PyUnicode_InternInPlace(PyObject **p)
9523{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009524 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9525 PyObject *t;
9526 if (s == NULL || !PyUnicode_Check(s))
9527 Py_FatalError(
9528 "PyUnicode_InternInPlace: unicode strings only please!");
9529 /* If it's a subclass, we don't really know what putting
9530 it in the interned dict might do. */
9531 if (!PyUnicode_CheckExact(s))
9532 return;
9533 if (PyUnicode_CHECK_INTERNED(s))
9534 return;
9535 if (interned == NULL) {
9536 interned = PyDict_New();
9537 if (interned == NULL) {
9538 PyErr_Clear(); /* Don't leave an exception */
9539 return;
9540 }
9541 }
9542 /* It might be that the GetItem call fails even
9543 though the key is present in the dictionary,
9544 namely when this happens during a stack overflow. */
9545 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +00009546 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009547 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +00009548
Benjamin Peterson29060642009-01-31 22:14:21 +00009549 if (t) {
9550 Py_INCREF(t);
9551 Py_DECREF(*p);
9552 *p = t;
9553 return;
9554 }
Walter Dörwald16807132007-05-25 13:52:07 +00009555
Benjamin Peterson14339b62009-01-31 16:36:08 +00009556 PyThreadState_GET()->recursion_critical = 1;
9557 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9558 PyErr_Clear();
9559 PyThreadState_GET()->recursion_critical = 0;
9560 return;
9561 }
9562 PyThreadState_GET()->recursion_critical = 0;
9563 /* The two references in interned are not counted by refcnt.
9564 The deallocator will take care of this */
9565 Py_REFCNT(s) -= 2;
9566 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +00009567}
9568
9569void
9570PyUnicode_InternImmortal(PyObject **p)
9571{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009572 PyUnicode_InternInPlace(p);
9573 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9574 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9575 Py_INCREF(*p);
9576 }
Walter Dörwald16807132007-05-25 13:52:07 +00009577}
9578
9579PyObject *
9580PyUnicode_InternFromString(const char *cp)
9581{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009582 PyObject *s = PyUnicode_FromString(cp);
9583 if (s == NULL)
9584 return NULL;
9585 PyUnicode_InternInPlace(&s);
9586 return s;
Walter Dörwald16807132007-05-25 13:52:07 +00009587}
9588
9589void _Py_ReleaseInternedUnicodeStrings(void)
9590{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009591 PyObject *keys;
9592 PyUnicodeObject *s;
9593 Py_ssize_t i, n;
9594 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009595
Benjamin Peterson14339b62009-01-31 16:36:08 +00009596 if (interned == NULL || !PyDict_Check(interned))
9597 return;
9598 keys = PyDict_Keys(interned);
9599 if (keys == NULL || !PyList_Check(keys)) {
9600 PyErr_Clear();
9601 return;
9602 }
Walter Dörwald16807132007-05-25 13:52:07 +00009603
Benjamin Peterson14339b62009-01-31 16:36:08 +00009604 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9605 detector, interned unicode strings are not forcibly deallocated;
9606 rather, we give them their stolen references back, and then clear
9607 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +00009608
Benjamin Peterson14339b62009-01-31 16:36:08 +00009609 n = PyList_GET_SIZE(keys);
9610 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +00009611 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009612 for (i = 0; i < n; i++) {
9613 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9614 switch (s->state) {
9615 case SSTATE_NOT_INTERNED:
9616 /* XXX Shouldn't happen */
9617 break;
9618 case SSTATE_INTERNED_IMMORTAL:
9619 Py_REFCNT(s) += 1;
9620 immortal_size += s->length;
9621 break;
9622 case SSTATE_INTERNED_MORTAL:
9623 Py_REFCNT(s) += 2;
9624 mortal_size += s->length;
9625 break;
9626 default:
9627 Py_FatalError("Inconsistent interned string state.");
9628 }
9629 s->state = SSTATE_NOT_INTERNED;
9630 }
9631 fprintf(stderr, "total size of all interned strings: "
9632 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9633 "mortal/immortal\n", mortal_size, immortal_size);
9634 Py_DECREF(keys);
9635 PyDict_Clear(interned);
9636 Py_DECREF(interned);
9637 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +00009638}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009639
9640
9641/********************* Unicode Iterator **************************/
9642
9643typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009644 PyObject_HEAD
9645 Py_ssize_t it_index;
9646 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009647} unicodeiterobject;
9648
9649static void
9650unicodeiter_dealloc(unicodeiterobject *it)
9651{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009652 _PyObject_GC_UNTRACK(it);
9653 Py_XDECREF(it->it_seq);
9654 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009655}
9656
9657static int
9658unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9659{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009660 Py_VISIT(it->it_seq);
9661 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009662}
9663
9664static PyObject *
9665unicodeiter_next(unicodeiterobject *it)
9666{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009667 PyUnicodeObject *seq;
9668 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009669
Benjamin Peterson14339b62009-01-31 16:36:08 +00009670 assert(it != NULL);
9671 seq = it->it_seq;
9672 if (seq == NULL)
9673 return NULL;
9674 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009675
Benjamin Peterson14339b62009-01-31 16:36:08 +00009676 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
9677 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +00009678 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009679 if (item != NULL)
9680 ++it->it_index;
9681 return item;
9682 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009683
Benjamin Peterson14339b62009-01-31 16:36:08 +00009684 Py_DECREF(seq);
9685 it->it_seq = NULL;
9686 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009687}
9688
9689static PyObject *
9690unicodeiter_len(unicodeiterobject *it)
9691{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009692 Py_ssize_t len = 0;
9693 if (it->it_seq)
9694 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9695 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009696}
9697
9698PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9699
9700static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009701 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00009702 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +00009703 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009704};
9705
9706PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009707 PyVarObject_HEAD_INIT(&PyType_Type, 0)
9708 "str_iterator", /* tp_name */
9709 sizeof(unicodeiterobject), /* tp_basicsize */
9710 0, /* tp_itemsize */
9711 /* methods */
9712 (destructor)unicodeiter_dealloc, /* tp_dealloc */
9713 0, /* tp_print */
9714 0, /* tp_getattr */
9715 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009716 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009717 0, /* tp_repr */
9718 0, /* tp_as_number */
9719 0, /* tp_as_sequence */
9720 0, /* tp_as_mapping */
9721 0, /* tp_hash */
9722 0, /* tp_call */
9723 0, /* tp_str */
9724 PyObject_GenericGetAttr, /* tp_getattro */
9725 0, /* tp_setattro */
9726 0, /* tp_as_buffer */
9727 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9728 0, /* tp_doc */
9729 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9730 0, /* tp_clear */
9731 0, /* tp_richcompare */
9732 0, /* tp_weaklistoffset */
9733 PyObject_SelfIter, /* tp_iter */
9734 (iternextfunc)unicodeiter_next, /* tp_iternext */
9735 unicodeiter_methods, /* tp_methods */
9736 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009737};
9738
9739static PyObject *
9740unicode_iter(PyObject *seq)
9741{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009742 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009743
Benjamin Peterson14339b62009-01-31 16:36:08 +00009744 if (!PyUnicode_Check(seq)) {
9745 PyErr_BadInternalCall();
9746 return NULL;
9747 }
9748 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9749 if (it == NULL)
9750 return NULL;
9751 it->it_index = 0;
9752 Py_INCREF(seq);
9753 it->it_seq = (PyUnicodeObject *)seq;
9754 _PyObject_GC_TRACK(it);
9755 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009756}
9757
Martin v. Löwis5b222132007-06-10 09:51:05 +00009758size_t
9759Py_UNICODE_strlen(const Py_UNICODE *u)
9760{
9761 int res = 0;
9762 while(*u++)
9763 res++;
9764 return res;
9765}
9766
9767Py_UNICODE*
9768Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9769{
9770 Py_UNICODE *u = s1;
9771 while ((*u++ = *s2++));
9772 return s1;
9773}
9774
9775Py_UNICODE*
9776Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9777{
9778 Py_UNICODE *u = s1;
9779 while ((*u++ = *s2++))
9780 if (n-- == 0)
9781 break;
9782 return s1;
9783}
9784
9785int
9786Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9787{
9788 while (*s1 && *s2 && *s1 == *s2)
9789 s1++, s2++;
9790 if (*s1 && *s2)
9791 return (*s1 < *s2) ? -1 : +1;
9792 if (*s1)
9793 return 1;
9794 if (*s2)
9795 return -1;
9796 return 0;
9797}
9798
9799Py_UNICODE*
9800Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9801{
9802 const Py_UNICODE *p;
9803 for (p = s; *p; p++)
9804 if (*p == c)
9805 return (Py_UNICODE*)p;
9806 return NULL;
9807}
9808
9809
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009810#ifdef __cplusplus
9811}
9812#endif