blob: 341114f9431b9822c64b8232440dea8b90b653a7 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000049#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000050#include <windows.h>
51#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000052
Guido van Rossumd57fd912000-03-10 22:53:23 +000053/* Limit for the Unicode object free list */
54
Christian Heimes2202f872008-02-06 14:31:34 +000055#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000056
57/* Limit for the Unicode object free list stay alive optimization.
58
59 The implementation will keep allocated Unicode memory intact for
60 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000061 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Christian Heimes2202f872008-02-06 14:31:34 +000063 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000065 malloc()-overhead) bytes of unused garbage.
66
67 Setting the limit to 0 effectively turns the feature off.
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069 Note: This is an experimental feature ! If you get core dumps when
70 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
72*/
73
Guido van Rossumfd4b9572000-04-10 13:51:10 +000074#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000075
76/* Endianness switches; defaults to little endian */
77
78#ifdef WORDS_BIGENDIAN
79# define BYTEORDER_IS_BIG_ENDIAN
80#else
81# define BYTEORDER_IS_LITTLE_ENDIAN
82#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
104static PyObject *interned;
105
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000107static PyUnicodeObject *free_list;
108static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000109
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000110/* The empty Unicode object is shared to improve performance. */
111static PyUnicodeObject *unicode_empty;
112
113/* Single character Unicode strings in the Latin-1 range are being
114 shared as well. */
115static PyUnicodeObject *unicode_latin1[256];
116
Fred Drakee4315f52000-05-09 19:53:39 +0000117/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000118 parameter; it is fixed to "utf-8". Always use the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000119 PyUnicode_GetDefaultEncoding() API to access this global.
120
Alexandre Vassalotti3d2fd7f2007-10-16 00:26:33 +0000121 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000122 hard coded default!
123*/
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000124static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000125
/* Fast detection of the most frequent whitespace characters.
   One entry per ASCII code point (0x00 - 0x7F): 1 marks whitespace,
   0 marks everything else. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
     *   0x09 HORIZONTAL TABULATION
     *   0x0A LINE FEED
     *   0x0B VERTICAL TABULATION
     *   0x0C FORM FEED
     *   0x0D CARRIAGE RETURN
     */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
     *   0x1C FILE SEPARATOR
     *   0x1D GROUP SEPARATOR
     *   0x1E RECORD SEPARATOR
     *   0x1F UNIT SEPARATOR
     */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27:
     *   0x20 SPACE
     */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
156
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000157static PyObject *unicode_encode_call_errorhandler(const char *errors,
158 PyObject **errorHandler,const char *encoding, const char *reason,
159 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
160 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
161
Victor Stinner158701d2010-04-22 19:41:01 +0000162static void raise_encode_exception(PyObject **exceptionObject,
163 const char *encoding,
164 const Py_UNICODE *unicode, Py_ssize_t size,
165 Py_ssize_t startpos, Py_ssize_t endpos,
166 const char *reason);
167
/* Same for linebreaks: one entry per ASCII code point (0x00 - 0x7F),
   1 marks a line-breaking character, 0 marks everything else.
   The table is only ever read, so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
     *   0x0A LINE FEED
     *   0x0D CARRIAGE RETURN
     */
    0, 0, 1, 0, 0, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
     *   0x1C FILE SEPARATOR
     *   0x1D GROUP SEPARATOR
     *   0x1E RECORD SEPARATOR
     */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
193
194
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000195Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000196PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000197{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000198#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000199 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000200#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000201 /* This is actually an illegal character, so it should
202 not be passed to unichr. */
203 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000204#endif
205}
206
Thomas Wouters477c8d52006-05-27 19:21:47 +0000207/* --- Bloom Filters ----------------------------------------------------- */
208
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the least
   significant 5 bits of each Unicode character as the bit index. */
212
213/* the linebreak mask is set up by Unicode_Init below */
214
/* Integer type used to hold a bloom bitmask. */
#define BLOOM_MASK unsigned long

/* Bitmask used by BLOOM_LINEBREAK for characters outside the ASCII range. */
static BLOOM_MASK bloom_linebreak;

/* Non-zero when the bit selected by the low 5 bits of ch is set in mask.
   The shifted constant is unsigned long so that selecting bit 31 does not
   shift into the sign bit of an int, and mask is parenthesized so that
   compound expressions can be passed safely. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* True when ch is a line break.  Characters below 128 are answered from
   the ascii_linebreak table; all others are pre-filtered with the bloom
   mask before Py_UNICODE_ISLINEBREAK() is consulted. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000224
225Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
226{
227 /* calculate simple bloom-style bitmask for a given unicode string */
228
229 long mask;
230 Py_ssize_t i;
231
232 mask = 0;
233 for (i = 0; i < len; i++)
234 mask |= (1 << (ptr[i] & 0x1F));
235
236 return mask;
237}
238
239Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
240{
241 Py_ssize_t i;
242
243 for (i = 0; i < setlen; i++)
244 if (set[i] == chr)
245 return 1;
246
247 return 0;
248}
249
/* True when chr is a member of set (setlen characters long).  The bloom
   mask is tested first so that unicode_member() is only called for
   characters that may be present.  The expansion is fully parenthesized
   so the macro can be negated or combined with other operators. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                            \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
252
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253/* --- Unicode Object ----------------------------------------------------- */
254
255static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000256int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +0000257 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000258{
259 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000260
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000261 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000263 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000265 /* Resizing shared object (unicode_empty or single character
266 objects) in-place is not allowed. Use PyUnicode_Resize()
267 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000268
Benjamin Peterson14339b62009-01-31 16:36:08 +0000269 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000270 (unicode->length == 1 &&
271 unicode->str[0] < 256U &&
272 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000273 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000274 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000275 return -1;
276 }
277
Thomas Wouters477c8d52006-05-27 19:21:47 +0000278 /* We allocate one more byte to make sure the string is Ux0000 terminated.
279 The overallocation is also used by fastsearch, which assumes that it's
280 safe to look at str[length] (without making any assumptions about what
281 it contains). */
282
Guido van Rossumd57fd912000-03-10 22:53:23 +0000283 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000284 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000285 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000287 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 PyErr_NoMemory();
289 return -1;
290 }
291 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000292 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000293
Benjamin Peterson29060642009-01-31 22:14:21 +0000294 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000295 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000296 if (unicode->defenc) {
Georg Brandl1fa11af2010-08-01 21:03:01 +0000297 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000298 }
299 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000300
Guido van Rossumd57fd912000-03-10 22:53:23 +0000301 return 0;
302}
303
/* We allocate one more Py_UNICODE than requested to make sure the string
   is Ux0000 terminated; some code (e.g. new_identifier) relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
312
313static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000314PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000315{
316 register PyUnicodeObject *unicode;
317
Thomas Wouters477c8d52006-05-27 19:21:47 +0000318 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000319 if (length == 0 && unicode_empty != NULL) {
320 Py_INCREF(unicode_empty);
321 return unicode_empty;
322 }
323
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000324 /* Ensure we won't overflow the size. */
325 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
326 return (PyUnicodeObject *)PyErr_NoMemory();
327 }
328
Guido van Rossumd57fd912000-03-10 22:53:23 +0000329 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000330 if (free_list) {
331 unicode = free_list;
332 free_list = *(PyUnicodeObject **)unicode;
333 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000334 if (unicode->str) {
335 /* Keep-Alive optimization: we only upsize the buffer,
336 never downsize it. */
337 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000338 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000339 PyObject_DEL(unicode->str);
340 unicode->str = NULL;
341 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000342 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000343 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000344 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
345 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000346 }
347 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000348 }
349 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000350 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000351 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000352 if (unicode == NULL)
353 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000354 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
355 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000356 }
357
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000358 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000359 PyErr_NoMemory();
360 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000361 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000362 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000363 * the caller fails before initializing str -- unicode_resize()
364 * reads str[0], and the Keep-Alive optimization can keep memory
365 * allocated for str alive across a call to unicode_dealloc(unicode).
366 * We don't want unicode_resize to read uninitialized memory in
367 * that case.
368 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000369 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000370 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000371 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000372 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000373 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000374 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000375 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000376
Benjamin Peterson29060642009-01-31 22:14:21 +0000377 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000378 /* XXX UNREF/NEWREF interface should be more symmetrical */
379 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000380 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000381 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000382 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000383}
384
385static
Guido van Rossum9475a232001-10-05 20:51:39 +0000386void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000387{
Walter Dörwald16807132007-05-25 13:52:07 +0000388 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000389 case SSTATE_NOT_INTERNED:
390 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000391
Benjamin Peterson29060642009-01-31 22:14:21 +0000392 case SSTATE_INTERNED_MORTAL:
393 /* revive dead object temporarily for DelItem */
394 Py_REFCNT(unicode) = 3;
395 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
396 Py_FatalError(
397 "deletion of interned string failed");
398 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000399
Benjamin Peterson29060642009-01-31 22:14:21 +0000400 case SSTATE_INTERNED_IMMORTAL:
401 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000402
Benjamin Peterson29060642009-01-31 22:14:21 +0000403 default:
404 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000405 }
406
Guido van Rossum604ddf82001-12-06 20:03:56 +0000407 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000408 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000409 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000410 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
411 PyObject_DEL(unicode->str);
412 unicode->str = NULL;
413 unicode->length = 0;
414 }
415 if (unicode->defenc) {
Georg Brandl1fa11af2010-08-01 21:03:01 +0000416 Py_CLEAR(unicode->defenc);
Benjamin Peterson29060642009-01-31 22:14:21 +0000417 }
418 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000419 *(PyUnicodeObject **)unicode = free_list;
420 free_list = unicode;
421 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000422 }
423 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000424 PyObject_DEL(unicode->str);
425 Py_XDECREF(unicode->defenc);
426 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000427 }
428}
429
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000430static
431int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000432{
433 register PyUnicodeObject *v;
434
435 /* Argument checks */
436 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000437 PyErr_BadInternalCall();
438 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000439 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000440 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000441 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000442 PyErr_BadInternalCall();
443 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000444 }
445
446 /* Resizing unicode_empty and single character objects is not
447 possible since these are being shared. We simply return a fresh
448 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000449 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000450 (v == unicode_empty || v->length == 1)) {
451 PyUnicodeObject *w = _PyUnicode_New(length);
452 if (w == NULL)
453 return -1;
454 Py_UNICODE_COPY(w->str, v->str,
455 length < v->length ? length : v->length);
456 Py_DECREF(*unicode);
457 *unicode = w;
458 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000459 }
460
461 /* Note that we don't have to modify *unicode for unshared Unicode
462 objects, since we can modify them in-place. */
463 return unicode_resize(v, length);
464}
465
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000466int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
467{
468 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
469}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000470
Guido van Rossumd57fd912000-03-10 22:53:23 +0000471PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson29060642009-01-31 22:14:21 +0000472 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000473{
474 PyUnicodeObject *unicode;
475
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000476 /* If the Unicode data is known at construction time, we can apply
477 some optimizations which share commonly used objects. */
478 if (u != NULL) {
479
Benjamin Peterson29060642009-01-31 22:14:21 +0000480 /* Optimization for empty strings */
481 if (size == 0 && unicode_empty != NULL) {
482 Py_INCREF(unicode_empty);
483 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000484 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000485
486 /* Single character Unicode objects in the Latin-1 range are
487 shared when using this constructor */
488 if (size == 1 && *u < 256) {
489 unicode = unicode_latin1[*u];
490 if (!unicode) {
491 unicode = _PyUnicode_New(1);
492 if (!unicode)
493 return NULL;
494 unicode->str[0] = *u;
495 unicode_latin1[*u] = unicode;
496 }
497 Py_INCREF(unicode);
498 return (PyObject *)unicode;
499 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000500 }
Tim Petersced69f82003-09-16 20:30:58 +0000501
Guido van Rossumd57fd912000-03-10 22:53:23 +0000502 unicode = _PyUnicode_New(size);
503 if (!unicode)
504 return NULL;
505
506 /* Copy the Unicode data into the new object */
507 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000508 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000509
510 return (PyObject *)unicode;
511}
512
Walter Dörwaldd2034312007-05-18 16:29:38 +0000513PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000514{
515 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000516
Benjamin Peterson14339b62009-01-31 16:36:08 +0000517 if (size < 0) {
518 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000519 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000520 return NULL;
521 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000522
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000523 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000524 some optimizations which share commonly used objects.
525 Also, this means the input must be UTF-8, so fall back to the
526 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000527 if (u != NULL) {
528
Benjamin Peterson29060642009-01-31 22:14:21 +0000529 /* Optimization for empty strings */
530 if (size == 0 && unicode_empty != NULL) {
531 Py_INCREF(unicode_empty);
532 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000533 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000534
535 /* Single characters are shared when using this constructor.
536 Restrict to ASCII, since the input must be UTF-8. */
537 if (size == 1 && Py_CHARMASK(*u) < 128) {
538 unicode = unicode_latin1[Py_CHARMASK(*u)];
539 if (!unicode) {
540 unicode = _PyUnicode_New(1);
541 if (!unicode)
542 return NULL;
543 unicode->str[0] = Py_CHARMASK(*u);
544 unicode_latin1[Py_CHARMASK(*u)] = unicode;
545 }
546 Py_INCREF(unicode);
547 return (PyObject *)unicode;
548 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000549
550 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000551 }
552
Walter Dörwald55507312007-05-18 13:12:10 +0000553 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000554 if (!unicode)
555 return NULL;
556
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000557 return (PyObject *)unicode;
558}
559
Walter Dörwaldd2034312007-05-18 16:29:38 +0000560PyObject *PyUnicode_FromString(const char *u)
561{
562 size_t size = strlen(u);
563 if (size > PY_SSIZE_T_MAX) {
564 PyErr_SetString(PyExc_OverflowError, "input too long");
565 return NULL;
566 }
567
568 return PyUnicode_FromStringAndSize(u, size);
569}
570
Guido van Rossumd57fd912000-03-10 22:53:23 +0000571#ifdef HAVE_WCHAR_H
572
Mark Dickinson081dfee2009-03-18 14:47:41 +0000573#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
574# define CONVERT_WCHAR_TO_SURROGATES
575#endif
576
577#ifdef CONVERT_WCHAR_TO_SURROGATES
578
579/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
580 to convert from UTF32 to UTF16. */
581
582PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
583 Py_ssize_t size)
584{
585 PyUnicodeObject *unicode;
586 register Py_ssize_t i;
587 Py_ssize_t alloc;
588 const wchar_t *orig_w;
589
590 if (w == NULL) {
591 if (size == 0)
592 return PyUnicode_FromStringAndSize(NULL, 0);
593 PyErr_BadInternalCall();
594 return NULL;
595 }
596
597 if (size == -1) {
598 size = wcslen(w);
599 }
600
601 alloc = size;
602 orig_w = w;
603 for (i = size; i > 0; i--) {
604 if (*w > 0xFFFF)
605 alloc++;
606 w++;
607 }
608 w = orig_w;
609 unicode = _PyUnicode_New(alloc);
610 if (!unicode)
611 return NULL;
612
613 /* Copy the wchar_t data into the new object */
614 {
615 register Py_UNICODE *u;
616 u = PyUnicode_AS_UNICODE(unicode);
617 for (i = size; i > 0; i--) {
618 if (*w > 0xFFFF) {
619 wchar_t ordinal = *w++;
620 ordinal -= 0x10000;
621 *u++ = 0xD800 | (ordinal >> 10);
622 *u++ = 0xDC00 | (ordinal & 0x3FF);
623 }
624 else
625 *u++ = *w++;
626 }
627 }
628 return (PyObject *)unicode;
629}
630
631#else
632
Guido van Rossumd57fd912000-03-10 22:53:23 +0000633PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson29060642009-01-31 22:14:21 +0000634 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000635{
636 PyUnicodeObject *unicode;
637
638 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000639 if (size == 0)
640 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000641 PyErr_BadInternalCall();
642 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000643 }
644
Martin v. Löwis790465f2008-04-05 20:41:37 +0000645 if (size == -1) {
646 size = wcslen(w);
647 }
648
Guido van Rossumd57fd912000-03-10 22:53:23 +0000649 unicode = _PyUnicode_New(size);
650 if (!unicode)
651 return NULL;
652
653 /* Copy the wchar_t data into the new object */
654#ifdef HAVE_USABLE_WCHAR_T
655 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000656#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000657 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000658 register Py_UNICODE *u;
659 register Py_ssize_t i;
660 u = PyUnicode_AS_UNICODE(unicode);
661 for (i = size; i > 0; i--)
662 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000663 }
664#endif
665
666 return (PyObject *)unicode;
667}
668
Mark Dickinson081dfee2009-03-18 14:47:41 +0000669#endif /* CONVERT_WCHAR_TO_SURROGATES */
670
671#undef CONVERT_WCHAR_TO_SURROGATES
672
Walter Dörwald346737f2007-05-31 10:44:43 +0000673static void
674makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
675{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000676 *fmt++ = '%';
677 if (width) {
678 if (zeropad)
679 *fmt++ = '0';
680 fmt += sprintf(fmt, "%d", width);
681 }
682 if (precision)
683 fmt += sprintf(fmt, ".%d", precision);
684 if (longflag)
685 *fmt++ = 'l';
686 else if (size_tflag) {
687 char *f = PY_FORMAT_SIZE_T;
688 while (*f)
689 *fmt++ = *f++;
690 }
691 *fmt++ = c;
692 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000693}
694
/* Append the NUL-terminated char string `string` to the output cursor s,
   one character per element, advancing s.  Uses the enclosing scope's
   `copy` (const char *) and `s` variables.  The terminating NUL is not
   copied.  Wrapped in do/while(0) so the macro behaves as a single
   statement, e.g. in an unbraced if/else. */
#define appendstring(string) \
    do { for (copy = (string); *copy; ) *s++ = *copy++; } while (0)
696
697PyObject *
698PyUnicode_FromFormatV(const char *format, va_list vargs)
699{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000700 va_list count;
701 Py_ssize_t callcount = 0;
702 PyObject **callresults = NULL;
703 PyObject **callresult = NULL;
704 Py_ssize_t n = 0;
705 int width = 0;
706 int precision = 0;
707 int zeropad;
708 const char* f;
709 Py_UNICODE *s;
710 PyObject *string;
711 /* used by sprintf */
712 char buffer[21];
713 /* use abuffer instead of buffer, if we need more space
714 * (which can happen if there's a format specifier with width). */
715 char *abuffer = NULL;
716 char *realbuffer;
717 Py_ssize_t abuffersize = 0;
718 char fmt[60]; /* should be enough for %0width.precisionld */
719 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000720
721#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson14339b62009-01-31 16:36:08 +0000722 Py_MEMCPY(count, vargs, sizeof(va_list));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000723#else
724#ifdef __va_copy
Benjamin Peterson14339b62009-01-31 16:36:08 +0000725 __va_copy(count, vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000726#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000727 count = vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000728#endif
729#endif
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000730 /* step 1: count the number of %S/%R/%A/%s format specifications
731 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
732 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
733 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000734 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000735 if (*f == '%') {
736 if (*(f+1)=='%')
737 continue;
738 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')
739 ++callcount;
740 while (ISDIGIT((unsigned)*f))
741 width = (width*10) + *f++ - '0';
742 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
743 ;
744 if (*f == 's')
745 ++callcount;
746 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000747 }
748 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000749 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000750 if (callcount) {
751 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
752 if (!callresults) {
753 PyErr_NoMemory();
754 return NULL;
755 }
756 callresult = callresults;
757 }
758 /* step 3: figure out how large a buffer we need */
759 for (f = format; *f; f++) {
760 if (*f == '%') {
761 const char* p = f;
762 width = 0;
763 while (ISDIGIT((unsigned)*f))
764 width = (width*10) + *f++ - '0';
765 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
766 ;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000767
Benjamin Peterson14339b62009-01-31 16:36:08 +0000768 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
769 * they don't affect the amount of space we reserve.
770 */
771 if ((*f == 'l' || *f == 'z') &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000772 (f[1] == 'd' || f[1] == 'u'))
773 ++f;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000774
Benjamin Peterson14339b62009-01-31 16:36:08 +0000775 switch (*f) {
776 case 'c':
777 (void)va_arg(count, int);
778 /* fall through... */
779 case '%':
780 n++;
781 break;
782 case 'd': case 'u': case 'i': case 'x':
783 (void) va_arg(count, int);
784 /* 20 bytes is enough to hold a 64-bit
785 integer. Decimal takes the most space.
786 This isn't enough for octal.
787 If a width is specified we need more
788 (which we allocate later). */
789 if (width < 20)
790 width = 20;
791 n += width;
792 if (abuffersize < width)
793 abuffersize = width;
794 break;
795 case 's':
796 {
797 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000798 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000799 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
800 if (!str)
801 goto fail;
802 n += PyUnicode_GET_SIZE(str);
803 /* Remember the str and switch to the next slot */
804 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000805 break;
806 }
807 case 'U':
808 {
809 PyObject *obj = va_arg(count, PyObject *);
810 assert(obj && PyUnicode_Check(obj));
811 n += PyUnicode_GET_SIZE(obj);
812 break;
813 }
814 case 'V':
815 {
816 PyObject *obj = va_arg(count, PyObject *);
817 const char *str = va_arg(count, const char *);
818 assert(obj || str);
819 assert(!obj || PyUnicode_Check(obj));
820 if (obj)
821 n += PyUnicode_GET_SIZE(obj);
822 else
823 n += strlen(str);
824 break;
825 }
826 case 'S':
827 {
828 PyObject *obj = va_arg(count, PyObject *);
829 PyObject *str;
830 assert(obj);
831 str = PyObject_Str(obj);
832 if (!str)
833 goto fail;
834 n += PyUnicode_GET_SIZE(str);
835 /* Remember the str and switch to the next slot */
836 *callresult++ = str;
837 break;
838 }
839 case 'R':
840 {
841 PyObject *obj = va_arg(count, PyObject *);
842 PyObject *repr;
843 assert(obj);
844 repr = PyObject_Repr(obj);
845 if (!repr)
846 goto fail;
847 n += PyUnicode_GET_SIZE(repr);
848 /* Remember the repr and switch to the next slot */
849 *callresult++ = repr;
850 break;
851 }
852 case 'A':
853 {
854 PyObject *obj = va_arg(count, PyObject *);
855 PyObject *ascii;
856 assert(obj);
857 ascii = PyObject_ASCII(obj);
858 if (!ascii)
859 goto fail;
860 n += PyUnicode_GET_SIZE(ascii);
861 /* Remember the repr and switch to the next slot */
862 *callresult++ = ascii;
863 break;
864 }
865 case 'p':
866 (void) va_arg(count, int);
867 /* maximum 64-bit pointer representation:
868 * 0xffffffffffffffff
869 * so 19 characters is enough.
870 * XXX I count 18 -- what's the extra for?
871 */
872 n += 19;
873 break;
874 default:
875 /* if we stumble upon an unknown
876 formatting code, copy the rest of
877 the format string to the output
878 string. (we cannot just skip the
879 code, since there's no way to know
880 what's in the argument list) */
881 n += strlen(p);
882 goto expand;
883 }
884 } else
885 n++;
886 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000887 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +0000888 if (abuffersize > 20) {
889 abuffer = PyObject_Malloc(abuffersize);
890 if (!abuffer) {
891 PyErr_NoMemory();
892 goto fail;
893 }
894 realbuffer = abuffer;
895 }
896 else
897 realbuffer = buffer;
898 /* step 4: fill the buffer */
899 /* Since we've analyzed how much space we need for the worst case,
900 we don't have to resize the string.
901 There can be no errors beyond this point. */
902 string = PyUnicode_FromUnicode(NULL, n);
903 if (!string)
904 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000905
Benjamin Peterson14339b62009-01-31 16:36:08 +0000906 s = PyUnicode_AS_UNICODE(string);
907 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000908
Benjamin Peterson14339b62009-01-31 16:36:08 +0000909 for (f = format; *f; f++) {
910 if (*f == '%') {
911 const char* p = f++;
912 int longflag = 0;
913 int size_tflag = 0;
914 zeropad = (*f == '0');
915 /* parse the width.precision part */
916 width = 0;
917 while (ISDIGIT((unsigned)*f))
918 width = (width*10) + *f++ - '0';
919 precision = 0;
920 if (*f == '.') {
921 f++;
922 while (ISDIGIT((unsigned)*f))
923 precision = (precision*10) + *f++ - '0';
924 }
925 /* handle the long flag, but only for %ld and %lu.
926 others can be added when necessary. */
927 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
928 longflag = 1;
929 ++f;
930 }
931 /* handle the size_t flag. */
932 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
933 size_tflag = 1;
934 ++f;
935 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000936
Benjamin Peterson14339b62009-01-31 16:36:08 +0000937 switch (*f) {
938 case 'c':
939 *s++ = va_arg(vargs, int);
940 break;
941 case 'd':
942 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
943 if (longflag)
944 sprintf(realbuffer, fmt, va_arg(vargs, long));
945 else if (size_tflag)
946 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
947 else
948 sprintf(realbuffer, fmt, va_arg(vargs, int));
949 appendstring(realbuffer);
950 break;
951 case 'u':
952 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
953 if (longflag)
954 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
955 else if (size_tflag)
956 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
957 else
958 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
959 appendstring(realbuffer);
960 break;
961 case 'i':
962 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
963 sprintf(realbuffer, fmt, va_arg(vargs, int));
964 appendstring(realbuffer);
965 break;
966 case 'x':
967 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
968 sprintf(realbuffer, fmt, va_arg(vargs, int));
969 appendstring(realbuffer);
970 break;
971 case 's':
972 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000973 /* unused, since we already have the result */
974 (void) va_arg(vargs, char *);
975 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
976 PyUnicode_GET_SIZE(*callresult));
977 s += PyUnicode_GET_SIZE(*callresult);
978 /* We're done with the unicode()/repr() => forget it */
979 Py_DECREF(*callresult);
980 /* switch to next unicode()/repr() result */
981 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000982 break;
983 }
984 case 'U':
985 {
986 PyObject *obj = va_arg(vargs, PyObject *);
987 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
988 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
989 s += size;
990 break;
991 }
992 case 'V':
993 {
994 PyObject *obj = va_arg(vargs, PyObject *);
995 const char *str = va_arg(vargs, const char *);
996 if (obj) {
997 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
998 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
999 s += size;
1000 } else {
1001 appendstring(str);
1002 }
1003 break;
1004 }
1005 case 'S':
1006 case 'R':
1007 {
1008 Py_UNICODE *ucopy;
1009 Py_ssize_t usize;
1010 Py_ssize_t upos;
1011 /* unused, since we already have the result */
1012 (void) va_arg(vargs, PyObject *);
1013 ucopy = PyUnicode_AS_UNICODE(*callresult);
1014 usize = PyUnicode_GET_SIZE(*callresult);
1015 for (upos = 0; upos<usize;)
1016 *s++ = ucopy[upos++];
1017 /* We're done with the unicode()/repr() => forget it */
1018 Py_DECREF(*callresult);
1019 /* switch to next unicode()/repr() result */
1020 ++callresult;
1021 break;
1022 }
1023 case 'p':
1024 sprintf(buffer, "%p", va_arg(vargs, void*));
1025 /* %p is ill-defined: ensure leading 0x. */
1026 if (buffer[1] == 'X')
1027 buffer[1] = 'x';
1028 else if (buffer[1] != 'x') {
1029 memmove(buffer+2, buffer, strlen(buffer)+1);
1030 buffer[0] = '0';
1031 buffer[1] = 'x';
1032 }
1033 appendstring(buffer);
1034 break;
1035 case '%':
1036 *s++ = '%';
1037 break;
1038 default:
1039 appendstring(p);
1040 goto end;
1041 }
1042 } else
1043 *s++ = *f;
1044 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001045
Benjamin Peterson29060642009-01-31 22:14:21 +00001046 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001047 if (callresults)
1048 PyObject_Free(callresults);
1049 if (abuffer)
1050 PyObject_Free(abuffer);
1051 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1052 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001053 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001054 if (callresults) {
1055 PyObject **callresult2 = callresults;
1056 while (callresult2 < callresult) {
1057 Py_DECREF(*callresult2);
1058 ++callresult2;
1059 }
1060 PyObject_Free(callresults);
1061 }
1062 if (abuffer)
1063 PyObject_Free(abuffer);
1064 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001065}
1066
1067#undef appendstring
1068
1069PyObject *
1070PyUnicode_FromFormat(const char *format, ...)
1071{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001072 PyObject* ret;
1073 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001074
1075#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001076 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001077#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001078 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001079#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001080 ret = PyUnicode_FromFormatV(format, vargs);
1081 va_end(vargs);
1082 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001083}
1084
Martin v. Löwis18e16552006-02-15 17:27:45 +00001085Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001086 wchar_t *w,
1087 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001088{
1089 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001090 PyErr_BadInternalCall();
1091 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001092 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001093
1094 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001095 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00001096 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001097
Guido van Rossumd57fd912000-03-10 22:53:23 +00001098#ifdef HAVE_USABLE_WCHAR_T
1099 memcpy(w, unicode->str, size * sizeof(wchar_t));
1100#else
1101 {
Benjamin Peterson29060642009-01-31 22:14:21 +00001102 register Py_UNICODE *u;
1103 register Py_ssize_t i;
1104 u = PyUnicode_AS_UNICODE(unicode);
1105 for (i = size; i > 0; i--)
1106 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001107 }
1108#endif
1109
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001110 if (size > PyUnicode_GET_SIZE(unicode))
1111 return PyUnicode_GET_SIZE(unicode);
1112 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001113 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001114}
1115
1116#endif
1117
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001118PyObject *PyUnicode_FromOrdinal(int ordinal)
1119{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001120 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001121
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001122 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001123 PyErr_SetString(PyExc_ValueError,
1124 "chr() arg not in range(0x110000)");
1125 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001126 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001127
1128#ifndef Py_UNICODE_WIDE
1129 if (ordinal > 0xffff) {
1130 ordinal -= 0x10000;
1131 s[0] = 0xD800 | (ordinal >> 10);
1132 s[1] = 0xDC00 | (ordinal & 0x3FF);
1133 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001134 }
1135#endif
1136
Hye-Shik Chang40574832004-04-06 07:24:51 +00001137 s[0] = (Py_UNICODE)ordinal;
1138 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001139}
1140
Guido van Rossumd57fd912000-03-10 22:53:23 +00001141PyObject *PyUnicode_FromObject(register PyObject *obj)
1142{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001143 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001144 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001145 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001146 Py_INCREF(obj);
1147 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001148 }
1149 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001150 /* For a Unicode subtype that's not a Unicode object,
1151 return a true Unicode object with the same data. */
1152 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1153 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001154 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001155 PyErr_Format(PyExc_TypeError,
1156 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001157 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001158 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001159}
1160
1161PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00001162 const char *encoding,
1163 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001164{
Antoine Pitroua2983c62010-09-01 15:16:41 +00001165 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001166 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001167
Guido van Rossumd57fd912000-03-10 22:53:23 +00001168 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001169 PyErr_BadInternalCall();
1170 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001171 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001172
Antoine Pitroua2983c62010-09-01 15:16:41 +00001173 /* Decoding bytes objects is the most common case and should be fast */
1174 if (PyBytes_Check(obj)) {
1175 if (PyBytes_GET_SIZE(obj) == 0) {
1176 Py_INCREF(unicode_empty);
1177 v = (PyObject *) unicode_empty;
1178 }
1179 else {
1180 v = PyUnicode_Decode(
1181 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
1182 encoding, errors);
1183 }
1184 return v;
1185 }
1186
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001187 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001188 PyErr_SetString(PyExc_TypeError,
1189 "decoding str is not supported");
1190 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001191 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001192
Antoine Pitroua2983c62010-09-01 15:16:41 +00001193 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
1194 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
1195 PyErr_Format(PyExc_TypeError,
1196 "coercing to str: need bytes, bytearray "
1197 "or buffer-like object, %.80s found",
1198 Py_TYPE(obj)->tp_name);
1199 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001200 }
Tim Petersced69f82003-09-16 20:30:58 +00001201
Antoine Pitroua2983c62010-09-01 15:16:41 +00001202 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001203 Py_INCREF(unicode_empty);
Antoine Pitroua2983c62010-09-01 15:16:41 +00001204 v = (PyObject *) unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001205 }
Tim Petersced69f82003-09-16 20:30:58 +00001206 else
Antoine Pitroua2983c62010-09-01 15:16:41 +00001207 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001208
Antoine Pitroua2983c62010-09-01 15:16:41 +00001209 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001210 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001211}
1212
1213PyObject *PyUnicode_Decode(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001214 Py_ssize_t size,
1215 const char *encoding,
1216 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001217{
1218 PyObject *buffer = NULL, *unicode;
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001219 Py_buffer info;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001220 char lower[20]; /* Enough for any encoding name we recognize */
1221 char *l;
1222 const char *e;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001223
1224 if (encoding == NULL)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001225 encoding = PyUnicode_GetDefaultEncoding();
1226
1227 /* Convert encoding to lower case and replace '_' with '-' in order to
1228 catch e.g. UTF_8 */
1229 e = encoding;
1230 l = lower;
1231 while (*e && l < &lower[(sizeof lower) - 2]) {
1232 if (ISUPPER(*e)) {
1233 *l++ = TOLOWER(*e++);
1234 }
1235 else if (*e == '_') {
1236 *l++ = '-';
1237 e++;
1238 }
1239 else {
1240 *l++ = *e++;
1241 }
1242 }
1243 *l = '\0';
Fred Drakee4315f52000-05-09 19:53:39 +00001244
1245 /* Shortcuts for common default encodings */
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001246 if (strcmp(lower, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001247 return PyUnicode_DecodeUTF8(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001248 else if ((strcmp(lower, "latin-1") == 0) ||
1249 (strcmp(lower, "iso-8859-1") == 0))
Fred Drakee4315f52000-05-09 19:53:39 +00001250 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001251#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001252 else if (strcmp(lower, "mbcs") == 0)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001253 return PyUnicode_DecodeMBCS(s, size, errors);
1254#endif
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001255 else if (strcmp(lower, "ascii") == 0)
Fred Drakee4315f52000-05-09 19:53:39 +00001256 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001257 else if (strcmp(lower, "utf-16") == 0)
1258 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1259 else if (strcmp(lower, "utf-32") == 0)
1260 return PyUnicode_DecodeUTF32(s, size, errors, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001261
1262 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001263 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001264 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001265 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001266 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001267 if (buffer == NULL)
1268 goto onError;
1269 unicode = PyCodec_Decode(buffer, encoding, errors);
1270 if (unicode == NULL)
1271 goto onError;
1272 if (!PyUnicode_Check(unicode)) {
1273 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001274 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001275 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001276 Py_DECREF(unicode);
1277 goto onError;
1278 }
1279 Py_DECREF(buffer);
1280 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001281
Benjamin Peterson29060642009-01-31 22:14:21 +00001282 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001283 Py_XDECREF(buffer);
1284 return NULL;
1285}
1286
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001287PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1288 const char *encoding,
1289 const char *errors)
1290{
1291 PyObject *v;
1292
1293 if (!PyUnicode_Check(unicode)) {
1294 PyErr_BadArgument();
1295 goto onError;
1296 }
1297
1298 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001299 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001300
1301 /* Decode via the codec registry */
1302 v = PyCodec_Decode(unicode, encoding, errors);
1303 if (v == NULL)
1304 goto onError;
1305 return v;
1306
Benjamin Peterson29060642009-01-31 22:14:21 +00001307 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001308 return NULL;
1309}
1310
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001311PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1312 const char *encoding,
1313 const char *errors)
1314{
1315 PyObject *v;
1316
1317 if (!PyUnicode_Check(unicode)) {
1318 PyErr_BadArgument();
1319 goto onError;
1320 }
1321
1322 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001323 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001324
1325 /* Decode via the codec registry */
1326 v = PyCodec_Decode(unicode, encoding, errors);
1327 if (v == NULL)
1328 goto onError;
1329 if (!PyUnicode_Check(v)) {
1330 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001331 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001332 Py_TYPE(v)->tp_name);
1333 Py_DECREF(v);
1334 goto onError;
1335 }
1336 return v;
1337
Benjamin Peterson29060642009-01-31 22:14:21 +00001338 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001339 return NULL;
1340}
1341
Guido van Rossumd57fd912000-03-10 22:53:23 +00001342PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001343 Py_ssize_t size,
1344 const char *encoding,
1345 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001346{
1347 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001348
Guido van Rossumd57fd912000-03-10 22:53:23 +00001349 unicode = PyUnicode_FromUnicode(s, size);
1350 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001351 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001352 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1353 Py_DECREF(unicode);
1354 return v;
1355}
1356
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001357PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1358 const char *encoding,
1359 const char *errors)
1360{
1361 PyObject *v;
1362
1363 if (!PyUnicode_Check(unicode)) {
1364 PyErr_BadArgument();
1365 goto onError;
1366 }
1367
1368 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001369 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001370
1371 /* Encode via the codec registry */
1372 v = PyCodec_Encode(unicode, encoding, errors);
1373 if (v == NULL)
1374 goto onError;
1375 return v;
1376
Benjamin Peterson29060642009-01-31 22:14:21 +00001377 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001378 return NULL;
1379}
1380
Guido van Rossumd57fd912000-03-10 22:53:23 +00001381PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1382 const char *encoding,
1383 const char *errors)
1384{
1385 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001386
Guido van Rossumd57fd912000-03-10 22:53:23 +00001387 if (!PyUnicode_Check(unicode)) {
1388 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001389 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001390 }
Fred Drakee4315f52000-05-09 19:53:39 +00001391
Tim Petersced69f82003-09-16 20:30:58 +00001392 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001393 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001394
1395 /* Shortcuts for common default encodings */
1396 if (errors == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001397 if (strcmp(encoding, "utf-8") == 0)
1398 return PyUnicode_AsUTF8String(unicode);
1399 else if (strcmp(encoding, "latin-1") == 0)
1400 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001401#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Benjamin Peterson29060642009-01-31 22:14:21 +00001402 else if (strcmp(encoding, "mbcs") == 0)
1403 return PyUnicode_AsMBCSString(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001404#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00001405 else if (strcmp(encoding, "ascii") == 0)
1406 return PyUnicode_AsASCIIString(unicode);
Christian Heimes6a27efa2008-10-30 21:48:26 +00001407 /* During bootstrap, we may need to find the encodings
1408 package, to load the file system encoding, and require the
1409 file system encoding in order to load the encodings
1410 package.
1411
1412 Break out of this dependency by assuming that the path to
1413 the encodings module is ASCII-only. XXX could try wcstombs
1414 instead, if the file system encoding is the locale's
1415 encoding. */
1416 else if (Py_FileSystemDefaultEncoding &&
1417 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1418 !PyThreadState_GET()->interp->codecs_initialized)
Benjamin Peterson29060642009-01-31 22:14:21 +00001419 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001420 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001421
1422 /* Encode via the codec registry */
1423 v = PyCodec_Encode(unicode, encoding, errors);
1424 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001425 return NULL;
1426
1427 /* The normal path */
1428 if (PyBytes_Check(v))
1429 return v;
1430
1431 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001432 if (PyByteArray_Check(v)) {
1433 char msg[100];
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001434 PyObject *b;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001435 PyOS_snprintf(msg, sizeof(msg),
1436 "encoder %s returned buffer instead of bytes",
1437 encoding);
1438 if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001439 Py_DECREF(v);
1440 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001441 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001442
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001443 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1444 Py_DECREF(v);
1445 return b;
1446 }
1447
1448 PyErr_Format(PyExc_TypeError,
1449 "encoder did not return a bytes object (type=%.400s)",
1450 Py_TYPE(v)->tp_name);
1451 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001452 return NULL;
1453}
1454
1455PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1456 const char *encoding,
1457 const char *errors)
1458{
1459 PyObject *v;
1460
1461 if (!PyUnicode_Check(unicode)) {
1462 PyErr_BadArgument();
1463 goto onError;
1464 }
1465
1466 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001467 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001468
1469 /* Encode via the codec registry */
1470 v = PyCodec_Encode(unicode, encoding, errors);
1471 if (v == NULL)
1472 goto onError;
1473 if (!PyUnicode_Check(v)) {
1474 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001475 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001476 Py_TYPE(v)->tp_name);
1477 Py_DECREF(v);
1478 goto onError;
1479 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001480 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001481
Benjamin Peterson29060642009-01-31 22:14:21 +00001482 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001483 return NULL;
1484}
1485
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001486PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001487 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001488{
1489 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001490 if (v)
1491 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001492 if (errors != NULL)
1493 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001494 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001495 PyUnicode_GET_SIZE(unicode),
1496 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001497 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001498 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001499 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001500 return v;
1501}
1502
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001503PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001504PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001505 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001506 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1507}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001508
Christian Heimes5894ba72007-11-04 11:43:14 +00001509PyObject*
1510PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1511{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001512 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1513 can be undefined. If it is case, decode using UTF-8. The following assumes
1514 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1515 bootstrapping process where the codecs aren't ready yet.
1516 */
1517 if (Py_FileSystemDefaultEncoding) {
1518#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001519 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001520 return PyUnicode_DecodeMBCS(s, size, "replace");
1521 }
1522#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001523 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001524 return PyUnicode_DecodeUTF8(s, size, "replace");
1525 }
1526#endif
1527 return PyUnicode_Decode(s, size,
1528 Py_FileSystemDefaultEncoding,
1529 "replace");
1530 }
1531 else {
1532 return PyUnicode_DecodeUTF8(s, size, "replace");
1533 }
1534}
1535
Martin v. Löwis011e8422009-05-05 04:43:17 +00001536/* Convert the argument to a bytes object, according to the file
1537 system encoding */
1538
1539int
1540PyUnicode_FSConverter(PyObject* arg, void* addr)
1541{
1542 PyObject *output = NULL;
1543 Py_ssize_t size;
1544 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001545 if (arg == NULL) {
1546 Py_DECREF(*(PyObject**)addr);
1547 return 1;
1548 }
Martin v. Löwis011e8422009-05-05 04:43:17 +00001549 if (PyBytes_Check(arg) || PyByteArray_Check(arg)) {
1550 output = arg;
1551 Py_INCREF(output);
1552 }
1553 else {
1554 arg = PyUnicode_FromObject(arg);
1555 if (!arg)
1556 return 0;
Ezio Melottiba42fd52011-04-26 06:09:45 +03001557 output = PyUnicode_AsEncodedObject(arg,
Martin v. Löwis011e8422009-05-05 04:43:17 +00001558 Py_FileSystemDefaultEncoding,
Martin v. Löwis43c57782009-05-10 08:15:24 +00001559 "surrogateescape");
Martin v. Löwis011e8422009-05-05 04:43:17 +00001560 Py_DECREF(arg);
1561 if (!output)
1562 return 0;
1563 if (!PyBytes_Check(output)) {
1564 Py_DECREF(output);
1565 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1566 return 0;
1567 }
1568 }
1569 if (PyBytes_Check(output)) {
1570 size = PyBytes_GET_SIZE(output);
1571 data = PyBytes_AS_STRING(output);
Ezio Melottiba42fd52011-04-26 06:09:45 +03001572 }
Martin v. Löwis011e8422009-05-05 04:43:17 +00001573 else {
1574 size = PyByteArray_GET_SIZE(output);
1575 data = PyByteArray_AS_STRING(output);
1576 }
1577 if (size != strlen(data)) {
1578 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1579 Py_DECREF(output);
1580 return 0;
1581 }
1582 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001583 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001584}
1585
1586
Martin v. Löwis5b222132007-06-10 09:51:05 +00001587char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001588_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001589{
Christian Heimesf3863112007-11-22 07:46:41 +00001590 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001591 if (!PyUnicode_Check(unicode)) {
1592 PyErr_BadArgument();
1593 return NULL;
1594 }
Christian Heimesf3863112007-11-22 07:46:41 +00001595 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1596 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001597 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001598 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001599 *psize = PyBytes_GET_SIZE(bytes);
1600 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001601}
1602
1603char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001604_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001605{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001606 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001607}
1608
Guido van Rossumd57fd912000-03-10 22:53:23 +00001609Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1610{
1611 if (!PyUnicode_Check(unicode)) {
1612 PyErr_BadArgument();
1613 goto onError;
1614 }
1615 return PyUnicode_AS_UNICODE(unicode);
1616
Benjamin Peterson29060642009-01-31 22:14:21 +00001617 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001618 return NULL;
1619}
1620
Martin v. Löwis18e16552006-02-15 17:27:45 +00001621Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001622{
1623 if (!PyUnicode_Check(unicode)) {
1624 PyErr_BadArgument();
1625 goto onError;
1626 }
1627 return PyUnicode_GET_SIZE(unicode);
1628
Benjamin Peterson29060642009-01-31 22:14:21 +00001629 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001630 return -1;
1631}
1632
Thomas Wouters78890102000-07-22 19:25:51 +00001633const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001634{
1635 return unicode_default_encoding;
1636}
1637
1638int PyUnicode_SetDefaultEncoding(const char *encoding)
1639{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001640 if (strcmp(encoding, unicode_default_encoding) != 0) {
1641 PyErr_Format(PyExc_ValueError,
1642 "Can only set default encoding to %s",
1643 unicode_default_encoding);
1644 return -1;
1645 }
Fred Drakee4315f52000-05-09 19:53:39 +00001646 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001647}
1648
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001649/* error handling callback helper:
1650 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001651 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001652 and adjust various state variables.
1653 return 0 on success, -1 on error
1654*/
1655
1656static
1657int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00001658 const char *encoding, const char *reason,
1659 const char **input, const char **inend, Py_ssize_t *startinpos,
1660 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1661 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001662{
Benjamin Peterson142957c2008-07-04 19:55:29 +00001663 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001664
1665 PyObject *restuple = NULL;
1666 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001667 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001668 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001669 Py_ssize_t requiredsize;
1670 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001671 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001672 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001673 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001674 int res = -1;
1675
1676 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001677 *errorHandler = PyCodec_LookupError(errors);
1678 if (*errorHandler == NULL)
1679 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001680 }
1681
1682 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001683 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00001684 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
1685 if (*exceptionObject == NULL)
1686 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001687 }
1688 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00001689 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1690 goto onError;
1691 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1692 goto onError;
1693 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1694 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001695 }
1696
1697 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1698 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001699 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001700 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00001701 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00001702 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001703 }
1704 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00001705 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001706
1707 /* Copy back the bytes variables, which might have been modified by the
1708 callback */
1709 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1710 if (!inputobj)
1711 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00001712 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001713 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00001714 }
Christian Heimes72b710a2008-05-26 13:28:38 +00001715 *input = PyBytes_AS_STRING(inputobj);
1716 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001717 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001718 /* we can DECREF safely, as the exception has another reference,
1719 so the object won't go away. */
1720 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001721
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001722 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001723 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001724 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001725 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1726 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001727 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001728
1729 /* need more space? (at least enough for what we
1730 have+the replacement+the rest of the string (starting
1731 at the new input position), so we won't have to check space
1732 when there are no errors in the rest of the string) */
1733 repptr = PyUnicode_AS_UNICODE(repunicode);
1734 repsize = PyUnicode_GET_SIZE(repunicode);
1735 requiredsize = *outpos + repsize + insize-newpos;
1736 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001737 if (requiredsize<2*outsize)
1738 requiredsize = 2*outsize;
1739 if (_PyUnicode_Resize(output, requiredsize) < 0)
1740 goto onError;
1741 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001742 }
1743 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001744 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001745 Py_UNICODE_COPY(*outptr, repptr, repsize);
1746 *outptr += repsize;
1747 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001748
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001749 /* we made it! */
1750 res = 0;
1751
Benjamin Peterson29060642009-01-31 22:14:21 +00001752 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001753 Py_XDECREF(restuple);
1754 return res;
1755}
1756
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001757/* --- UTF-7 Codec -------------------------------------------------------- */
1758
Antoine Pitrou244651a2009-05-04 18:56:13 +00001759/* See RFC2152 for details. We encode conservatively and decode liberally. */
1760
/* Three simple macros defining base-64.  All of them evaluate their
 * argument more than once, so only pass side-effect free expressions. */

/* IS_BASE64: non-zero if c is one of the 64 base-64 characters
 * (A-Z, a-z, 0-9, '+', '/'). */

#define IS_BASE64(c)                        \
    (((c) >= 'A' && (c) <= 'Z') ||          \
     ((c) >= 'a' && (c) <= 'z') ||          \
     ((c) >= '0' && (c) <= '9') ||          \
     (c) == '+' ||                          \
     (c) == '/')

/* FROM_BASE64: the 6-bit value of c; only meaningful when IS_BASE64(c)
 * holds (anything unrecognised maps to 63, the value of '/'). */

#define FROM_BASE64(c)                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A'             \
     : ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26      \
     : ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52      \
     : (c) == '+' ? 62                                  \
     : 63)

/* TO_BASE64: the base-64 character for the bottom 6 bits of n. */

#define TO_BASE64(n)                        \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"           \
     "abcdefghijklmnopqrstuvwxyz"           \
     "0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  Decoding is permissive: every ASCII byte decodes
 * to itself except '+', which begins a base64 string. */

#define DECODE_DIRECT(c) \
    ((c) < 128 && (c) != '+')
1791
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is never modified,
 * hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c         character to test; NUL and anything >= 128 is never direct
 * directO   non-zero if Set O characters are written as themselves
 * directWS  non-zero if whitespace characters are written as themselves
 *
 * c is evaluated several times; directO/directWS are parenthesized so
 * that arbitrary expressions can be passed safely. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001837
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001838PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001839 Py_ssize_t size,
1840 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001841{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001842 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1843}
1844
Antoine Pitrou244651a2009-05-04 18:56:13 +00001845/* The decoder. The only state we preserve is our read position,
1846 * i.e. how many characters we have consumed. So if we end in the
1847 * middle of a shift sequence we have to back off the read position
1848 * and the output to the beginning of the sequence, otherwise we lose
1849 * all the shift state (seen bits, number of bits seen, high
1850 * surrogate). */
1851
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001852PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001853 Py_ssize_t size,
1854 const char *errors,
1855 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001856{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001857 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001858 Py_ssize_t startinpos;
1859 Py_ssize_t endinpos;
1860 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001861 const char *e;
1862 PyUnicodeObject *unicode;
1863 Py_UNICODE *p;
1864 const char *errmsg = "";
1865 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001866 Py_UNICODE *shiftOutStart;
1867 unsigned int base64bits = 0;
1868 unsigned long base64buffer = 0;
1869 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001870 PyObject *errorHandler = NULL;
1871 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001872
1873 unicode = _PyUnicode_New(size);
1874 if (!unicode)
1875 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001876 if (size == 0) {
1877 if (consumed)
1878 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001879 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001880 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001881
1882 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001883 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001884 e = s + size;
1885
1886 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001887 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00001888 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00001889 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001890
Antoine Pitrou244651a2009-05-04 18:56:13 +00001891 if (inShift) { /* in a base-64 section */
1892 if (IS_BASE64(ch)) { /* consume a base-64 character */
1893 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1894 base64bits += 6;
1895 s++;
1896 if (base64bits >= 16) {
1897 /* we have enough bits for a UTF-16 value */
1898 Py_UNICODE outCh = (Py_UNICODE)
1899 (base64buffer >> (base64bits-16));
1900 base64bits -= 16;
1901 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
1902 if (surrogate) {
1903 /* expecting a second surrogate */
1904 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1905#ifdef Py_UNICODE_WIDE
1906 *p++ = (((surrogate & 0x3FF)<<10)
1907 | (outCh & 0x3FF)) + 0x10000;
1908#else
1909 *p++ = surrogate;
1910 *p++ = outCh;
1911#endif
1912 surrogate = 0;
1913 }
1914 else {
1915 surrogate = 0;
1916 errmsg = "second surrogate missing";
1917 goto utf7Error;
1918 }
1919 }
1920 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
1921 /* first surrogate */
1922 surrogate = outCh;
1923 }
1924 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1925 errmsg = "unexpected second surrogate";
1926 goto utf7Error;
1927 }
1928 else {
1929 *p++ = outCh;
1930 }
1931 }
1932 }
1933 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001934 inShift = 0;
1935 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001936 if (surrogate) {
1937 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001938 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001939 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00001940 if (base64bits > 0) { /* left-over bits */
1941 if (base64bits >= 6) {
1942 /* We've seen at least one base-64 character */
1943 errmsg = "partial character in shift sequence";
1944 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001945 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00001946 else {
1947 /* Some bits remain; they should be zero */
1948 if (base64buffer != 0) {
1949 errmsg = "non-zero padding bits in shift sequence";
1950 goto utf7Error;
1951 }
1952 }
1953 }
1954 if (ch != '-') {
1955 /* '-' is absorbed; other terminating
1956 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001957 *p++ = ch;
1958 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001959 }
1960 }
1961 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001962 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001963 s++; /* consume '+' */
1964 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001965 s++;
1966 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00001967 }
1968 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001969 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001970 shiftOutStart = p;
1971 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001972 }
1973 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00001974 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001975 *p++ = ch;
1976 s++;
1977 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00001978 else {
1979 startinpos = s-starts;
1980 s++;
1981 errmsg = "unexpected special character";
1982 goto utf7Error;
1983 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001984 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001985utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001986 outpos = p-PyUnicode_AS_UNICODE(unicode);
1987 endinpos = s-starts;
1988 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00001989 errors, &errorHandler,
1990 "utf7", errmsg,
1991 &starts, &e, &startinpos, &endinpos, &exc, &s,
1992 &unicode, &outpos, &p))
1993 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001994 }
1995
Antoine Pitrou244651a2009-05-04 18:56:13 +00001996 /* end of string */
1997
1998 if (inShift && !consumed) { /* in shift sequence, no more to follow */
1999 /* if we're in an inconsistent state, that's an error */
2000 if (surrogate ||
2001 (base64bits >= 6) ||
2002 (base64bits > 0 && base64buffer != 0)) {
2003 outpos = p-PyUnicode_AS_UNICODE(unicode);
2004 endinpos = size;
2005 if (unicode_decode_call_errorhandler(
2006 errors, &errorHandler,
2007 "utf7", "unterminated shift sequence",
2008 &starts, &e, &startinpos, &endinpos, &exc, &s,
2009 &unicode, &outpos, &p))
2010 goto onError;
2011 if (s < e)
2012 goto restart;
2013 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002014 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002015
2016 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002017 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002018 if (inShift) {
2019 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002020 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002021 }
2022 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002023 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002024 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002025 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002026
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002027 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002028 goto onError;
2029
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002030 Py_XDECREF(errorHandler);
2031 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002032 return (PyObject *)unicode;
2033
Benjamin Peterson29060642009-01-31 22:14:21 +00002034 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002035 Py_XDECREF(errorHandler);
2036 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002037 Py_DECREF(unicode);
2038 return NULL;
2039}
2040
2041
2042PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002043 Py_ssize_t size,
Antoine Pitrou244651a2009-05-04 18:56:13 +00002044 int base64SetO,
2045 int base64WhiteSpace,
Benjamin Peterson29060642009-01-31 22:14:21 +00002046 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002047{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002048 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002049 /* It might be possible to tighten this worst case */
Georg Brandl194da4a2009-08-13 09:34:05 +00002050 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002051 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002052 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002053 unsigned int base64bits = 0;
2054 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002055 char * out;
2056 char * start;
2057
2058 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002059 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002060
Georg Brandl194da4a2009-08-13 09:34:05 +00002061 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002062 return PyErr_NoMemory();
2063
Antoine Pitrou244651a2009-05-04 18:56:13 +00002064 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002065 if (v == NULL)
2066 return NULL;
2067
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002068 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002069 for (;i < size; ++i) {
2070 Py_UNICODE ch = s[i];
2071
Antoine Pitrou244651a2009-05-04 18:56:13 +00002072 if (inShift) {
2073 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2074 /* shifting out */
2075 if (base64bits) { /* output remaining bits */
2076 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2077 base64buffer = 0;
2078 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002079 }
2080 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002081 /* Characters not in the BASE64 set implicitly unshift the sequence
2082 so no '-' is required, except if the character is itself a '-' */
2083 if (IS_BASE64(ch) || ch == '-') {
2084 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002085 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002086 *out++ = (char) ch;
2087 }
2088 else {
2089 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002090 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002091 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002092 else { /* not in a shift sequence */
2093 if (ch == '+') {
2094 *out++ = '+';
2095 *out++ = '-';
2096 }
2097 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2098 *out++ = (char) ch;
2099 }
2100 else {
2101 *out++ = '+';
2102 inShift = 1;
2103 goto encode_char;
2104 }
2105 }
2106 continue;
2107encode_char:
2108#ifdef Py_UNICODE_WIDE
2109 if (ch >= 0x10000) {
2110 /* code first surrogate */
2111 base64bits += 16;
2112 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2113 while (base64bits >= 6) {
2114 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2115 base64bits -= 6;
2116 }
2117 /* prepare second surrogate */
2118 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2119 }
2120#endif
2121 base64bits += 16;
2122 base64buffer = (base64buffer << 16) | ch;
2123 while (base64bits >= 6) {
2124 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2125 base64bits -= 6;
2126 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002127 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002128 if (base64bits)
2129 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2130 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002131 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002132 if (_PyBytes_Resize(&v, out - start) < 0)
2133 return NULL;
2134 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002135}
2136
/* The UTF-7 helper macros are private to the UTF-7 codec above. */
#undef ENCODE_DIRECT
#undef DECODE_DIRECT
#undef TO_BASE64
#undef FROM_BASE64
#undef IS_BASE64
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002142
Guido van Rossumd57fd912000-03-10 22:53:23 +00002143/* --- UTF-8 Codec -------------------------------------------------------- */
2144
/* Map UTF-8 encoded prefix byte to sequence length.  Zero means
   illegal prefix.  See RFC 3629 for details.  The table is indexed by the
   value of the first byte of a sequence and is never modified, hence
   const. */
static
const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
2166
Guido van Rossumd57fd912000-03-10 22:53:23 +00002167PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002168 Py_ssize_t size,
2169 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002170{
Walter Dörwald69652032004-09-07 20:24:22 +00002171 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2172}
2173
Antoine Pitrouab868312009-01-10 15:40:25 +00002174/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2175#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2176
2177/* Mask to quickly check whether a C 'long' contains a
2178 non-ASCII, UTF8-encoded char. */
2179#if (SIZEOF_LONG == 8)
2180# define ASCII_CHAR_MASK 0x8080808080808080L
2181#elif (SIZEOF_LONG == 4)
2182# define ASCII_CHAR_MASK 0x80808080L
2183#else
2184# error C 'long' size should be either 4 or 8!
2185#endif
2186
Walter Dörwald69652032004-09-07 20:24:22 +00002187PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002188 Py_ssize_t size,
2189 const char *errors,
2190 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002191{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002192 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002193 int n;
Ezio Melotti25bc0192010-07-03 05:18:50 +00002194 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002195 Py_ssize_t startinpos;
2196 Py_ssize_t endinpos;
2197 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002198 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002199 PyUnicodeObject *unicode;
2200 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002201 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002202 PyObject *errorHandler = NULL;
2203 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002204
2205 /* Note: size will always be longer than the resulting Unicode
2206 character count */
2207 unicode = _PyUnicode_New(size);
2208 if (!unicode)
2209 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002210 if (size == 0) {
2211 if (consumed)
2212 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002213 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002214 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002215
2216 /* Unpack UTF-8 encoded data */
2217 p = unicode->str;
2218 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002219 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002220
2221 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002222 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002223
2224 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002225 /* Fast path for runs of ASCII characters. Given that common UTF-8
2226 input will consist of an overwhelming majority of ASCII
2227 characters, we try to optimize for this case by checking
2228 as many characters as a C 'long' can contain.
2229 First, check if we can do an aligned read, as most CPUs have
2230 a penalty for unaligned reads.
2231 */
2232 if (!((size_t) s & LONG_PTR_MASK)) {
2233 /* Help register allocation */
2234 register const char *_s = s;
2235 register Py_UNICODE *_p = p;
2236 while (_s < aligned_end) {
2237 /* Read a whole long at a time (either 4 or 8 bytes),
2238 and do a fast unrolled copy if it only contains ASCII
2239 characters. */
2240 unsigned long data = *(unsigned long *) _s;
2241 if (data & ASCII_CHAR_MASK)
2242 break;
2243 _p[0] = (unsigned char) _s[0];
2244 _p[1] = (unsigned char) _s[1];
2245 _p[2] = (unsigned char) _s[2];
2246 _p[3] = (unsigned char) _s[3];
2247#if (SIZEOF_LONG == 8)
2248 _p[4] = (unsigned char) _s[4];
2249 _p[5] = (unsigned char) _s[5];
2250 _p[6] = (unsigned char) _s[6];
2251 _p[7] = (unsigned char) _s[7];
2252#endif
2253 _s += SIZEOF_LONG;
2254 _p += SIZEOF_LONG;
2255 }
2256 s = _s;
2257 p = _p;
2258 if (s == e)
2259 break;
2260 ch = (unsigned char)*s;
2261 }
2262 }
2263
2264 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002265 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002266 s++;
2267 continue;
2268 }
2269
2270 n = utf8_code_length[ch];
2271
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002272 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002273 if (consumed)
2274 break;
2275 else {
2276 errmsg = "unexpected end of data";
2277 startinpos = s-starts;
Ezio Melotti25bc0192010-07-03 05:18:50 +00002278 endinpos = startinpos+1;
2279 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2280 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002281 goto utf8Error;
2282 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002283 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002284
2285 switch (n) {
2286
2287 case 0:
Ezio Melotti25bc0192010-07-03 05:18:50 +00002288 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002289 startinpos = s-starts;
2290 endinpos = startinpos+1;
2291 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002292
2293 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002294 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002295 startinpos = s-starts;
2296 endinpos = startinpos+1;
2297 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002298
2299 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002300 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti25bc0192010-07-03 05:18:50 +00002301 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002302 startinpos = s-starts;
Ezio Melotti25bc0192010-07-03 05:18:50 +00002303 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00002304 goto utf8Error;
2305 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002306 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti25bc0192010-07-03 05:18:50 +00002307 assert ((ch > 0x007F) && (ch <= 0x07FF));
2308 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002309 break;
2310
2311 case 3:
Ezio Melotti25bc0192010-07-03 05:18:50 +00002312 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2313 will result in surrogates in range d800-dfff. Surrogates are
2314 not valid UTF-8 so they are rejected.
2315 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2316 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00002317 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti25bc0192010-07-03 05:18:50 +00002318 (s[2] & 0xc0) != 0x80 ||
2319 ((unsigned char)s[0] == 0xE0 &&
2320 (unsigned char)s[1] < 0xA0) ||
2321 ((unsigned char)s[0] == 0xED &&
2322 (unsigned char)s[1] > 0x9F)) {
2323 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002324 startinpos = s-starts;
Ezio Melotti25bc0192010-07-03 05:18:50 +00002325 endinpos = startinpos + 1;
2326
2327 /* if s[1] first two bits are 1 and 0, then the invalid
2328 continuation byte is s[2], so increment endinpos by 1,
2329 if not, s[1] is invalid and endinpos doesn't need to
2330 be incremented. */
2331 if ((s[1] & 0xC0) == 0x80)
2332 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002333 goto utf8Error;
2334 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002335 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti25bc0192010-07-03 05:18:50 +00002336 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2337 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002338 break;
2339
2340 case 4:
2341 if ((s[1] & 0xc0) != 0x80 ||
2342 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti25bc0192010-07-03 05:18:50 +00002343 (s[3] & 0xc0) != 0x80 ||
2344 ((unsigned char)s[0] == 0xF0 &&
2345 (unsigned char)s[1] < 0x90) ||
2346 ((unsigned char)s[0] == 0xF4 &&
2347 (unsigned char)s[1] > 0x8F)) {
2348 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002349 startinpos = s-starts;
Ezio Melotti25bc0192010-07-03 05:18:50 +00002350 endinpos = startinpos + 1;
2351 if ((s[1] & 0xC0) == 0x80) {
2352 endinpos++;
2353 if ((s[2] & 0xC0) == 0x80)
2354 endinpos++;
2355 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002356 goto utf8Error;
2357 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002358 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti25bc0192010-07-03 05:18:50 +00002359 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2360 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2361
Fredrik Lundh8f455852001-06-27 18:59:43 +00002362#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002363 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002364#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002365 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002366
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002367 /* translate from 10000..10FFFF to 0..FFFF */
2368 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002369
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002370 /* high surrogate = top 10 bits added to D800 */
2371 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002372
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002373 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002374 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002375#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002376 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002377 }
2378 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002379 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002380
Benjamin Peterson29060642009-01-31 22:14:21 +00002381 utf8Error:
2382 outpos = p-PyUnicode_AS_UNICODE(unicode);
2383 if (unicode_decode_call_errorhandler(
2384 errors, &errorHandler,
2385 "utf8", errmsg,
2386 &starts, &e, &startinpos, &endinpos, &exc, &s,
2387 &unicode, &outpos, &p))
2388 goto onError;
2389 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002390 }
Walter Dörwald69652032004-09-07 20:24:22 +00002391 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002392 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002393
2394 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002395 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002396 goto onError;
2397
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002398 Py_XDECREF(errorHandler);
2399 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002400 return (PyObject *)unicode;
2401
Benjamin Peterson29060642009-01-31 22:14:21 +00002402 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002403 Py_XDECREF(errorHandler);
2404 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002405 Py_DECREF(unicode);
2406 return NULL;
2407}
2408
Antoine Pitrouab868312009-01-10 15:40:25 +00002409#undef ASCII_CHAR_MASK
2410
2411
Tim Peters602f7402002-04-27 18:03:26 +00002412/* Allocation strategy: if the string is short, convert into a stack buffer
2413 and allocate exactly as much space needed at the end. Else allocate the
2414 maximum possible needed (4 result bytes per Unicode character), and return
2415 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002416*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002417PyObject *
2418PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002419 Py_ssize_t size,
2420 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002421{
Tim Peters602f7402002-04-27 18:03:26 +00002422#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002423
Guido van Rossum98297ee2007-11-06 21:34:58 +00002424 Py_ssize_t i; /* index into s of next input byte */
2425 PyObject *result; /* result string object */
2426 char *p; /* next free byte in output buffer */
2427 Py_ssize_t nallocated; /* number of result bytes allocated */
2428 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002429 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002430 PyObject *errorHandler = NULL;
2431 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002432
Tim Peters602f7402002-04-27 18:03:26 +00002433 assert(s != NULL);
2434 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002435
Tim Peters602f7402002-04-27 18:03:26 +00002436 if (size <= MAX_SHORT_UNICHARS) {
2437 /* Write into the stack buffer; nallocated can't overflow.
2438 * At the end, we'll allocate exactly as much heap space as it
2439 * turns out we need.
2440 */
2441 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002442 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002443 p = stackbuf;
2444 }
2445 else {
2446 /* Overallocate on the heap, and give the excess back at the end. */
2447 nallocated = size * 4;
2448 if (nallocated / 4 != size) /* overflow! */
2449 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002450 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002451 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002452 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002453 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002454 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002455
Tim Peters602f7402002-04-27 18:03:26 +00002456 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002457 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002458
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002459 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002460 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002461 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002462
Guido van Rossumd57fd912000-03-10 22:53:23 +00002463 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002464 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002465 *p++ = (char)(0xc0 | (ch >> 6));
2466 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner158701d2010-04-22 19:41:01 +00002467 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002468#ifndef Py_UNICODE_WIDE
Victor Stinner158701d2010-04-22 19:41:01 +00002469 /* Special case: check for high and low surrogate */
2470 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2471 Py_UCS4 ch2 = s[i];
2472 /* Combine the two surrogates to form a UCS4 value */
2473 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2474 i++;
2475
2476 /* Encode UCS4 Unicode ordinals */
2477 *p++ = (char)(0xf0 | (ch >> 18));
2478 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Tim Peters602f7402002-04-27 18:03:26 +00002479 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2480 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner158701d2010-04-22 19:41:01 +00002481 } else {
Victor Stinner0b79b762010-04-22 20:07:28 +00002482#endif
Victor Stinner158701d2010-04-22 19:41:01 +00002483 Py_ssize_t newpos;
2484 PyObject *rep;
2485 Py_ssize_t repsize, k;
2486 rep = unicode_encode_call_errorhandler
2487 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2488 s, size, &exc, i-1, i, &newpos);
2489 if (!rep)
2490 goto error;
2491
2492 if (PyBytes_Check(rep))
2493 repsize = PyBytes_GET_SIZE(rep);
2494 else
2495 repsize = PyUnicode_GET_SIZE(rep);
2496
2497 if (repsize > 4) {
2498 Py_ssize_t offset;
2499
2500 if (result == NULL)
2501 offset = p - stackbuf;
2502 else
2503 offset = p - PyBytes_AS_STRING(result);
2504
2505 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
2506 /* integer overflow */
2507 PyErr_NoMemory();
2508 goto error;
2509 }
2510 nallocated += repsize - 4;
2511 if (result != NULL) {
2512 if (_PyBytes_Resize(&result, nallocated) < 0)
2513 goto error;
2514 } else {
2515 result = PyBytes_FromStringAndSize(NULL, nallocated);
2516 if (result == NULL)
2517 goto error;
2518 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
2519 }
2520 p = PyBytes_AS_STRING(result) + offset;
2521 }
2522
2523 if (PyBytes_Check(rep)) {
2524 char *prep = PyBytes_AS_STRING(rep);
2525 for(k = repsize; k > 0; k--)
2526 *p++ = *prep++;
2527 } else /* rep is unicode */ {
2528 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
2529 Py_UNICODE c;
2530
2531 for(k=0; k<repsize; k++) {
2532 c = prep[k];
2533 if (0x80 <= c) {
2534 raise_encode_exception(&exc, "utf-8", s, size,
2535 i-1, i, "surrogates not allowed");
2536 goto error;
2537 }
2538 *p++ = (char)prep[k];
2539 }
2540 }
2541 Py_DECREF(rep);
Victor Stinner0b79b762010-04-22 20:07:28 +00002542#ifndef Py_UNICODE_WIDE
Victor Stinner158701d2010-04-22 19:41:01 +00002543 }
Victor Stinner0b79b762010-04-22 20:07:28 +00002544#endif
Victor Stinner158701d2010-04-22 19:41:01 +00002545 } else if (ch < 0x10000) {
2546 *p++ = (char)(0xe0 | (ch >> 12));
2547 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2548 *p++ = (char)(0x80 | (ch & 0x3f));
2549 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00002550 /* Encode UCS4 Unicode ordinals */
2551 *p++ = (char)(0xf0 | (ch >> 18));
2552 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2553 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2554 *p++ = (char)(0x80 | (ch & 0x3f));
2555 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002556 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002557
Guido van Rossum98297ee2007-11-06 21:34:58 +00002558 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002559 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002560 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002561 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002562 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002563 }
2564 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002565 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00002566 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002567 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002568 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002569 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002570 Py_XDECREF(errorHandler);
2571 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002572 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002573 error:
2574 Py_XDECREF(errorHandler);
2575 Py_XDECREF(exc);
2576 Py_XDECREF(result);
2577 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002578
Tim Peters602f7402002-04-27 18:03:26 +00002579#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002580}
2581
Guido van Rossumd57fd912000-03-10 22:53:23 +00002582PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2583{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002584 if (!PyUnicode_Check(unicode)) {
2585 PyErr_BadArgument();
2586 return NULL;
2587 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002588 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002589 PyUnicode_GET_SIZE(unicode),
2590 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002591}
2592
Walter Dörwald41980ca2007-08-16 21:55:45 +00002593/* --- UTF-32 Codec ------------------------------------------------------- */
2594
2595PyObject *
2596PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002597 Py_ssize_t size,
2598 const char *errors,
2599 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002600{
2601 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2602}
2603
2604PyObject *
2605PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002606 Py_ssize_t size,
2607 const char *errors,
2608 int *byteorder,
2609 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002610{
2611 const char *starts = s;
2612 Py_ssize_t startinpos;
2613 Py_ssize_t endinpos;
2614 Py_ssize_t outpos;
2615 PyUnicodeObject *unicode;
2616 Py_UNICODE *p;
2617#ifndef Py_UNICODE_WIDE
Antoine Pitrou6107a682010-06-11 21:48:34 +00002618 int pairs = 0;
Georg Brandlded5acf2010-10-17 11:48:07 +00002619 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002620#else
2621 const int pairs = 0;
2622#endif
Georg Brandlded5acf2010-10-17 11:48:07 +00002623 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002624 int bo = 0; /* assume native ordering by default */
2625 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002626 /* Offsets from q for retrieving bytes in the right order. */
2627#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2628 int iorder[] = {0, 1, 2, 3};
2629#else
2630 int iorder[] = {3, 2, 1, 0};
2631#endif
2632 PyObject *errorHandler = NULL;
2633 PyObject *exc = NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +03002634
Walter Dörwald41980ca2007-08-16 21:55:45 +00002635 q = (unsigned char *)s;
2636 e = q + size;
2637
2638 if (byteorder)
2639 bo = *byteorder;
2640
2641 /* Check for BOM marks (U+FEFF) in the input and adjust current
2642 byte order setting accordingly. In native mode, the leading BOM
2643 mark is skipped, in all other modes, it is copied to the output
2644 stream as-is (giving a ZWNBSP character). */
2645 if (bo == 0) {
2646 if (size >= 4) {
2647 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00002648 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002649#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002650 if (bom == 0x0000FEFF) {
2651 q += 4;
2652 bo = -1;
2653 }
2654 else if (bom == 0xFFFE0000) {
2655 q += 4;
2656 bo = 1;
2657 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002658#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002659 if (bom == 0x0000FEFF) {
2660 q += 4;
2661 bo = 1;
2662 }
2663 else if (bom == 0xFFFE0000) {
2664 q += 4;
2665 bo = -1;
2666 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002667#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002668 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002669 }
2670
2671 if (bo == -1) {
2672 /* force LE */
2673 iorder[0] = 0;
2674 iorder[1] = 1;
2675 iorder[2] = 2;
2676 iorder[3] = 3;
2677 }
2678 else if (bo == 1) {
2679 /* force BE */
2680 iorder[0] = 3;
2681 iorder[1] = 2;
2682 iorder[2] = 1;
2683 iorder[3] = 0;
2684 }
2685
Antoine Pitrou6107a682010-06-11 21:48:34 +00002686 /* On narrow builds we split characters outside the BMP into two
2687 codepoints => count how much extra space we need. */
2688#ifndef Py_UNICODE_WIDE
2689 for (qq = q; qq < e; qq += 4)
2690 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
2691 pairs++;
2692#endif
2693
2694 /* This might be one to much, because of a BOM */
2695 unicode = _PyUnicode_New((size+3)/4+pairs);
2696 if (!unicode)
2697 return NULL;
2698 if (size == 0)
2699 return (PyObject *)unicode;
2700
2701 /* Unpack UTF-32 encoded data */
2702 p = unicode->str;
2703
Walter Dörwald41980ca2007-08-16 21:55:45 +00002704 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002705 Py_UCS4 ch;
2706 /* remaining bytes at the end? (size should be divisible by 4) */
2707 if (e-q<4) {
2708 if (consumed)
2709 break;
2710 errmsg = "truncated data";
2711 startinpos = ((const char *)q)-starts;
2712 endinpos = ((const char *)e)-starts;
2713 goto utf32Error;
2714 /* The remaining input chars are ignored if the callback
2715 chooses to skip the input */
2716 }
2717 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2718 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002719
Benjamin Peterson29060642009-01-31 22:14:21 +00002720 if (ch >= 0x110000)
2721 {
2722 errmsg = "codepoint not in range(0x110000)";
2723 startinpos = ((const char *)q)-starts;
2724 endinpos = startinpos+4;
2725 goto utf32Error;
2726 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002727#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002728 if (ch >= 0x10000)
2729 {
2730 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2731 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2732 }
2733 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00002734#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002735 *p++ = ch;
2736 q += 4;
2737 continue;
2738 utf32Error:
2739 outpos = p-PyUnicode_AS_UNICODE(unicode);
2740 if (unicode_decode_call_errorhandler(
2741 errors, &errorHandler,
2742 "utf32", errmsg,
2743 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2744 &unicode, &outpos, &p))
2745 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002746 }
2747
2748 if (byteorder)
2749 *byteorder = bo;
2750
2751 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002752 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002753
2754 /* Adjust length */
2755 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2756 goto onError;
2757
2758 Py_XDECREF(errorHandler);
2759 Py_XDECREF(exc);
2760 return (PyObject *)unicode;
2761
Benjamin Peterson29060642009-01-31 22:14:21 +00002762 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00002763 Py_DECREF(unicode);
2764 Py_XDECREF(errorHandler);
2765 Py_XDECREF(exc);
2766 return NULL;
2767}
2768
2769PyObject *
2770PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002771 Py_ssize_t size,
2772 const char *errors,
2773 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002774{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002775 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002776 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002777 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002778#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002779 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002780#else
2781 const int pairs = 0;
2782#endif
2783 /* Offsets from p for storing byte pairs in the right order. */
2784#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2785 int iorder[] = {0, 1, 2, 3};
2786#else
2787 int iorder[] = {3, 2, 1, 0};
2788#endif
2789
Benjamin Peterson29060642009-01-31 22:14:21 +00002790#define STORECHAR(CH) \
2791 do { \
2792 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2793 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2794 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2795 p[iorder[0]] = (CH) & 0xff; \
2796 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00002797 } while(0)
2798
2799 /* In narrow builds we can output surrogate pairs as one codepoint,
2800 so we need less space. */
2801#ifndef Py_UNICODE_WIDE
2802 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002803 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2804 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2805 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002806#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002807 nsize = (size - pairs + (byteorder == 0));
2808 bytesize = nsize * 4;
2809 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00002810 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002811 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002812 if (v == NULL)
2813 return NULL;
2814
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002815 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002816 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002817 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002818 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002819 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002820
2821 if (byteorder == -1) {
2822 /* force LE */
2823 iorder[0] = 0;
2824 iorder[1] = 1;
2825 iorder[2] = 2;
2826 iorder[3] = 3;
2827 }
2828 else if (byteorder == 1) {
2829 /* force BE */
2830 iorder[0] = 3;
2831 iorder[1] = 2;
2832 iorder[2] = 1;
2833 iorder[3] = 0;
2834 }
2835
2836 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002837 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002838#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002839 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2840 Py_UCS4 ch2 = *s;
2841 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2842 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2843 s++;
2844 size--;
2845 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002846 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002847#endif
2848 STORECHAR(ch);
2849 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002850
2851 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002852 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002853#undef STORECHAR
2854}
2855
2856PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2857{
2858 if (!PyUnicode_Check(unicode)) {
2859 PyErr_BadArgument();
2860 return NULL;
2861 }
2862 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002863 PyUnicode_GET_SIZE(unicode),
2864 NULL,
2865 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002866}
2867
Guido van Rossumd57fd912000-03-10 22:53:23 +00002868/* --- UTF-16 Codec ------------------------------------------------------- */
2869
Tim Peters772747b2001-08-09 22:21:55 +00002870PyObject *
2871PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002872 Py_ssize_t size,
2873 const char *errors,
2874 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002875{
Walter Dörwald69652032004-09-07 20:24:22 +00002876 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2877}
2878
Antoine Pitrouab868312009-01-10 15:40:25 +00002879/* Two masks for fast checking of whether a C 'long' may contain
2880 UTF16-encoded surrogate characters. This is an efficient heuristic,
2881 assuming that non-surrogate characters with a code point >= 0x8000 are
2882 rare in most input.
2883 FAST_CHAR_MASK is used when the input is in native byte ordering,
2884 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00002885*/
Antoine Pitrouab868312009-01-10 15:40:25 +00002886#if (SIZEOF_LONG == 8)
2887# define FAST_CHAR_MASK 0x8000800080008000L
2888# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
2889#elif (SIZEOF_LONG == 4)
2890# define FAST_CHAR_MASK 0x80008000L
2891# define SWAPPED_FAST_CHAR_MASK 0x00800080L
2892#else
2893# error C 'long' size should be either 4 or 8!
2894#endif
2895
Walter Dörwald69652032004-09-07 20:24:22 +00002896PyObject *
2897PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002898 Py_ssize_t size,
2899 const char *errors,
2900 int *byteorder,
2901 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002902{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002903 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002904 Py_ssize_t startinpos;
2905 Py_ssize_t endinpos;
2906 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002907 PyUnicodeObject *unicode;
2908 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00002909 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00002910 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00002911 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002912 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002913 /* Offsets from q for retrieving byte pairs in the right order. */
2914#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2915 int ihi = 1, ilo = 0;
2916#else
2917 int ihi = 0, ilo = 1;
2918#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002919 PyObject *errorHandler = NULL;
2920 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002921
2922 /* Note: size will always be longer than the resulting Unicode
2923 character count */
2924 unicode = _PyUnicode_New(size);
2925 if (!unicode)
2926 return NULL;
2927 if (size == 0)
2928 return (PyObject *)unicode;
2929
2930 /* Unpack UTF-16 encoded data */
2931 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002932 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00002933 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002934
2935 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002936 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002937
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002938 /* Check for BOM marks (U+FEFF) in the input and adjust current
2939 byte order setting accordingly. In native mode, the leading BOM
2940 mark is skipped, in all other modes, it is copied to the output
2941 stream as-is (giving a ZWNBSP character). */
2942 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002943 if (size >= 2) {
2944 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002945#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002946 if (bom == 0xFEFF) {
2947 q += 2;
2948 bo = -1;
2949 }
2950 else if (bom == 0xFFFE) {
2951 q += 2;
2952 bo = 1;
2953 }
Tim Petersced69f82003-09-16 20:30:58 +00002954#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002955 if (bom == 0xFEFF) {
2956 q += 2;
2957 bo = 1;
2958 }
2959 else if (bom == 0xFFFE) {
2960 q += 2;
2961 bo = -1;
2962 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002963#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002964 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002965 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002966
Tim Peters772747b2001-08-09 22:21:55 +00002967 if (bo == -1) {
2968 /* force LE */
2969 ihi = 1;
2970 ilo = 0;
2971 }
2972 else if (bo == 1) {
2973 /* force BE */
2974 ihi = 0;
2975 ilo = 1;
2976 }
Antoine Pitrouab868312009-01-10 15:40:25 +00002977#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2978 native_ordering = ilo < ihi;
2979#else
2980 native_ordering = ilo > ihi;
2981#endif
Tim Peters772747b2001-08-09 22:21:55 +00002982
Antoine Pitrouab868312009-01-10 15:40:25 +00002983 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00002984 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002985 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00002986 /* First check for possible aligned read of a C 'long'. Unaligned
2987 reads are more expensive, better to defer to another iteration. */
2988 if (!((size_t) q & LONG_PTR_MASK)) {
2989 /* Fast path for runs of non-surrogate chars. */
2990 register const unsigned char *_q = q;
2991 Py_UNICODE *_p = p;
2992 if (native_ordering) {
2993 /* Native ordering is simple: as long as the input cannot
2994 possibly contain a surrogate char, do an unrolled copy
2995 of several 16-bit code points to the target object.
2996 The non-surrogate check is done on several input bytes
2997 at a time (as many as a C 'long' can contain). */
2998 while (_q < aligned_end) {
2999 unsigned long data = * (unsigned long *) _q;
3000 if (data & FAST_CHAR_MASK)
3001 break;
3002 _p[0] = ((unsigned short *) _q)[0];
3003 _p[1] = ((unsigned short *) _q)[1];
3004#if (SIZEOF_LONG == 8)
3005 _p[2] = ((unsigned short *) _q)[2];
3006 _p[3] = ((unsigned short *) _q)[3];
3007#endif
3008 _q += SIZEOF_LONG;
3009 _p += SIZEOF_LONG / 2;
3010 }
3011 }
3012 else {
3013 /* Byteswapped ordering is similar, but we must decompose
3014 the copy bytewise, and take care of zero'ing out the
3015 upper bytes if the target object is in 32-bit units
3016 (that is, in UCS-4 builds). */
3017 while (_q < aligned_end) {
3018 unsigned long data = * (unsigned long *) _q;
3019 if (data & SWAPPED_FAST_CHAR_MASK)
3020 break;
3021 /* Zero upper bytes in UCS-4 builds */
3022#if (Py_UNICODE_SIZE > 2)
3023 _p[0] = 0;
3024 _p[1] = 0;
3025#if (SIZEOF_LONG == 8)
3026 _p[2] = 0;
3027 _p[3] = 0;
3028#endif
3029#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003030 /* Issue #4916; UCS-4 builds on big endian machines must
3031 fill the two last bytes of each 4-byte unit. */
3032#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3033# define OFF 2
3034#else
3035# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00003036#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003037 ((unsigned char *) _p)[OFF + 1] = _q[0];
3038 ((unsigned char *) _p)[OFF + 0] = _q[1];
3039 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3040 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3041#if (SIZEOF_LONG == 8)
3042 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3043 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3044 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3045 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3046#endif
3047#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003048 _q += SIZEOF_LONG;
3049 _p += SIZEOF_LONG / 2;
3050 }
3051 }
3052 p = _p;
3053 q = _q;
3054 if (q >= e)
3055 break;
3056 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003057 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003058
Benjamin Peterson14339b62009-01-31 16:36:08 +00003059 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003060
3061 if (ch < 0xD800 || ch > 0xDFFF) {
3062 *p++ = ch;
3063 continue;
3064 }
3065
3066 /* UTF-16 code pair: */
3067 if (q > e) {
3068 errmsg = "unexpected end of data";
3069 startinpos = (((const char *)q) - 2) - starts;
3070 endinpos = ((const char *)e) + 1 - starts;
3071 goto utf16Error;
3072 }
3073 if (0xD800 <= ch && ch <= 0xDBFF) {
3074 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3075 q += 2;
3076 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003077#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003078 *p++ = ch;
3079 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003080#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003081 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003082#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003083 continue;
3084 }
3085 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003086 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003087 startinpos = (((const char *)q)-4)-starts;
3088 endinpos = startinpos+2;
3089 goto utf16Error;
3090 }
3091
Benjamin Peterson14339b62009-01-31 16:36:08 +00003092 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003093 errmsg = "illegal encoding";
3094 startinpos = (((const char *)q)-2)-starts;
3095 endinpos = startinpos+2;
3096 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003097
Benjamin Peterson29060642009-01-31 22:14:21 +00003098 utf16Error:
3099 outpos = p - PyUnicode_AS_UNICODE(unicode);
3100 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003101 errors,
3102 &errorHandler,
3103 "utf16", errmsg,
3104 &starts,
3105 (const char **)&e,
3106 &startinpos,
3107 &endinpos,
3108 &exc,
3109 (const char **)&q,
3110 &unicode,
3111 &outpos,
3112 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003113 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003114 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003115 /* remaining byte at the end? (size should be even) */
3116 if (e == q) {
3117 if (!consumed) {
3118 errmsg = "truncated data";
3119 startinpos = ((const char *)q) - starts;
3120 endinpos = ((const char *)e) + 1 - starts;
3121 outpos = p - PyUnicode_AS_UNICODE(unicode);
3122 if (unicode_decode_call_errorhandler(
3123 errors,
3124 &errorHandler,
3125 "utf16", errmsg,
3126 &starts,
3127 (const char **)&e,
3128 &startinpos,
3129 &endinpos,
3130 &exc,
3131 (const char **)&q,
3132 &unicode,
3133 &outpos,
3134 &p))
3135 goto onError;
3136 /* The remaining input chars are ignored if the callback
3137 chooses to skip the input */
3138 }
3139 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003140
3141 if (byteorder)
3142 *byteorder = bo;
3143
Walter Dörwald69652032004-09-07 20:24:22 +00003144 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003145 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003146
Guido van Rossumd57fd912000-03-10 22:53:23 +00003147 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003148 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003149 goto onError;
3150
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003151 Py_XDECREF(errorHandler);
3152 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003153 return (PyObject *)unicode;
3154
Benjamin Peterson29060642009-01-31 22:14:21 +00003155 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003156 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003157 Py_XDECREF(errorHandler);
3158 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003159 return NULL;
3160}
3161
Antoine Pitrouab868312009-01-10 15:40:25 +00003162#undef FAST_CHAR_MASK
3163#undef SWAPPED_FAST_CHAR_MASK
3164
Tim Peters772747b2001-08-09 22:21:55 +00003165PyObject *
3166PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003167 Py_ssize_t size,
3168 const char *errors,
3169 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003170{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003171 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003172 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003173 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003174#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003175 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003176#else
3177 const int pairs = 0;
3178#endif
Tim Peters772747b2001-08-09 22:21:55 +00003179 /* Offsets from p for storing byte pairs in the right order. */
3180#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3181 int ihi = 1, ilo = 0;
3182#else
3183 int ihi = 0, ilo = 1;
3184#endif
3185
Benjamin Peterson29060642009-01-31 22:14:21 +00003186#define STORECHAR(CH) \
3187 do { \
3188 p[ihi] = ((CH) >> 8) & 0xff; \
3189 p[ilo] = (CH) & 0xff; \
3190 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003191 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003192
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003193#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003194 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003195 if (s[i] >= 0x10000)
3196 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003197#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003198 /* 2 * (size + pairs + (byteorder == 0)) */
3199 if (size > PY_SSIZE_T_MAX ||
3200 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003201 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003202 nsize = size + pairs + (byteorder == 0);
3203 bytesize = nsize * 2;
3204 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003205 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003206 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003207 if (v == NULL)
3208 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003209
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003210 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003211 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003212 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003213 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003214 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003215
3216 if (byteorder == -1) {
3217 /* force LE */
3218 ihi = 1;
3219 ilo = 0;
3220 }
3221 else if (byteorder == 1) {
3222 /* force BE */
3223 ihi = 0;
3224 ilo = 1;
3225 }
3226
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003227 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003228 Py_UNICODE ch = *s++;
3229 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003230#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003231 if (ch >= 0x10000) {
3232 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3233 ch = 0xD800 | ((ch-0x10000) >> 10);
3234 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003235#endif
Tim Peters772747b2001-08-09 22:21:55 +00003236 STORECHAR(ch);
3237 if (ch2)
3238 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003239 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003240
3241 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003242 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003243#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003244}
3245
3246PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3247{
3248 if (!PyUnicode_Check(unicode)) {
3249 PyErr_BadArgument();
3250 return NULL;
3251 }
3252 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003253 PyUnicode_GET_SIZE(unicode),
3254 NULL,
3255 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003256}
3257
3258/* --- Unicode Escape Codec ----------------------------------------------- */
3259
Fredrik Lundh06d12682001-01-24 07:59:11 +00003260static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003261
Guido van Rossumd57fd912000-03-10 22:53:23 +00003262PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003263 Py_ssize_t size,
3264 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003265{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003266 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003267 Py_ssize_t startinpos;
3268 Py_ssize_t endinpos;
3269 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003270 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003271 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003272 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003273 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003274 char* message;
3275 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003276 PyObject *errorHandler = NULL;
3277 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003278
Guido van Rossumd57fd912000-03-10 22:53:23 +00003279 /* Escaped strings will always be longer than the resulting
3280 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003281 length after conversion to the true value.
3282 (but if the error callback returns a long replacement string
3283 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003284 v = _PyUnicode_New(size);
3285 if (v == NULL)
3286 goto onError;
3287 if (size == 0)
3288 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003289
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003290 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003291 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003292
Guido van Rossumd57fd912000-03-10 22:53:23 +00003293 while (s < end) {
3294 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003295 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003296 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003297
3298 /* Non-escape characters are interpreted as Unicode ordinals */
3299 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003300 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003301 continue;
3302 }
3303
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003304 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003305 /* \ - Escapes */
3306 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003307 c = *s++;
3308 if (s > end)
3309 c = '\0'; /* Invalid after \ */
3310 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003311
Benjamin Peterson29060642009-01-31 22:14:21 +00003312 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003313 case '\n': break;
3314 case '\\': *p++ = '\\'; break;
3315 case '\'': *p++ = '\''; break;
3316 case '\"': *p++ = '\"'; break;
3317 case 'b': *p++ = '\b'; break;
3318 case 'f': *p++ = '\014'; break; /* FF */
3319 case 't': *p++ = '\t'; break;
3320 case 'n': *p++ = '\n'; break;
3321 case 'r': *p++ = '\r'; break;
3322 case 'v': *p++ = '\013'; break; /* VT */
3323 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3324
Benjamin Peterson29060642009-01-31 22:14:21 +00003325 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003326 case '0': case '1': case '2': case '3':
3327 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003328 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003329 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003330 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003331 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003332 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003333 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003334 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003335 break;
3336
Benjamin Peterson29060642009-01-31 22:14:21 +00003337 /* hex escapes */
3338 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003339 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003340 digits = 2;
3341 message = "truncated \\xXX escape";
3342 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003343
Benjamin Peterson29060642009-01-31 22:14:21 +00003344 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003345 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003346 digits = 4;
3347 message = "truncated \\uXXXX escape";
3348 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003349
Benjamin Peterson29060642009-01-31 22:14:21 +00003350 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003351 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003352 digits = 8;
3353 message = "truncated \\UXXXXXXXX escape";
3354 hexescape:
3355 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003356 outpos = p-PyUnicode_AS_UNICODE(v);
3357 if (s+digits>end) {
3358 endinpos = size;
3359 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003360 errors, &errorHandler,
3361 "unicodeescape", "end of string in escape sequence",
3362 &starts, &end, &startinpos, &endinpos, &exc, &s,
3363 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003364 goto onError;
3365 goto nextByte;
3366 }
3367 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003368 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003369 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003370 endinpos = (s+i+1)-starts;
3371 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003372 errors, &errorHandler,
3373 "unicodeescape", message,
3374 &starts, &end, &startinpos, &endinpos, &exc, &s,
3375 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003376 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003377 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003378 }
3379 chr = (chr<<4) & ~0xF;
3380 if (c >= '0' && c <= '9')
3381 chr += c - '0';
3382 else if (c >= 'a' && c <= 'f')
3383 chr += 10 + c - 'a';
3384 else
3385 chr += 10 + c - 'A';
3386 }
3387 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003388 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003389 /* _decoding_error will have already written into the
3390 target buffer. */
3391 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003392 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003393 /* when we get here, chr is a 32-bit unicode character */
3394 if (chr <= 0xffff)
3395 /* UCS-2 character */
3396 *p++ = (Py_UNICODE) chr;
3397 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003398 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003399 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003400#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003401 *p++ = chr;
3402#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003403 chr -= 0x10000L;
3404 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003405 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003406#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003407 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003408 endinpos = s-starts;
3409 outpos = p-PyUnicode_AS_UNICODE(v);
3410 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003411 errors, &errorHandler,
3412 "unicodeescape", "illegal Unicode character",
3413 &starts, &end, &startinpos, &endinpos, &exc, &s,
3414 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003415 goto onError;
3416 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003417 break;
3418
Benjamin Peterson29060642009-01-31 22:14:21 +00003419 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003420 case 'N':
3421 message = "malformed \\N character escape";
3422 if (ucnhash_CAPI == NULL) {
3423 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003424 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003425 if (ucnhash_CAPI == NULL)
3426 goto ucnhashError;
3427 }
3428 if (*s == '{') {
3429 const char *start = s+1;
3430 /* look for the closing brace */
3431 while (*s != '}' && s < end)
3432 s++;
3433 if (s > start && s < end && *s == '}') {
3434 /* found a name. look it up in the unicode database */
3435 message = "unknown Unicode character name";
3436 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003437 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003438 goto store;
3439 }
3440 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003441 endinpos = s-starts;
3442 outpos = p-PyUnicode_AS_UNICODE(v);
3443 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003444 errors, &errorHandler,
3445 "unicodeescape", message,
3446 &starts, &end, &startinpos, &endinpos, &exc, &s,
3447 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003448 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003449 break;
3450
3451 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003452 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003453 message = "\\ at end of string";
3454 s--;
3455 endinpos = s-starts;
3456 outpos = p-PyUnicode_AS_UNICODE(v);
3457 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003458 errors, &errorHandler,
3459 "unicodeescape", message,
3460 &starts, &end, &startinpos, &endinpos, &exc, &s,
3461 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003462 goto onError;
3463 }
3464 else {
3465 *p++ = '\\';
3466 *p++ = (unsigned char)s[-1];
3467 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003468 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003469 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003470 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003471 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003472 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003473 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003474 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003475 Py_XDECREF(errorHandler);
3476 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003477 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003478
Benjamin Peterson29060642009-01-31 22:14:21 +00003479 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003480 PyErr_SetString(
3481 PyExc_UnicodeError,
3482 "\\N escapes not supported (can't load unicodedata module)"
3483 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003484 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003485 Py_XDECREF(errorHandler);
3486 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003487 return NULL;
3488
Benjamin Peterson29060642009-01-31 22:14:21 +00003489 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003490 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003491 Py_XDECREF(errorHandler);
3492 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003493 return NULL;
3494}
3495
/* Return a Unicode-Escape string version of the Unicode object.

   The encoder below takes no "quotes" argument: it produces only the
   escaped contents and never adds surrounding quote characters.

*/
3502
Thomas Wouters477c8d52006-05-27 19:21:47 +00003503Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003504 Py_ssize_t size,
3505 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00003506{
3507 /* like wcschr, but doesn't stop at NULL characters */
3508
3509 while (size-- > 0) {
3510 if (*s == ch)
3511 return s;
3512 s++;
3513 }
3514
3515 return NULL;
3516}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003517
Walter Dörwald79e913e2007-05-12 11:08:06 +00003518static const char *hexdigits = "0123456789abcdef";
3519
3520PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003521 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003522{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003523 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003524 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003525
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003526#ifdef Py_UNICODE_WIDE
3527 const Py_ssize_t expandsize = 10;
3528#else
3529 const Py_ssize_t expandsize = 6;
3530#endif
3531
Thomas Wouters89f507f2006-12-13 04:49:30 +00003532 /* XXX(nnorwitz): rather than over-allocating, it would be
3533 better to choose a different scheme. Perhaps scan the
3534 first N-chars of the string and allocate based on that size.
3535 */
3536 /* Initial allocation is based on the longest-possible unichr
3537 escape.
3538
3539 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3540 unichr, so in this case it's the longest unichr escape. In
3541 narrow (UTF-16) builds this is five chars per source unichr
3542 since there are two unichrs in the surrogate pair, so in narrow
3543 (UTF-16) builds it's not the longest unichr escape.
3544
3545 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3546 so in the narrow (UTF-16) build case it's the longest unichr
3547 escape.
3548 */
3549
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003550 if (size == 0)
3551 return PyBytes_FromStringAndSize(NULL, 0);
3552
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003553 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003554 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003555
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003556 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00003557 2
3558 + expandsize*size
3559 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003560 if (repr == NULL)
3561 return NULL;
3562
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003563 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003564
Guido van Rossumd57fd912000-03-10 22:53:23 +00003565 while (size-- > 0) {
3566 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003567
Walter Dörwald79e913e2007-05-12 11:08:06 +00003568 /* Escape backslashes */
3569 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003570 *p++ = '\\';
3571 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003572 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003573 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003574
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003575#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003576 /* Map 21-bit characters to '\U00xxxxxx' */
3577 else if (ch >= 0x10000) {
3578 *p++ = '\\';
3579 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003580 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3581 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3582 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3583 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3584 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3585 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3586 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3587 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00003588 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003589 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003590#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003591 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3592 else if (ch >= 0xD800 && ch < 0xDC00) {
3593 Py_UNICODE ch2;
3594 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003595
Benjamin Peterson29060642009-01-31 22:14:21 +00003596 ch2 = *s++;
3597 size--;
Georg Brandl0f147092010-08-01 20:54:22 +00003598 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003599 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3600 *p++ = '\\';
3601 *p++ = 'U';
3602 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3603 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3604 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3605 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3606 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3607 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3608 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3609 *p++ = hexdigits[ucs & 0x0000000F];
3610 continue;
3611 }
3612 /* Fall through: isolated surrogates are copied as-is */
3613 s--;
3614 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003615 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003616#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003617
Guido van Rossumd57fd912000-03-10 22:53:23 +00003618 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003619 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003620 *p++ = '\\';
3621 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003622 *p++ = hexdigits[(ch >> 12) & 0x000F];
3623 *p++ = hexdigits[(ch >> 8) & 0x000F];
3624 *p++ = hexdigits[(ch >> 4) & 0x000F];
3625 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003626 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003627
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003628 /* Map special whitespace to '\t', \n', '\r' */
3629 else if (ch == '\t') {
3630 *p++ = '\\';
3631 *p++ = 't';
3632 }
3633 else if (ch == '\n') {
3634 *p++ = '\\';
3635 *p++ = 'n';
3636 }
3637 else if (ch == '\r') {
3638 *p++ = '\\';
3639 *p++ = 'r';
3640 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003641
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003642 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003643 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003644 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003645 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003646 *p++ = hexdigits[(ch >> 4) & 0x000F];
3647 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003648 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003649
Guido van Rossumd57fd912000-03-10 22:53:23 +00003650 /* Copy everything else as-is */
3651 else
3652 *p++ = (char) ch;
3653 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003654
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003655 assert(p - PyBytes_AS_STRING(repr) > 0);
3656 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
3657 return NULL;
3658 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003659}
3660
Alexandre Vassalotti2056bed2008-12-27 19:46:35 +00003661PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003662{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003663 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003664 if (!PyUnicode_Check(unicode)) {
3665 PyErr_BadArgument();
3666 return NULL;
3667 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003668 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3669 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003670 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003671}
3672
3673/* --- Raw Unicode Escape Codec ------------------------------------------- */
3674
3675PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003676 Py_ssize_t size,
3677 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003678{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003679 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003680 Py_ssize_t startinpos;
3681 Py_ssize_t endinpos;
3682 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003683 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003684 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003685 const char *end;
3686 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003687 PyObject *errorHandler = NULL;
3688 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003689
Guido van Rossumd57fd912000-03-10 22:53:23 +00003690 /* Escaped strings will always be longer than the resulting
3691 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003692 length after conversion to the true value. (But decoding error
3693 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003694 v = _PyUnicode_New(size);
3695 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003696 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003697 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003698 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003699 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003700 end = s + size;
3701 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003702 unsigned char c;
3703 Py_UCS4 x;
3704 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003705 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003706
Benjamin Peterson29060642009-01-31 22:14:21 +00003707 /* Non-escape characters are interpreted as Unicode ordinals */
3708 if (*s != '\\') {
3709 *p++ = (unsigned char)*s++;
3710 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003711 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003712 startinpos = s-starts;
3713
3714 /* \u-escapes are only interpreted iff the number of leading
3715 backslashes if odd */
3716 bs = s;
3717 for (;s < end;) {
3718 if (*s != '\\')
3719 break;
3720 *p++ = (unsigned char)*s++;
3721 }
3722 if (((s - bs) & 1) == 0 ||
3723 s >= end ||
3724 (*s != 'u' && *s != 'U')) {
3725 continue;
3726 }
3727 p--;
3728 count = *s=='u' ? 4 : 8;
3729 s++;
3730
3731 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3732 outpos = p-PyUnicode_AS_UNICODE(v);
3733 for (x = 0, i = 0; i < count; ++i, ++s) {
3734 c = (unsigned char)*s;
3735 if (!ISXDIGIT(c)) {
3736 endinpos = s-starts;
3737 if (unicode_decode_call_errorhandler(
3738 errors, &errorHandler,
3739 "rawunicodeescape", "truncated \\uXXXX",
3740 &starts, &end, &startinpos, &endinpos, &exc, &s,
3741 &v, &outpos, &p))
3742 goto onError;
3743 goto nextByte;
3744 }
3745 x = (x<<4) & ~0xF;
3746 if (c >= '0' && c <= '9')
3747 x += c - '0';
3748 else if (c >= 'a' && c <= 'f')
3749 x += 10 + c - 'a';
3750 else
3751 x += 10 + c - 'A';
3752 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00003753 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00003754 /* UCS-2 character */
3755 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003756 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003757 /* UCS-4 character. Either store directly, or as
3758 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00003759#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003760 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003761#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003762 x -= 0x10000L;
3763 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3764 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00003765#endif
3766 } else {
3767 endinpos = s-starts;
3768 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003769 if (unicode_decode_call_errorhandler(
3770 errors, &errorHandler,
3771 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00003772 &starts, &end, &startinpos, &endinpos, &exc, &s,
3773 &v, &outpos, &p))
3774 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003775 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003776 nextByte:
3777 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003778 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003779 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003780 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003781 Py_XDECREF(errorHandler);
3782 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003783 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003784
Benjamin Peterson29060642009-01-31 22:14:21 +00003785 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003786 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003787 Py_XDECREF(errorHandler);
3788 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003789 return NULL;
3790}
3791
3792PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003793 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003794{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003795 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003796 char *p;
3797 char *q;
3798
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003799#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003800 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003801#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003802 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003803#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003804
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003805 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003806 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00003807
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003808 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003809 if (repr == NULL)
3810 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003811 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003812 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003813
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003814 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003815 while (size-- > 0) {
3816 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003817#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003818 /* Map 32-bit characters to '\Uxxxxxxxx' */
3819 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003820 *p++ = '\\';
3821 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003822 *p++ = hexdigits[(ch >> 28) & 0xf];
3823 *p++ = hexdigits[(ch >> 24) & 0xf];
3824 *p++ = hexdigits[(ch >> 20) & 0xf];
3825 *p++ = hexdigits[(ch >> 16) & 0xf];
3826 *p++ = hexdigits[(ch >> 12) & 0xf];
3827 *p++ = hexdigits[(ch >> 8) & 0xf];
3828 *p++ = hexdigits[(ch >> 4) & 0xf];
3829 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003830 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003831 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00003832#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003833 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3834 if (ch >= 0xD800 && ch < 0xDC00) {
3835 Py_UNICODE ch2;
3836 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003837
Benjamin Peterson29060642009-01-31 22:14:21 +00003838 ch2 = *s++;
3839 size--;
Georg Brandl0f147092010-08-01 20:54:22 +00003840 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003841 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3842 *p++ = '\\';
3843 *p++ = 'U';
3844 *p++ = hexdigits[(ucs >> 28) & 0xf];
3845 *p++ = hexdigits[(ucs >> 24) & 0xf];
3846 *p++ = hexdigits[(ucs >> 20) & 0xf];
3847 *p++ = hexdigits[(ucs >> 16) & 0xf];
3848 *p++ = hexdigits[(ucs >> 12) & 0xf];
3849 *p++ = hexdigits[(ucs >> 8) & 0xf];
3850 *p++ = hexdigits[(ucs >> 4) & 0xf];
3851 *p++ = hexdigits[ucs & 0xf];
3852 continue;
3853 }
3854 /* Fall through: isolated surrogates are copied as-is */
3855 s--;
3856 size++;
3857 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003858#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003859 /* Map 16-bit characters to '\uxxxx' */
3860 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003861 *p++ = '\\';
3862 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003863 *p++ = hexdigits[(ch >> 12) & 0xf];
3864 *p++ = hexdigits[(ch >> 8) & 0xf];
3865 *p++ = hexdigits[(ch >> 4) & 0xf];
3866 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003867 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003868 /* Copy everything else as-is */
3869 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003870 *p++ = (char) ch;
3871 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003872 size = p - q;
3873
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003874 assert(size > 0);
3875 if (_PyBytes_Resize(&repr, size) < 0)
3876 return NULL;
3877 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003878}
3879
3880PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3881{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003882 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003883 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003884 PyErr_BadArgument();
3885 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003886 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003887 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3888 PyUnicode_GET_SIZE(unicode));
3889
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003890 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003891}
3892
/* --- Unicode Internal Codec ------------------------------------------- */
3894
3895PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003896 Py_ssize_t size,
3897 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003898{
3899 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003900 Py_ssize_t startinpos;
3901 Py_ssize_t endinpos;
3902 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003903 PyUnicodeObject *v;
3904 Py_UNICODE *p;
3905 const char *end;
3906 const char *reason;
3907 PyObject *errorHandler = NULL;
3908 PyObject *exc = NULL;
3909
Neal Norwitzd43069c2006-01-08 01:12:10 +00003910#ifdef Py_UNICODE_WIDE
3911 Py_UNICODE unimax = PyUnicode_GetMax();
3912#endif
3913
Thomas Wouters89f507f2006-12-13 04:49:30 +00003914 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003915 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3916 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003917 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003918 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003919 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003920 p = PyUnicode_AS_UNICODE(v);
3921 end = s + size;
3922
3923 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00003924 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003925 /* We have to sanity check the raw data, otherwise doom looms for
3926 some malformed UCS-4 data. */
3927 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00003928#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003929 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00003930#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003931 end-s < Py_UNICODE_SIZE
3932 )
Benjamin Peterson29060642009-01-31 22:14:21 +00003933 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003934 startinpos = s - starts;
3935 if (end-s < Py_UNICODE_SIZE) {
3936 endinpos = end-starts;
3937 reason = "truncated input";
3938 }
3939 else {
3940 endinpos = s - starts + Py_UNICODE_SIZE;
3941 reason = "illegal code point (> 0x10FFFF)";
3942 }
3943 outpos = p - PyUnicode_AS_UNICODE(v);
3944 if (unicode_decode_call_errorhandler(
3945 errors, &errorHandler,
3946 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003947 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00003948 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003949 goto onError;
3950 }
3951 }
3952 else {
3953 p++;
3954 s += Py_UNICODE_SIZE;
3955 }
3956 }
3957
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003958 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003959 goto onError;
3960 Py_XDECREF(errorHandler);
3961 Py_XDECREF(exc);
3962 return (PyObject *)v;
3963
Benjamin Peterson29060642009-01-31 22:14:21 +00003964 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003965 Py_XDECREF(v);
3966 Py_XDECREF(errorHandler);
3967 Py_XDECREF(exc);
3968 return NULL;
3969}
3970
/* --- Latin-1 Codec ------------------------------------------------------ */
3972
3973PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003974 Py_ssize_t size,
3975 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003976{
3977 PyUnicodeObject *v;
3978 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00003979 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00003980
Guido van Rossumd57fd912000-03-10 22:53:23 +00003981 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003982 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003983 Py_UNICODE r = *(unsigned char*)s;
3984 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003985 }
3986
Guido van Rossumd57fd912000-03-10 22:53:23 +00003987 v = _PyUnicode_New(size);
3988 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003989 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003990 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003991 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003992 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00003993 e = s + size;
3994 /* Unrolling the copy makes it much faster by reducing the looping
3995 overhead. This is similar to what many memcpy() implementations do. */
3996 unrolled_end = e - 4;
3997 while (s < unrolled_end) {
3998 p[0] = (unsigned char) s[0];
3999 p[1] = (unsigned char) s[1];
4000 p[2] = (unsigned char) s[2];
4001 p[3] = (unsigned char) s[3];
4002 s += 4;
4003 p += 4;
4004 }
4005 while (s < e)
4006 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004007 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004008
Benjamin Peterson29060642009-01-31 22:14:21 +00004009 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004010 Py_XDECREF(v);
4011 return NULL;
4012}
4013
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004014/* create or adjust a UnicodeEncodeError */
4015static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004016 const char *encoding,
4017 const Py_UNICODE *unicode, Py_ssize_t size,
4018 Py_ssize_t startpos, Py_ssize_t endpos,
4019 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004020{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004021 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004022 *exceptionObject = PyUnicodeEncodeError_Create(
4023 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004024 }
4025 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004026 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4027 goto onError;
4028 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4029 goto onError;
4030 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4031 goto onError;
4032 return;
4033 onError:
4034 Py_DECREF(*exceptionObject);
4035 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004036 }
4037}
4038
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004039/* raises a UnicodeEncodeError */
4040static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004041 const char *encoding,
4042 const Py_UNICODE *unicode, Py_ssize_t size,
4043 Py_ssize_t startpos, Py_ssize_t endpos,
4044 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004045{
4046 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004047 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004048 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004049 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004050}
4051
4052/* error handling callback helper:
4053 build arguments, call the callback and check the arguments,
4054 put the result into newpos and return the replacement string, which
4055 has to be freed by the caller */
4056static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00004057 PyObject **errorHandler,
4058 const char *encoding, const char *reason,
4059 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4060 Py_ssize_t startpos, Py_ssize_t endpos,
4061 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004062{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004063 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004064
4065 PyObject *restuple;
4066 PyObject *resunicode;
4067
4068 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004069 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004070 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004071 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004072 }
4073
4074 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004075 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004076 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004077 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004078
4079 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004080 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004081 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004082 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004083 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004084 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004085 Py_DECREF(restuple);
4086 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004087 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004088 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004089 &resunicode, newpos)) {
4090 Py_DECREF(restuple);
4091 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004092 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004093 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4094 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4095 Py_DECREF(restuple);
4096 return NULL;
4097 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004098 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004099 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004100 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004101 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4102 Py_DECREF(restuple);
4103 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004104 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004105 Py_INCREF(resunicode);
4106 Py_DECREF(restuple);
4107 return resunicode;
4108}
4109
4110static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004111 Py_ssize_t size,
4112 const char *errors,
4113 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004114{
4115 /* output object */
4116 PyObject *res;
4117 /* pointers to the beginning and end+1 of input */
4118 const Py_UNICODE *startp = p;
4119 const Py_UNICODE *endp = p + size;
4120 /* pointer to the beginning of the unencodable characters */
4121 /* const Py_UNICODE *badp = NULL; */
4122 /* pointer into the output */
4123 char *str;
4124 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004125 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004126 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4127 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004128 PyObject *errorHandler = NULL;
4129 PyObject *exc = NULL;
4130 /* the following variable is used for caching string comparisons
4131 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4132 int known_errorHandler = -1;
4133
4134 /* allocate enough for a simple encoding without
4135 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004136 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004137 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004138 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004139 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004140 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004141 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004142 ressize = size;
4143
4144 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004145 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004146
Benjamin Peterson29060642009-01-31 22:14:21 +00004147 /* can we encode this? */
4148 if (c<limit) {
4149 /* no overflow check, because we know that the space is enough */
4150 *str++ = (char)c;
4151 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004152 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004153 else {
4154 Py_ssize_t unicodepos = p-startp;
4155 Py_ssize_t requiredsize;
4156 PyObject *repunicode;
4157 Py_ssize_t repsize;
4158 Py_ssize_t newpos;
4159 Py_ssize_t respos;
4160 Py_UNICODE *uni2;
4161 /* startpos for collecting unencodable chars */
4162 const Py_UNICODE *collstart = p;
4163 const Py_UNICODE *collend = p;
4164 /* find all unecodable characters */
4165 while ((collend < endp) && ((*collend)>=limit))
4166 ++collend;
4167 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4168 if (known_errorHandler==-1) {
4169 if ((errors==NULL) || (!strcmp(errors, "strict")))
4170 known_errorHandler = 1;
4171 else if (!strcmp(errors, "replace"))
4172 known_errorHandler = 2;
4173 else if (!strcmp(errors, "ignore"))
4174 known_errorHandler = 3;
4175 else if (!strcmp(errors, "xmlcharrefreplace"))
4176 known_errorHandler = 4;
4177 else
4178 known_errorHandler = 0;
4179 }
4180 switch (known_errorHandler) {
4181 case 1: /* strict */
4182 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4183 goto onError;
4184 case 2: /* replace */
4185 while (collstart++<collend)
4186 *str++ = '?'; /* fall through */
4187 case 3: /* ignore */
4188 p = collend;
4189 break;
4190 case 4: /* xmlcharrefreplace */
4191 respos = str - PyBytes_AS_STRING(res);
4192 /* determine replacement size (temporarily (mis)uses p) */
4193 for (p = collstart, repsize = 0; p < collend; ++p) {
4194 if (*p<10)
4195 repsize += 2+1+1;
4196 else if (*p<100)
4197 repsize += 2+2+1;
4198 else if (*p<1000)
4199 repsize += 2+3+1;
4200 else if (*p<10000)
4201 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004202#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004203 else
4204 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004205#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004206 else if (*p<100000)
4207 repsize += 2+5+1;
4208 else if (*p<1000000)
4209 repsize += 2+6+1;
4210 else
4211 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004212#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004213 }
4214 requiredsize = respos+repsize+(endp-collend);
4215 if (requiredsize > ressize) {
4216 if (requiredsize<2*ressize)
4217 requiredsize = 2*ressize;
4218 if (_PyBytes_Resize(&res, requiredsize))
4219 goto onError;
4220 str = PyBytes_AS_STRING(res) + respos;
4221 ressize = requiredsize;
4222 }
4223 /* generate replacement (temporarily (mis)uses p) */
4224 for (p = collstart; p < collend; ++p) {
4225 str += sprintf(str, "&#%d;", (int)*p);
4226 }
4227 p = collend;
4228 break;
4229 default:
4230 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4231 encoding, reason, startp, size, &exc,
4232 collstart-startp, collend-startp, &newpos);
4233 if (repunicode == NULL)
4234 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004235 if (PyBytes_Check(repunicode)) {
4236 /* Directly copy bytes result to output. */
4237 repsize = PyBytes_Size(repunicode);
4238 if (repsize > 1) {
4239 /* Make room for all additional bytes. */
Amaury Forgeot d'Arce5344d62009-06-29 22:38:54 +00004240 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004241 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4242 Py_DECREF(repunicode);
4243 goto onError;
4244 }
Amaury Forgeot d'Arce5344d62009-06-29 22:38:54 +00004245 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004246 ressize += repsize-1;
4247 }
4248 memcpy(str, PyBytes_AsString(repunicode), repsize);
4249 str += repsize;
4250 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004251 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004252 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004253 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004254 /* need more space? (at least enough for what we
4255 have+the replacement+the rest of the string, so
4256 we won't have to check space for encodable characters) */
4257 respos = str - PyBytes_AS_STRING(res);
4258 repsize = PyUnicode_GET_SIZE(repunicode);
4259 requiredsize = respos+repsize+(endp-collend);
4260 if (requiredsize > ressize) {
4261 if (requiredsize<2*ressize)
4262 requiredsize = 2*ressize;
4263 if (_PyBytes_Resize(&res, requiredsize)) {
4264 Py_DECREF(repunicode);
4265 goto onError;
4266 }
4267 str = PyBytes_AS_STRING(res) + respos;
4268 ressize = requiredsize;
4269 }
4270 /* check if there is anything unencodable in the replacement
4271 and copy it to the output */
4272 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4273 c = *uni2;
4274 if (c >= limit) {
4275 raise_encode_exception(&exc, encoding, startp, size,
4276 unicodepos, unicodepos+1, reason);
4277 Py_DECREF(repunicode);
4278 goto onError;
4279 }
4280 *str = (char)c;
4281 }
4282 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004283 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004284 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004285 }
4286 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004287 /* Resize if we allocated to much */
4288 size = str - PyBytes_AS_STRING(res);
4289 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004290 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004291 if (_PyBytes_Resize(&res, size) < 0)
4292 goto onError;
4293 }
4294
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004295 Py_XDECREF(errorHandler);
4296 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004297 return res;
4298
4299 onError:
4300 Py_XDECREF(res);
4301 Py_XDECREF(errorHandler);
4302 Py_XDECREF(exc);
4303 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004304}
4305
Guido van Rossumd57fd912000-03-10 22:53:23 +00004306PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004307 Py_ssize_t size,
4308 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004309{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004310 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004311}
4312
4313PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4314{
4315 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004316 PyErr_BadArgument();
4317 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004318 }
4319 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004320 PyUnicode_GET_SIZE(unicode),
4321 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004322}
4323
/* --- 7-bit ASCII Codec -------------------------------------------------- */
4325
Guido van Rossumd57fd912000-03-10 22:53:23 +00004326PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004327 Py_ssize_t size,
4328 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004329{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004330 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004331 PyUnicodeObject *v;
4332 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004333 Py_ssize_t startinpos;
4334 Py_ssize_t endinpos;
4335 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004336 const char *e;
4337 PyObject *errorHandler = NULL;
4338 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004339
Guido van Rossumd57fd912000-03-10 22:53:23 +00004340 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004341 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004342 Py_UNICODE r = *(unsigned char*)s;
4343 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004344 }
Tim Petersced69f82003-09-16 20:30:58 +00004345
Guido van Rossumd57fd912000-03-10 22:53:23 +00004346 v = _PyUnicode_New(size);
4347 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004348 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004349 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004350 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004351 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004352 e = s + size;
4353 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004354 register unsigned char c = (unsigned char)*s;
4355 if (c < 128) {
4356 *p++ = c;
4357 ++s;
4358 }
4359 else {
4360 startinpos = s-starts;
4361 endinpos = startinpos + 1;
4362 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4363 if (unicode_decode_call_errorhandler(
4364 errors, &errorHandler,
4365 "ascii", "ordinal not in range(128)",
4366 &starts, &e, &startinpos, &endinpos, &exc, &s,
4367 &v, &outpos, &p))
4368 goto onError;
4369 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004370 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004371 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004372 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4373 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004374 Py_XDECREF(errorHandler);
4375 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004376 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004377
Benjamin Peterson29060642009-01-31 22:14:21 +00004378 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004379 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004380 Py_XDECREF(errorHandler);
4381 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004382 return NULL;
4383}
4384
Guido van Rossumd57fd912000-03-10 22:53:23 +00004385PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004386 Py_ssize_t size,
4387 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004388{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004389 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004390}
4391
4392PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4393{
4394 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004395 PyErr_BadArgument();
4396 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004397 }
4398 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004399 PyUnicode_GET_SIZE(unicode),
4400 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004401}
4402
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004403#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004404
/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004406
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004407#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004408#define NEED_RETRY
4409#endif
4410
4411/* XXX This code is limited to "true" double-byte encodings, as
4412 a) it assumes an incomplete character consists of a single byte, and
4413 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004414 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004415
/* Return 1 when the byte at s[offset] is reported as a lead byte by
   IsDBCSLeadByte() and the character preceding it (found with CharPrev())
   is consistent with that reading; return 0 otherwise. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    /* No previous character: curr is the first byte of the string. */
    if (prev == curr)
        return 1;
    /* The previous character does not begin with a lead byte. */
    if (!IsDBCSLeadByte(*prev))
        return 1;
    /* The previous character is exactly two bytes wide. */
    return curr - prev == 2;
}
4426
4427/*
4428 * Decode MBCS string into unicode object. If 'final' is set, converts
4429 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4430 */
4431static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson29060642009-01-31 22:14:21 +00004432 const char *s, /* MBCS string */
4433 int size, /* sizeof MBCS string */
4434 int final)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004435{
4436 Py_UNICODE *p;
4437 Py_ssize_t n = 0;
4438 int usize = 0;
4439
4440 assert(size >= 0);
4441
4442 /* Skip trailing lead-byte unless 'final' is set */
4443 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004444 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004445
4446 /* First get the size of the result */
4447 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004448 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
4449 if (usize == 0) {
4450 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4451 return -1;
4452 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004453 }
4454
4455 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004456 /* Create unicode object */
4457 *v = _PyUnicode_New(usize);
4458 if (*v == NULL)
4459 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004460 }
4461 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004462 /* Extend unicode object */
4463 n = PyUnicode_GET_SIZE(*v);
4464 if (_PyUnicode_Resize(v, n + usize) < 0)
4465 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004466 }
4467
4468 /* Do the conversion */
4469 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004470 p = PyUnicode_AS_UNICODE(*v) + n;
4471 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
4472 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4473 return -1;
4474 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004475 }
4476
4477 return size;
4478}
4479
4480PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004481 Py_ssize_t size,
4482 const char *errors,
4483 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004484{
4485 PyUnicodeObject *v = NULL;
4486 int done;
4487
4488 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004489 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004490
4491#ifdef NEED_RETRY
4492 retry:
4493 if (size > INT_MAX)
Benjamin Peterson29060642009-01-31 22:14:21 +00004494 done = decode_mbcs(&v, s, INT_MAX, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004495 else
4496#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004497 done = decode_mbcs(&v, s, (int)size, !consumed);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004498
4499 if (done < 0) {
4500 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00004501 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004502 }
4503
4504 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004505 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004506
4507#ifdef NEED_RETRY
4508 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004509 s += done;
4510 size -= done;
4511 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004512 }
4513#endif
4514
4515 return (PyObject *)v;
4516}
4517
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004518PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004519 Py_ssize_t size,
4520 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004521{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004522 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4523}
4524
4525/*
4526 * Convert unicode into string object (MBCS).
4527 * Returns 0 if succeed, -1 otherwise.
4528 */
4529static int encode_mbcs(PyObject **repr,
Benjamin Peterson29060642009-01-31 22:14:21 +00004530 const Py_UNICODE *p, /* unicode */
4531 int size) /* size of unicode */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004532{
4533 int mbcssize = 0;
4534 Py_ssize_t n = 0;
4535
4536 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004537
4538 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004539 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004540 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4541 if (mbcssize == 0) {
4542 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4543 return -1;
4544 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004545 }
4546
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004547 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004548 /* Create string object */
4549 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
4550 if (*repr == NULL)
4551 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004552 }
4553 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004554 /* Extend string object */
4555 n = PyBytes_Size(*repr);
4556 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
4557 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004558 }
4559
4560 /* Do the conversion */
4561 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004562 char *s = PyBytes_AS_STRING(*repr) + n;
4563 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4564 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4565 return -1;
4566 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004567 }
4568
4569 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004570}
4571
4572PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004573 Py_ssize_t size,
4574 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004575{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004576 PyObject *repr = NULL;
4577 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004578
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004579#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00004580 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004581 if (size > INT_MAX)
Benjamin Peterson29060642009-01-31 22:14:21 +00004582 ret = encode_mbcs(&repr, p, INT_MAX);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004583 else
4584#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004585 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004586
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004587 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004588 Py_XDECREF(repr);
4589 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004590 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004591
4592#ifdef NEED_RETRY
4593 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004594 p += INT_MAX;
4595 size -= INT_MAX;
4596 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004597 }
4598#endif
4599
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004600 return repr;
4601}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004602
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004603PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4604{
4605 if (!PyUnicode_Check(unicode)) {
4606 PyErr_BadArgument();
4607 return NULL;
4608 }
4609 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004610 PyUnicode_GET_SIZE(unicode),
4611 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004612}
4613
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004614#undef NEED_RETRY
4615
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004616#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004617
Guido van Rossumd57fd912000-03-10 22:53:23 +00004618/* --- Character Mapping Codec -------------------------------------------- */
4619
Guido van Rossumd57fd912000-03-10 22:53:23 +00004620PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004621 Py_ssize_t size,
4622 PyObject *mapping,
4623 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004624{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004625 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004626 Py_ssize_t startinpos;
4627 Py_ssize_t endinpos;
4628 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004629 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004630 PyUnicodeObject *v;
4631 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004632 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004633 PyObject *errorHandler = NULL;
4634 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004635 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004636 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004637
Guido van Rossumd57fd912000-03-10 22:53:23 +00004638 /* Default to Latin-1 */
4639 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004640 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004641
4642 v = _PyUnicode_New(size);
4643 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004644 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004645 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004646 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004647 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004648 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004649 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004650 mapstring = PyUnicode_AS_UNICODE(mapping);
4651 maplen = PyUnicode_GET_SIZE(mapping);
4652 while (s < e) {
4653 unsigned char ch = *s;
4654 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004655
Benjamin Peterson29060642009-01-31 22:14:21 +00004656 if (ch < maplen)
4657 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004658
Benjamin Peterson29060642009-01-31 22:14:21 +00004659 if (x == 0xfffe) {
4660 /* undefined mapping */
4661 outpos = p-PyUnicode_AS_UNICODE(v);
4662 startinpos = s-starts;
4663 endinpos = startinpos+1;
4664 if (unicode_decode_call_errorhandler(
4665 errors, &errorHandler,
4666 "charmap", "character maps to <undefined>",
4667 &starts, &e, &startinpos, &endinpos, &exc, &s,
4668 &v, &outpos, &p)) {
4669 goto onError;
4670 }
4671 continue;
4672 }
4673 *p++ = x;
4674 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004675 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004676 }
4677 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004678 while (s < e) {
4679 unsigned char ch = *s;
4680 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004681
Benjamin Peterson29060642009-01-31 22:14:21 +00004682 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4683 w = PyLong_FromLong((long)ch);
4684 if (w == NULL)
4685 goto onError;
4686 x = PyObject_GetItem(mapping, w);
4687 Py_DECREF(w);
4688 if (x == NULL) {
4689 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4690 /* No mapping found means: mapping is undefined. */
4691 PyErr_Clear();
4692 x = Py_None;
4693 Py_INCREF(x);
4694 } else
4695 goto onError;
4696 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004697
Benjamin Peterson29060642009-01-31 22:14:21 +00004698 /* Apply mapping */
4699 if (PyLong_Check(x)) {
4700 long value = PyLong_AS_LONG(x);
4701 if (value < 0 || value > 65535) {
4702 PyErr_SetString(PyExc_TypeError,
4703 "character mapping must be in range(65536)");
4704 Py_DECREF(x);
4705 goto onError;
4706 }
4707 *p++ = (Py_UNICODE)value;
4708 }
4709 else if (x == Py_None) {
4710 /* undefined mapping */
4711 outpos = p-PyUnicode_AS_UNICODE(v);
4712 startinpos = s-starts;
4713 endinpos = startinpos+1;
4714 if (unicode_decode_call_errorhandler(
4715 errors, &errorHandler,
4716 "charmap", "character maps to <undefined>",
4717 &starts, &e, &startinpos, &endinpos, &exc, &s,
4718 &v, &outpos, &p)) {
4719 Py_DECREF(x);
4720 goto onError;
4721 }
4722 Py_DECREF(x);
4723 continue;
4724 }
4725 else if (PyUnicode_Check(x)) {
4726 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004727
Benjamin Peterson29060642009-01-31 22:14:21 +00004728 if (targetsize == 1)
4729 /* 1-1 mapping */
4730 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004731
Benjamin Peterson29060642009-01-31 22:14:21 +00004732 else if (targetsize > 1) {
4733 /* 1-n mapping */
4734 if (targetsize > extrachars) {
4735 /* resize first */
4736 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4737 Py_ssize_t needed = (targetsize - extrachars) + \
4738 (targetsize << 2);
4739 extrachars += needed;
4740 /* XXX overflow detection missing */
4741 if (_PyUnicode_Resize(&v,
4742 PyUnicode_GET_SIZE(v) + needed) < 0) {
4743 Py_DECREF(x);
4744 goto onError;
4745 }
4746 p = PyUnicode_AS_UNICODE(v) + oldpos;
4747 }
4748 Py_UNICODE_COPY(p,
4749 PyUnicode_AS_UNICODE(x),
4750 targetsize);
4751 p += targetsize;
4752 extrachars -= targetsize;
4753 }
4754 /* 1-0 mapping: skip the character */
4755 }
4756 else {
4757 /* wrong return value */
4758 PyErr_SetString(PyExc_TypeError,
4759 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00004760 Py_DECREF(x);
4761 goto onError;
4762 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004763 Py_DECREF(x);
4764 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004765 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004766 }
4767 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004768 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4769 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004770 Py_XDECREF(errorHandler);
4771 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004772 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004773
Benjamin Peterson29060642009-01-31 22:14:21 +00004774 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004775 Py_XDECREF(errorHandler);
4776 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004777 Py_XDECREF(v);
4778 return NULL;
4779}
4780
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004781/* Charmap encoding: the lookup table */
4782
4783struct encoding_map{
Benjamin Peterson29060642009-01-31 22:14:21 +00004784 PyObject_HEAD
4785 unsigned char level1[32];
4786 int count2, count3;
4787 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004788};
4789
4790static PyObject*
4791encoding_map_size(PyObject *obj, PyObject* args)
4792{
4793 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004794 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00004795 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004796}
4797
4798static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00004799 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00004800 PyDoc_STR("Return the size (in bytes) of this object") },
4801 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004802};
4803
4804static void
4805encoding_map_dealloc(PyObject* o)
4806{
Benjamin Peterson14339b62009-01-31 16:36:08 +00004807 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004808}
4809
4810static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00004811 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004812 "EncodingMap", /*tp_name*/
4813 sizeof(struct encoding_map), /*tp_basicsize*/
4814 0, /*tp_itemsize*/
4815 /* methods */
4816 encoding_map_dealloc, /*tp_dealloc*/
4817 0, /*tp_print*/
4818 0, /*tp_getattr*/
4819 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00004820 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00004821 0, /*tp_repr*/
4822 0, /*tp_as_number*/
4823 0, /*tp_as_sequence*/
4824 0, /*tp_as_mapping*/
4825 0, /*tp_hash*/
4826 0, /*tp_call*/
4827 0, /*tp_str*/
4828 0, /*tp_getattro*/
4829 0, /*tp_setattro*/
4830 0, /*tp_as_buffer*/
4831 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4832 0, /*tp_doc*/
4833 0, /*tp_traverse*/
4834 0, /*tp_clear*/
4835 0, /*tp_richcompare*/
4836 0, /*tp_weaklistoffset*/
4837 0, /*tp_iter*/
4838 0, /*tp_iternext*/
4839 encoding_map_methods, /*tp_methods*/
4840 0, /*tp_members*/
4841 0, /*tp_getset*/
4842 0, /*tp_base*/
4843 0, /*tp_dict*/
4844 0, /*tp_descr_get*/
4845 0, /*tp_descr_set*/
4846 0, /*tp_dictoffset*/
4847 0, /*tp_init*/
4848 0, /*tp_alloc*/
4849 0, /*tp_new*/
4850 0, /*tp_free*/
4851 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004852};
4853
4854PyObject*
4855PyUnicode_BuildEncodingMap(PyObject* string)
4856{
4857 Py_UNICODE *decode;
4858 PyObject *result;
4859 struct encoding_map *mresult;
4860 int i;
4861 int need_dict = 0;
4862 unsigned char level1[32];
4863 unsigned char level2[512];
4864 unsigned char *mlevel1, *mlevel2, *mlevel3;
4865 int count2 = 0, count3 = 0;
4866
4867 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4868 PyErr_BadArgument();
4869 return NULL;
4870 }
4871 decode = PyUnicode_AS_UNICODE(string);
4872 memset(level1, 0xFF, sizeof level1);
4873 memset(level2, 0xFF, sizeof level2);
4874
4875 /* If there isn't a one-to-one mapping of NULL to \0,
4876 or if there are non-BMP characters, we need to use
4877 a mapping dictionary. */
4878 if (decode[0] != 0)
4879 need_dict = 1;
4880 for (i = 1; i < 256; i++) {
4881 int l1, l2;
4882 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00004883#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004884 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00004885#endif
4886 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004887 need_dict = 1;
4888 break;
4889 }
4890 if (decode[i] == 0xFFFE)
4891 /* unmapped character */
4892 continue;
4893 l1 = decode[i] >> 11;
4894 l2 = decode[i] >> 7;
4895 if (level1[l1] == 0xFF)
4896 level1[l1] = count2++;
4897 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00004898 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004899 }
4900
4901 if (count2 >= 0xFF || count3 >= 0xFF)
4902 need_dict = 1;
4903
4904 if (need_dict) {
4905 PyObject *result = PyDict_New();
4906 PyObject *key, *value;
4907 if (!result)
4908 return NULL;
4909 for (i = 0; i < 256; i++) {
4910 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00004911 key = PyLong_FromLong(decode[i]);
4912 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004913 if (!key || !value)
4914 goto failed1;
4915 if (PyDict_SetItem(result, key, value) == -1)
4916 goto failed1;
4917 Py_DECREF(key);
4918 Py_DECREF(value);
4919 }
4920 return result;
4921 failed1:
4922 Py_XDECREF(key);
4923 Py_XDECREF(value);
4924 Py_DECREF(result);
4925 return NULL;
4926 }
4927
4928 /* Create a three-level trie */
4929 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4930 16*count2 + 128*count3 - 1);
4931 if (!result)
4932 return PyErr_NoMemory();
4933 PyObject_Init(result, &EncodingMapType);
4934 mresult = (struct encoding_map*)result;
4935 mresult->count2 = count2;
4936 mresult->count3 = count3;
4937 mlevel1 = mresult->level1;
4938 mlevel2 = mresult->level23;
4939 mlevel3 = mresult->level23 + 16*count2;
4940 memcpy(mlevel1, level1, 32);
4941 memset(mlevel2, 0xFF, 16*count2);
4942 memset(mlevel3, 0, 128*count3);
4943 count3 = 0;
4944 for (i = 1; i < 256; i++) {
4945 int o1, o2, o3, i2, i3;
4946 if (decode[i] == 0xFFFE)
4947 /* unmapped character */
4948 continue;
4949 o1 = decode[i]>>11;
4950 o2 = (decode[i]>>7) & 0xF;
4951 i2 = 16*mlevel1[o1] + o2;
4952 if (mlevel2[i2] == 0xFF)
4953 mlevel2[i2] = count3++;
4954 o3 = decode[i] & 0x7F;
4955 i3 = 128*mlevel2[i2] + o3;
4956 mlevel3[i3] = i;
4957 }
4958 return result;
4959}
4960
4961static int
4962encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4963{
4964 struct encoding_map *map = (struct encoding_map*)mapping;
4965 int l1 = c>>11;
4966 int l2 = (c>>7) & 0xF;
4967 int l3 = c & 0x7F;
4968 int i;
4969
4970#ifdef Py_UNICODE_WIDE
4971 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004972 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004973 }
4974#endif
4975 if (c == 0)
4976 return 0;
4977 /* level 1*/
4978 i = map->level1[l1];
4979 if (i == 0xFF) {
4980 return -1;
4981 }
4982 /* level 2*/
4983 i = map->level23[16*i+l2];
4984 if (i == 0xFF) {
4985 return -1;
4986 }
4987 /* level 3 */
4988 i = map->level23[16*map->count2 + 128*i + l3];
4989 if (i == 0) {
4990 return -1;
4991 }
4992 return i;
4993}
4994
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004995/* Lookup the character ch in the mapping. If the character
4996 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004997 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004998static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004999{
Christian Heimes217cfd12007-12-02 14:31:20 +00005000 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005001 PyObject *x;
5002
5003 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005004 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005005 x = PyObject_GetItem(mapping, w);
5006 Py_DECREF(w);
5007 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005008 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5009 /* No mapping found means: mapping is undefined. */
5010 PyErr_Clear();
5011 x = Py_None;
5012 Py_INCREF(x);
5013 return x;
5014 } else
5015 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005016 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00005017 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005018 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00005019 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005020 long value = PyLong_AS_LONG(x);
5021 if (value < 0 || value > 255) {
5022 PyErr_SetString(PyExc_TypeError,
5023 "character mapping must be in range(256)");
5024 Py_DECREF(x);
5025 return NULL;
5026 }
5027 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005028 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005029 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00005030 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005031 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005032 /* wrong return value */
5033 PyErr_Format(PyExc_TypeError,
5034 "character mapping must return integer, bytes or None, not %.400s",
5035 x->ob_type->tp_name);
5036 Py_DECREF(x);
5037 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005038 }
5039}
5040
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005041static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005042charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005043{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005044 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5045 /* exponentially overallocate to minimize reallocations */
5046 if (requiredsize < 2*outsize)
5047 requiredsize = 2*outsize;
5048 if (_PyBytes_Resize(outobj, requiredsize))
5049 return -1;
5050 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005051}
5052
/* Outcome of encoding a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an error occurred during lookup or resize */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005056/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005057 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005058 space is available. Return a new reference to the object that
5059 was put in the output buffer, or Py_None, if the mapping was undefined
5060 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005061 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005062static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005063charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00005064 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005065{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005066 PyObject *rep;
5067 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005068 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005069
Christian Heimes90aa7642007-12-19 02:45:37 +00005070 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005071 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005072 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005073 if (res == -1)
5074 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005075 if (outsize<requiredsize)
5076 if (charmapencode_resize(outobj, outpos, requiredsize))
5077 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005078 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005079 outstart[(*outpos)++] = (char)res;
5080 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005081 }
5082
5083 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005084 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005085 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005086 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005087 Py_DECREF(rep);
5088 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005089 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005090 if (PyLong_Check(rep)) {
5091 Py_ssize_t requiredsize = *outpos+1;
5092 if (outsize<requiredsize)
5093 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5094 Py_DECREF(rep);
5095 return enc_EXCEPTION;
5096 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005097 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005098 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005099 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005100 else {
5101 const char *repchars = PyBytes_AS_STRING(rep);
5102 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5103 Py_ssize_t requiredsize = *outpos+repsize;
5104 if (outsize<requiredsize)
5105 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5106 Py_DECREF(rep);
5107 return enc_EXCEPTION;
5108 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005109 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005110 memcpy(outstart + *outpos, repchars, repsize);
5111 *outpos += repsize;
5112 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005113 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005114 Py_DECREF(rep);
5115 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005116}
5117
5118/* handle an error in PyUnicode_EncodeCharmap
5119 Return 0 on success, -1 on error */
5120static
5121int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005122 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005123 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005124 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005125 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005126{
5127 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005128 Py_ssize_t repsize;
5129 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005130 Py_UNICODE *uni2;
5131 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005132 Py_ssize_t collstartpos = *inpos;
5133 Py_ssize_t collendpos = *inpos+1;
5134 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005135 char *encoding = "charmap";
5136 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005137 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005138
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005139 /* find all unencodable characters */
5140 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005141 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005142 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005143 int res = encoding_map_lookup(p[collendpos], mapping);
5144 if (res != -1)
5145 break;
5146 ++collendpos;
5147 continue;
5148 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005149
Benjamin Peterson29060642009-01-31 22:14:21 +00005150 rep = charmapencode_lookup(p[collendpos], mapping);
5151 if (rep==NULL)
5152 return -1;
5153 else if (rep!=Py_None) {
5154 Py_DECREF(rep);
5155 break;
5156 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005157 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005158 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005159 }
5160 /* cache callback name lookup
5161 * (if not done yet, i.e. it's the first error) */
5162 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005163 if ((errors==NULL) || (!strcmp(errors, "strict")))
5164 *known_errorHandler = 1;
5165 else if (!strcmp(errors, "replace"))
5166 *known_errorHandler = 2;
5167 else if (!strcmp(errors, "ignore"))
5168 *known_errorHandler = 3;
5169 else if (!strcmp(errors, "xmlcharrefreplace"))
5170 *known_errorHandler = 4;
5171 else
5172 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005173 }
5174 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005175 case 1: /* strict */
5176 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5177 return -1;
5178 case 2: /* replace */
5179 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005180 x = charmapencode_output('?', mapping, res, respos);
5181 if (x==enc_EXCEPTION) {
5182 return -1;
5183 }
5184 else if (x==enc_FAILED) {
5185 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5186 return -1;
5187 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005188 }
5189 /* fall through */
5190 case 3: /* ignore */
5191 *inpos = collendpos;
5192 break;
5193 case 4: /* xmlcharrefreplace */
5194 /* generate replacement (temporarily (mis)uses p) */
5195 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005196 char buffer[2+29+1+1];
5197 char *cp;
5198 sprintf(buffer, "&#%d;", (int)p[collpos]);
5199 for (cp = buffer; *cp; ++cp) {
5200 x = charmapencode_output(*cp, mapping, res, respos);
5201 if (x==enc_EXCEPTION)
5202 return -1;
5203 else if (x==enc_FAILED) {
5204 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5205 return -1;
5206 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005207 }
5208 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005209 *inpos = collendpos;
5210 break;
5211 default:
5212 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005213 encoding, reason, p, size, exceptionObject,
5214 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005215 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005216 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005217 if (PyBytes_Check(repunicode)) {
5218 /* Directly copy bytes result to output. */
5219 Py_ssize_t outsize = PyBytes_Size(*res);
5220 Py_ssize_t requiredsize;
5221 repsize = PyBytes_Size(repunicode);
5222 requiredsize = *respos + repsize;
5223 if (requiredsize > outsize)
5224 /* Make room for all additional bytes. */
5225 if (charmapencode_resize(res, respos, requiredsize)) {
5226 Py_DECREF(repunicode);
5227 return -1;
5228 }
5229 memcpy(PyBytes_AsString(*res) + *respos,
5230 PyBytes_AsString(repunicode), repsize);
5231 *respos += repsize;
5232 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005233 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005234 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005235 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005236 /* generate replacement */
5237 repsize = PyUnicode_GET_SIZE(repunicode);
5238 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005239 x = charmapencode_output(*uni2, mapping, res, respos);
5240 if (x==enc_EXCEPTION) {
5241 return -1;
5242 }
5243 else if (x==enc_FAILED) {
5244 Py_DECREF(repunicode);
5245 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5246 return -1;
5247 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005248 }
5249 *inpos = newpos;
5250 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005251 }
5252 return 0;
5253}
5254
Guido van Rossumd57fd912000-03-10 22:53:23 +00005255PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005256 Py_ssize_t size,
5257 PyObject *mapping,
5258 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005259{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005260 /* output object */
5261 PyObject *res = NULL;
5262 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005263 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005264 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005265 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005266 PyObject *errorHandler = NULL;
5267 PyObject *exc = NULL;
5268 /* the following variable is used for caching string comparisons
5269 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5270 * 3=ignore, 4=xmlcharrefreplace */
5271 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005272
5273 /* Default to Latin-1 */
5274 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005275 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005276
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005277 /* allocate enough for a simple encoding without
5278 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005279 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005280 if (res == NULL)
5281 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005282 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005283 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005284
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005285 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005286 /* try to encode it */
5287 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5288 if (x==enc_EXCEPTION) /* error */
5289 goto onError;
5290 if (x==enc_FAILED) { /* unencodable character */
5291 if (charmap_encoding_error(p, size, &inpos, mapping,
5292 &exc,
5293 &known_errorHandler, &errorHandler, errors,
5294 &res, &respos)) {
5295 goto onError;
5296 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005297 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005298 else
5299 /* done with this character => adjust input position */
5300 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005301 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005302
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005303 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005304 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005305 if (_PyBytes_Resize(&res, respos) < 0)
5306 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005307
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005308 Py_XDECREF(exc);
5309 Py_XDECREF(errorHandler);
5310 return res;
5311
Benjamin Peterson29060642009-01-31 22:14:21 +00005312 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005313 Py_XDECREF(res);
5314 Py_XDECREF(exc);
5315 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005316 return NULL;
5317}
5318
5319PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00005320 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005321{
5322 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005323 PyErr_BadArgument();
5324 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005325 }
5326 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005327 PyUnicode_GET_SIZE(unicode),
5328 mapping,
5329 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005330}
5331
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005332/* create or adjust a UnicodeTranslateError */
5333static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005334 const Py_UNICODE *unicode, Py_ssize_t size,
5335 Py_ssize_t startpos, Py_ssize_t endpos,
5336 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005337{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005338 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005339 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005340 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005341 }
5342 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005343 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5344 goto onError;
5345 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5346 goto onError;
5347 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5348 goto onError;
5349 return;
5350 onError:
5351 Py_DECREF(*exceptionObject);
5352 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005353 }
5354}
5355
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005356/* raises a UnicodeTranslateError */
5357static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005358 const Py_UNICODE *unicode, Py_ssize_t size,
5359 Py_ssize_t startpos, Py_ssize_t endpos,
5360 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005361{
5362 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005363 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005364 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005365 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005366}
5367
5368/* error handling callback helper:
5369 build arguments, call the callback and check the arguments,
5370 put the result into newpos and return the replacement string, which
5371 has to be freed by the caller */
5372static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00005373 PyObject **errorHandler,
5374 const char *reason,
5375 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5376 Py_ssize_t startpos, Py_ssize_t endpos,
5377 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005378{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005379 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005380
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005381 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005382 PyObject *restuple;
5383 PyObject *resunicode;
5384
5385 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005386 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005387 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005388 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005389 }
5390
5391 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005392 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005393 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005394 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005395
5396 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005397 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005398 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005399 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005400 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005401 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005402 Py_DECREF(restuple);
5403 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005404 }
5405 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00005406 &resunicode, &i_newpos)) {
5407 Py_DECREF(restuple);
5408 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005409 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005410 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005411 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005412 else
5413 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005414 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005415 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5416 Py_DECREF(restuple);
5417 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005418 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005419 Py_INCREF(resunicode);
5420 Py_DECREF(restuple);
5421 return resunicode;
5422}
5423
5424/* Lookup the character ch in the mapping and put the result in result,
5425 which must be decrefed by the caller.
5426 Return 0 on success, -1 on error */
5427static
5428int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5429{
Christian Heimes217cfd12007-12-02 14:31:20 +00005430 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005431 PyObject *x;
5432
5433 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005434 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005435 x = PyObject_GetItem(mapping, w);
5436 Py_DECREF(w);
5437 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005438 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5439 /* No mapping found means: use 1:1 mapping. */
5440 PyErr_Clear();
5441 *result = NULL;
5442 return 0;
5443 } else
5444 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005445 }
5446 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005447 *result = x;
5448 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005449 }
Christian Heimes217cfd12007-12-02 14:31:20 +00005450 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005451 long value = PyLong_AS_LONG(x);
5452 long max = PyUnicode_GetMax();
5453 if (value < 0 || value > max) {
5454 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00005455 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00005456 Py_DECREF(x);
5457 return -1;
5458 }
5459 *result = x;
5460 return 0;
5461 }
5462 else if (PyUnicode_Check(x)) {
5463 *result = x;
5464 return 0;
5465 }
5466 else {
5467 /* wrong return value */
5468 PyErr_SetString(PyExc_TypeError,
5469 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005470 Py_DECREF(x);
5471 return -1;
5472 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005473}
5474/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00005475 if not reallocate and adjust various state variables.
5476 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005477static
Walter Dörwald4894c302003-10-24 14:25:28 +00005478int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005479 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005480{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005481 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00005482 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005483 /* remember old output position */
5484 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
5485 /* exponentially overallocate to minimize reallocations */
5486 if (requiredsize < 2 * oldsize)
5487 requiredsize = 2 * oldsize;
5488 if (PyUnicode_Resize(outobj, requiredsize) < 0)
5489 return -1;
5490 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005491 }
5492 return 0;
5493}
5494/* lookup the character, put the result in the output string and adjust
5495 various state variables. Return a new reference to the object that
5496 was put in the output buffer in *result, or Py_None, if the mapping was
5497 undefined (in which case no character was written).
5498 The called must decref result.
5499 Return 0 on success, -1 on error. */
5500static
Walter Dörwald4894c302003-10-24 14:25:28 +00005501int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005502 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5503 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005504{
Walter Dörwald4894c302003-10-24 14:25:28 +00005505 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00005506 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005507 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005508 /* not found => default to 1:1 mapping */
5509 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005510 }
5511 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005512 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00005513 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005514 /* no overflow check, because we know that the space is enough */
5515 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005516 }
5517 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005518 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5519 if (repsize==1) {
5520 /* no overflow check, because we know that the space is enough */
5521 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5522 }
5523 else if (repsize!=0) {
5524 /* more than one character */
5525 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5526 (insize - (curinp-startinp)) +
5527 repsize - 1;
5528 if (charmaptranslate_makespace(outobj, outp, requiredsize))
5529 return -1;
5530 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5531 *outp += repsize;
5532 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005533 }
5534 else
Benjamin Peterson29060642009-01-31 22:14:21 +00005535 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005536 return 0;
5537}
5538
5539PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005540 Py_ssize_t size,
5541 PyObject *mapping,
5542 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005543{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005544 /* output object */
5545 PyObject *res = NULL;
5546 /* pointers to the beginning and end+1 of input */
5547 const Py_UNICODE *startp = p;
5548 const Py_UNICODE *endp = p + size;
5549 /* pointer into the output */
5550 Py_UNICODE *str;
5551 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005552 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005553 char *reason = "character maps to <undefined>";
5554 PyObject *errorHandler = NULL;
5555 PyObject *exc = NULL;
5556 /* the following variable is used for caching string comparisons
5557 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5558 * 3=ignore, 4=xmlcharrefreplace */
5559 int known_errorHandler = -1;
5560
Guido van Rossumd57fd912000-03-10 22:53:23 +00005561 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005562 PyErr_BadArgument();
5563 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005564 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005565
5566 /* allocate enough for a simple 1:1 translation without
5567 replacements, if we need more, we'll resize */
5568 res = PyUnicode_FromUnicode(NULL, size);
5569 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005570 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005571 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005572 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005573 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005574
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005575 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005576 /* try to encode it */
5577 PyObject *x = NULL;
5578 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5579 Py_XDECREF(x);
5580 goto onError;
5581 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005582 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00005583 if (x!=Py_None) /* it worked => adjust input pointer */
5584 ++p;
5585 else { /* untranslatable character */
5586 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5587 Py_ssize_t repsize;
5588 Py_ssize_t newpos;
5589 Py_UNICODE *uni2;
5590 /* startpos for collecting untranslatable chars */
5591 const Py_UNICODE *collstart = p;
5592 const Py_UNICODE *collend = p+1;
5593 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005594
Benjamin Peterson29060642009-01-31 22:14:21 +00005595 /* find all untranslatable characters */
5596 while (collend < endp) {
5597 if (charmaptranslate_lookup(*collend, mapping, &x))
5598 goto onError;
5599 Py_XDECREF(x);
5600 if (x!=Py_None)
5601 break;
5602 ++collend;
5603 }
5604 /* cache callback name lookup
5605 * (if not done yet, i.e. it's the first error) */
5606 if (known_errorHandler==-1) {
5607 if ((errors==NULL) || (!strcmp(errors, "strict")))
5608 known_errorHandler = 1;
5609 else if (!strcmp(errors, "replace"))
5610 known_errorHandler = 2;
5611 else if (!strcmp(errors, "ignore"))
5612 known_errorHandler = 3;
5613 else if (!strcmp(errors, "xmlcharrefreplace"))
5614 known_errorHandler = 4;
5615 else
5616 known_errorHandler = 0;
5617 }
5618 switch (known_errorHandler) {
5619 case 1: /* strict */
5620 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005621 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005622 case 2: /* replace */
5623 /* No need to check for space, this is a 1:1 replacement */
5624 for (coll = collstart; coll<collend; ++coll)
5625 *str++ = '?';
5626 /* fall through */
5627 case 3: /* ignore */
5628 p = collend;
5629 break;
5630 case 4: /* xmlcharrefreplace */
5631 /* generate replacement (temporarily (mis)uses p) */
5632 for (p = collstart; p < collend; ++p) {
5633 char buffer[2+29+1+1];
5634 char *cp;
5635 sprintf(buffer, "&#%d;", (int)*p);
5636 if (charmaptranslate_makespace(&res, &str,
5637 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5638 goto onError;
5639 for (cp = buffer; *cp; ++cp)
5640 *str++ = *cp;
5641 }
5642 p = collend;
5643 break;
5644 default:
5645 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5646 reason, startp, size, &exc,
5647 collstart-startp, collend-startp, &newpos);
5648 if (repunicode == NULL)
5649 goto onError;
5650 /* generate replacement */
5651 repsize = PyUnicode_GET_SIZE(repunicode);
5652 if (charmaptranslate_makespace(&res, &str,
5653 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5654 Py_DECREF(repunicode);
5655 goto onError;
5656 }
5657 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5658 *str++ = *uni2;
5659 p = startp + newpos;
5660 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005661 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005662 }
5663 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005664 /* Resize if we allocated to much */
5665 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005666 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005667 if (PyUnicode_Resize(&res, respos) < 0)
5668 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005669 }
5670 Py_XDECREF(exc);
5671 Py_XDECREF(errorHandler);
5672 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005673
Benjamin Peterson29060642009-01-31 22:14:21 +00005674 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005675 Py_XDECREF(res);
5676 Py_XDECREF(exc);
5677 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005678 return NULL;
5679}
5680
5681PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005682 PyObject *mapping,
5683 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005684{
5685 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005686
Guido van Rossumd57fd912000-03-10 22:53:23 +00005687 str = PyUnicode_FromObject(str);
5688 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005689 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005690 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00005691 PyUnicode_GET_SIZE(str),
5692 mapping,
5693 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005694 Py_DECREF(str);
5695 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005696
Benjamin Peterson29060642009-01-31 22:14:21 +00005697 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005698 Py_XDECREF(str);
5699 return NULL;
5700}
Tim Petersced69f82003-09-16 20:30:58 +00005701
Guido van Rossum9e896b32000-04-05 20:11:21 +00005702/* --- Decimal Encoder ---------------------------------------------------- */
5703
5704int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005705 Py_ssize_t length,
5706 char *output,
5707 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005708{
5709 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005710 PyObject *errorHandler = NULL;
5711 PyObject *exc = NULL;
5712 const char *encoding = "decimal";
5713 const char *reason = "invalid decimal Unicode string";
5714 /* the following variable is used for caching string comparisons
5715 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5716 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005717
5718 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005719 PyErr_BadArgument();
5720 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005721 }
5722
5723 p = s;
5724 end = s + length;
5725 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005726 register Py_UNICODE ch = *p;
5727 int decimal;
5728 PyObject *repunicode;
5729 Py_ssize_t repsize;
5730 Py_ssize_t newpos;
5731 Py_UNICODE *uni2;
5732 Py_UNICODE *collstart;
5733 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005734
Benjamin Peterson29060642009-01-31 22:14:21 +00005735 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005736 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00005737 ++p;
5738 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005739 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005740 decimal = Py_UNICODE_TODECIMAL(ch);
5741 if (decimal >= 0) {
5742 *output++ = '0' + decimal;
5743 ++p;
5744 continue;
5745 }
5746 if (0 < ch && ch < 256) {
5747 *output++ = (char)ch;
5748 ++p;
5749 continue;
5750 }
5751 /* All other characters are considered unencodable */
5752 collstart = p;
5753 collend = p+1;
5754 while (collend < end) {
5755 if ((0 < *collend && *collend < 256) ||
5756 !Py_UNICODE_ISSPACE(*collend) ||
5757 Py_UNICODE_TODECIMAL(*collend))
5758 break;
5759 }
5760 /* cache callback name lookup
5761 * (if not done yet, i.e. it's the first error) */
5762 if (known_errorHandler==-1) {
5763 if ((errors==NULL) || (!strcmp(errors, "strict")))
5764 known_errorHandler = 1;
5765 else if (!strcmp(errors, "replace"))
5766 known_errorHandler = 2;
5767 else if (!strcmp(errors, "ignore"))
5768 known_errorHandler = 3;
5769 else if (!strcmp(errors, "xmlcharrefreplace"))
5770 known_errorHandler = 4;
5771 else
5772 known_errorHandler = 0;
5773 }
5774 switch (known_errorHandler) {
5775 case 1: /* strict */
5776 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5777 goto onError;
5778 case 2: /* replace */
5779 for (p = collstart; p < collend; ++p)
5780 *output++ = '?';
5781 /* fall through */
5782 case 3: /* ignore */
5783 p = collend;
5784 break;
5785 case 4: /* xmlcharrefreplace */
5786 /* generate replacement (temporarily (mis)uses p) */
5787 for (p = collstart; p < collend; ++p)
5788 output += sprintf(output, "&#%d;", (int)*p);
5789 p = collend;
5790 break;
5791 default:
5792 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5793 encoding, reason, s, length, &exc,
5794 collstart-s, collend-s, &newpos);
5795 if (repunicode == NULL)
5796 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005797 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00005798 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005799 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
5800 Py_DECREF(repunicode);
5801 goto onError;
5802 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005803 /* generate replacement */
5804 repsize = PyUnicode_GET_SIZE(repunicode);
5805 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5806 Py_UNICODE ch = *uni2;
5807 if (Py_UNICODE_ISSPACE(ch))
5808 *output++ = ' ';
5809 else {
5810 decimal = Py_UNICODE_TODECIMAL(ch);
5811 if (decimal >= 0)
5812 *output++ = '0' + decimal;
5813 else if (0 < ch && ch < 256)
5814 *output++ = (char)ch;
5815 else {
5816 Py_DECREF(repunicode);
5817 raise_encode_exception(&exc, encoding,
5818 s, length, collstart-s, collend-s, reason);
5819 goto onError;
5820 }
5821 }
5822 }
5823 p = s + newpos;
5824 Py_DECREF(repunicode);
5825 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005826 }
5827 /* 0-terminate the output string */
5828 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005829 Py_XDECREF(exc);
5830 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005831 return 0;
5832
Benjamin Peterson29060642009-01-31 22:14:21 +00005833 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005834 Py_XDECREF(exc);
5835 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005836 return -1;
5837}
5838
Guido van Rossumd57fd912000-03-10 22:53:23 +00005839/* --- Helpers ------------------------------------------------------------ */
5840
Eric Smith8c663262007-08-25 02:26:07 +00005841#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005842#include "stringlib/fastsearch.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005843#include "stringlib/count.h"
Christian Heimes9cd17752007-11-18 19:35:23 +00005844/* Include _ParseTupleFinds from find.h */
5845#define FROM_UNICODE
Thomas Wouters477c8d52006-05-27 19:21:47 +00005846#include "stringlib/find.h"
5847#include "stringlib/partition.h"
5848
Eric Smith5807c412008-05-11 21:00:57 +00005849#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00005850#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00005851#include "stringlib/localeutil.h"
5852
/* Helper macro to fix up start/end slice values against (obj)->length.
   It reads and updates the variables `start` and `end` of the enclosing
   scope: negative values are taken relative to the length and then
   clamped to 0, and `end` is capped at the length.
   Wrapped in do { } while (0) so that it expands to exactly one
   statement and stays correct under an unbraced if/else; invoke it
   with a trailing semicolon. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
5865
Martin v. Löwis18e16552006-02-15 17:27:45 +00005866Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005867 PyObject *substr,
5868 Py_ssize_t start,
5869 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005870{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005871 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005872 PyUnicodeObject* str_obj;
5873 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005874
Thomas Wouters477c8d52006-05-27 19:21:47 +00005875 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5876 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00005877 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005878 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5879 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005880 Py_DECREF(str_obj);
5881 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005882 }
Tim Petersced69f82003-09-16 20:30:58 +00005883
Thomas Wouters477c8d52006-05-27 19:21:47 +00005884 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005885
Thomas Wouters477c8d52006-05-27 19:21:47 +00005886 result = stringlib_count(
5887 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5888 );
5889
5890 Py_DECREF(sub_obj);
5891 Py_DECREF(str_obj);
5892
Guido van Rossumd57fd912000-03-10 22:53:23 +00005893 return result;
5894}
5895
Martin v. Löwis18e16552006-02-15 17:27:45 +00005896Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005897 PyObject *sub,
5898 Py_ssize_t start,
5899 Py_ssize_t end,
5900 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005901{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005902 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005903
Guido van Rossumd57fd912000-03-10 22:53:23 +00005904 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005905 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00005906 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005907 sub = PyUnicode_FromObject(sub);
5908 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005909 Py_DECREF(str);
5910 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005911 }
Tim Petersced69f82003-09-16 20:30:58 +00005912
Thomas Wouters477c8d52006-05-27 19:21:47 +00005913 if (direction > 0)
5914 result = stringlib_find_slice(
5915 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5916 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5917 start, end
5918 );
5919 else
5920 result = stringlib_rfind_slice(
5921 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5922 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5923 start, end
5924 );
5925
Guido van Rossumd57fd912000-03-10 22:53:23 +00005926 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005927 Py_DECREF(sub);
5928
Guido van Rossumd57fd912000-03-10 22:53:23 +00005929 return result;
5930}
5931
Tim Petersced69f82003-09-16 20:30:58 +00005932static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005933int tailmatch(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00005934 PyUnicodeObject *substring,
5935 Py_ssize_t start,
5936 Py_ssize_t end,
5937 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005938{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005939 if (substring->length == 0)
5940 return 1;
5941
Thomas Wouters477c8d52006-05-27 19:21:47 +00005942 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005943
5944 end -= substring->length;
5945 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00005946 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005947
5948 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005949 if (Py_UNICODE_MATCH(self, end, substring))
5950 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005951 } else {
5952 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00005953 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005954 }
5955
5956 return 0;
5957}
5958
Martin v. Löwis18e16552006-02-15 17:27:45 +00005959Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005960 PyObject *substr,
5961 Py_ssize_t start,
5962 Py_ssize_t end,
5963 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005964{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005965 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005966
Guido van Rossumd57fd912000-03-10 22:53:23 +00005967 str = PyUnicode_FromObject(str);
5968 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005969 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005970 substr = PyUnicode_FromObject(substr);
5971 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005972 Py_DECREF(str);
5973 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005974 }
Tim Petersced69f82003-09-16 20:30:58 +00005975
Guido van Rossumd57fd912000-03-10 22:53:23 +00005976 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005977 (PyUnicodeObject *)substr,
5978 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005979 Py_DECREF(str);
5980 Py_DECREF(substr);
5981 return result;
5982}
5983
Guido van Rossumd57fd912000-03-10 22:53:23 +00005984/* Apply fixfct filter to the Unicode object self and return a
5985 reference to the modified object */
5986
Tim Petersced69f82003-09-16 20:30:58 +00005987static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005988PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00005989 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005990{
5991
5992 PyUnicodeObject *u;
5993
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005994 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005995 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005996 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005997
5998 Py_UNICODE_COPY(u->str, self->str, self->length);
5999
Tim Peters7a29bd52001-09-12 03:03:31 +00006000 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006001 /* fixfct should return TRUE if it modified the buffer. If
6002 FALSE, return a reference to the original buffer instead
6003 (to save space, not time) */
6004 Py_INCREF(self);
6005 Py_DECREF(u);
6006 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006007 }
6008 return (PyObject*) u;
6009}
6010
Tim Petersced69f82003-09-16 20:30:58 +00006011static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006012int fixupper(PyUnicodeObject *self)
6013{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006014 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006015 Py_UNICODE *s = self->str;
6016 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006017
Guido van Rossumd57fd912000-03-10 22:53:23 +00006018 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006019 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006020
Benjamin Peterson29060642009-01-31 22:14:21 +00006021 ch = Py_UNICODE_TOUPPER(*s);
6022 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006024 *s = ch;
6025 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006026 s++;
6027 }
6028
6029 return status;
6030}
6031
Tim Petersced69f82003-09-16 20:30:58 +00006032static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006033int fixlower(PyUnicodeObject *self)
6034{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006035 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006036 Py_UNICODE *s = self->str;
6037 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006038
Guido van Rossumd57fd912000-03-10 22:53:23 +00006039 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006040 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006041
Benjamin Peterson29060642009-01-31 22:14:21 +00006042 ch = Py_UNICODE_TOLOWER(*s);
6043 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006044 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006045 *s = ch;
6046 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006047 s++;
6048 }
6049
6050 return status;
6051}
6052
Tim Petersced69f82003-09-16 20:30:58 +00006053static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006054int fixswapcase(PyUnicodeObject *self)
6055{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006056 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006057 Py_UNICODE *s = self->str;
6058 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006059
Guido van Rossumd57fd912000-03-10 22:53:23 +00006060 while (len-- > 0) {
6061 if (Py_UNICODE_ISUPPER(*s)) {
6062 *s = Py_UNICODE_TOLOWER(*s);
6063 status = 1;
6064 } else if (Py_UNICODE_ISLOWER(*s)) {
6065 *s = Py_UNICODE_TOUPPER(*s);
6066 status = 1;
6067 }
6068 s++;
6069 }
6070
6071 return status;
6072}
6073
Tim Petersced69f82003-09-16 20:30:58 +00006074static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006075int fixcapitalize(PyUnicodeObject *self)
6076{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006077 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006078 Py_UNICODE *s = self->str;
6079 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006080
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006081 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006082 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006083 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006084 *s = Py_UNICODE_TOUPPER(*s);
6085 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006086 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006087 s++;
6088 while (--len > 0) {
6089 if (Py_UNICODE_ISUPPER(*s)) {
6090 *s = Py_UNICODE_TOLOWER(*s);
6091 status = 1;
6092 }
6093 s++;
6094 }
6095 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006096}
6097
6098static
6099int fixtitle(PyUnicodeObject *self)
6100{
6101 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6102 register Py_UNICODE *e;
6103 int previous_is_cased;
6104
6105 /* Shortcut for single character strings */
6106 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006107 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6108 if (*p != ch) {
6109 *p = ch;
6110 return 1;
6111 }
6112 else
6113 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006114 }
Tim Petersced69f82003-09-16 20:30:58 +00006115
Guido van Rossumd57fd912000-03-10 22:53:23 +00006116 e = p + PyUnicode_GET_SIZE(self);
6117 previous_is_cased = 0;
6118 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006119 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006120
Benjamin Peterson29060642009-01-31 22:14:21 +00006121 if (previous_is_cased)
6122 *p = Py_UNICODE_TOLOWER(ch);
6123 else
6124 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006125
Benjamin Peterson29060642009-01-31 22:14:21 +00006126 if (Py_UNICODE_ISLOWER(ch) ||
6127 Py_UNICODE_ISUPPER(ch) ||
6128 Py_UNICODE_ISTITLE(ch))
6129 previous_is_cased = 1;
6130 else
6131 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006132 }
6133 return 1;
6134}
6135
Tim Peters8ce9f162004-08-27 01:49:32 +00006136PyObject *
6137PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006138{
Skip Montanaro6543b452004-09-16 03:28:13 +00006139 const Py_UNICODE blank = ' ';
6140 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006141 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006142 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006143 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6144 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006145 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6146 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006147 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006148 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149
Tim Peters05eba1f2004-08-27 21:32:02 +00006150 fseq = PySequence_Fast(seq, "");
6151 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006152 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006153 }
6154
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006155 /* NOTE: the following code can't call back into Python code,
6156 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006157 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006158
Tim Peters05eba1f2004-08-27 21:32:02 +00006159 seqlen = PySequence_Fast_GET_SIZE(fseq);
6160 /* If empty sequence, return u"". */
6161 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006162 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6163 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006164 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006165 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006166 /* If singleton sequence with an exact Unicode, return that. */
6167 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006168 item = items[0];
6169 if (PyUnicode_CheckExact(item)) {
6170 Py_INCREF(item);
6171 res = (PyUnicodeObject *)item;
6172 goto Done;
6173 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006174 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006175 else {
6176 /* Set up sep and seplen */
6177 if (separator == NULL) {
6178 sep = &blank;
6179 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006180 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006181 else {
6182 if (!PyUnicode_Check(separator)) {
6183 PyErr_Format(PyExc_TypeError,
6184 "separator: expected str instance,"
6185 " %.80s found",
6186 Py_TYPE(separator)->tp_name);
6187 goto onError;
6188 }
6189 sep = PyUnicode_AS_UNICODE(separator);
6190 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006191 }
6192 }
6193
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006194 /* There are at least two things to join, or else we have a subclass
6195 * of str in the sequence.
6196 * Do a pre-pass to figure out the total amount of space we'll
6197 * need (sz), and see whether all argument are strings.
6198 */
6199 sz = 0;
6200 for (i = 0; i < seqlen; i++) {
6201 const Py_ssize_t old_sz = sz;
6202 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006203 if (!PyUnicode_Check(item)) {
6204 PyErr_Format(PyExc_TypeError,
6205 "sequence item %zd: expected str instance,"
6206 " %.80s found",
6207 i, Py_TYPE(item)->tp_name);
6208 goto onError;
6209 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006210 sz += PyUnicode_GET_SIZE(item);
6211 if (i != 0)
6212 sz += seplen;
6213 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6214 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006215 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006216 goto onError;
6217 }
6218 }
Tim Petersced69f82003-09-16 20:30:58 +00006219
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006220 res = _PyUnicode_New(sz);
6221 if (res == NULL)
6222 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006223
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006224 /* Catenate everything. */
6225 res_p = PyUnicode_AS_UNICODE(res);
6226 for (i = 0; i < seqlen; ++i) {
6227 Py_ssize_t itemlen;
6228 item = items[i];
6229 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006230 /* Copy item, and maybe the separator. */
6231 if (i) {
6232 Py_UNICODE_COPY(res_p, sep, seplen);
6233 res_p += seplen;
6234 }
6235 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6236 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006237 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006238
Benjamin Peterson29060642009-01-31 22:14:21 +00006239 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006240 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006241 return (PyObject *)res;
6242
Benjamin Peterson29060642009-01-31 22:14:21 +00006243 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006244 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006245 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006246 return NULL;
6247}
6248
Tim Petersced69f82003-09-16 20:30:58 +00006249static
6250PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006251 Py_ssize_t left,
6252 Py_ssize_t right,
6253 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006254{
6255 PyUnicodeObject *u;
6256
6257 if (left < 0)
6258 left = 0;
6259 if (right < 0)
6260 right = 0;
6261
Tim Peters7a29bd52001-09-12 03:03:31 +00006262 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006263 Py_INCREF(self);
6264 return self;
6265 }
6266
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006267 if (left > PY_SSIZE_T_MAX - self->length ||
6268 right > PY_SSIZE_T_MAX - (left + self->length)) {
6269 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6270 return NULL;
6271 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006272 u = _PyUnicode_New(left + self->length + right);
6273 if (u) {
6274 if (left)
6275 Py_UNICODE_FILL(u->str, fill, left);
6276 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6277 if (right)
6278 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6279 }
6280
6281 return u;
6282}
6283
/* Append the slice data[left:right] to `list` as a new unicode object.
 *
 * Relies on three names being in scope at the point of use: a
 * `PyObject *str` scratch variable, the target `list`, and an `onError`
 * label which is jumped to if either the slice cannot be created or the
 * append fails.  The temporary reference held in `str` is always released.
 *
 * Wrapped in do { } while (0) so that the expansion is a single statement
 * and is safe inside unbraced if/else bodies; follow each use with ';'.
 */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006294
6295static
6296PyObject *split_whitespace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006297 PyObject *list,
6298 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006299{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006300 register Py_ssize_t i;
6301 register Py_ssize_t j;
6302 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006303 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006304 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006305
6306 for (i = j = 0; i < len; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006307 /* find a token */
Benjamin Peterson14339b62009-01-31 16:36:08 +00006308 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Benjamin Peterson29060642009-01-31 22:14:21 +00006309 i++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006310 j = i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006311 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
6312 i++;
6313 if (j < i) {
6314 if (maxcount-- <= 0)
6315 break;
6316 SPLIT_APPEND(buf, j, i);
6317 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
6318 i++;
6319 j = i;
6320 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006321 }
6322 if (j < len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006323 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006324 }
6325 return list;
6326
Benjamin Peterson29060642009-01-31 22:14:21 +00006327 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006328 Py_DECREF(list);
6329 return NULL;
6330}
6331
6332PyObject *PyUnicode_Splitlines(PyObject *string,
Benjamin Peterson29060642009-01-31 22:14:21 +00006333 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006334{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006335 register Py_ssize_t i;
6336 register Py_ssize_t j;
6337 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006338 PyObject *list;
6339 PyObject *str;
6340 Py_UNICODE *data;
6341
6342 string = PyUnicode_FromObject(string);
6343 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006344 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006345 data = PyUnicode_AS_UNICODE(string);
6346 len = PyUnicode_GET_SIZE(string);
6347
Guido van Rossumd57fd912000-03-10 22:53:23 +00006348 list = PyList_New(0);
6349 if (!list)
6350 goto onError;
6351
6352 for (i = j = 0; i < len; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006353 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00006354
Benjamin Peterson29060642009-01-31 22:14:21 +00006355 /* Find a line and append it */
6356 while (i < len && !BLOOM_LINEBREAK(data[i]))
6357 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006358
Benjamin Peterson29060642009-01-31 22:14:21 +00006359 /* Skip the line break reading CRLF as one line break */
Benjamin Peterson14339b62009-01-31 16:36:08 +00006360 eol = i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006361 if (i < len) {
6362 if (data[i] == '\r' && i + 1 < len &&
6363 data[i+1] == '\n')
6364 i += 2;
6365 else
6366 i++;
6367 if (keepends)
6368 eol = i;
6369 }
6370 SPLIT_APPEND(data, j, eol);
6371 j = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006372 }
6373 if (j < len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006374 SPLIT_APPEND(data, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006375 }
6376
6377 Py_DECREF(string);
6378 return list;
6379
Benjamin Peterson29060642009-01-31 22:14:21 +00006380 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00006381 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006382 Py_DECREF(string);
6383 return NULL;
6384}
6385
Tim Petersced69f82003-09-16 20:30:58 +00006386static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006387PyObject *split_char(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006388 PyObject *list,
6389 Py_UNICODE ch,
6390 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006391{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006392 register Py_ssize_t i;
6393 register Py_ssize_t j;
6394 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006395 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006396 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006397
6398 for (i = j = 0; i < len; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006399 if (buf[i] == ch) {
6400 if (maxcount-- <= 0)
6401 break;
6402 SPLIT_APPEND(buf, j, i);
6403 i = j = i + 1;
6404 } else
6405 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006406 }
6407 if (j <= len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006408 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006409 }
6410 return list;
6411
Benjamin Peterson29060642009-01-31 22:14:21 +00006412 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006413 Py_DECREF(list);
6414 return NULL;
6415}
6416
Tim Petersced69f82003-09-16 20:30:58 +00006417static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006418PyObject *split_substring(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006419 PyObject *list,
6420 PyUnicodeObject *substring,
6421 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006422{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006423 register Py_ssize_t i;
6424 register Py_ssize_t j;
6425 Py_ssize_t len = self->length;
6426 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006427 PyObject *str;
6428
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00006429 for (i = j = 0; i <= len - sublen; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006430 if (Py_UNICODE_MATCH(self, i, substring)) {
6431 if (maxcount-- <= 0)
6432 break;
6433 SPLIT_APPEND(self->str, j, i);
6434 i = j = i + sublen;
6435 } else
6436 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006437 }
6438 if (j <= len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006439 SPLIT_APPEND(self->str, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006440 }
6441 return list;
6442
Benjamin Peterson29060642009-01-31 22:14:21 +00006443 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006444 Py_DECREF(list);
6445 return NULL;
6446}
6447
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006448static
6449PyObject *rsplit_whitespace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006450 PyObject *list,
6451 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006452{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006453 register Py_ssize_t i;
6454 register Py_ssize_t j;
6455 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006456 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006457 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006458
6459 for (i = j = len - 1; i >= 0; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006460 /* find a token */
Benjamin Peterson14339b62009-01-31 16:36:08 +00006461 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Benjamin Peterson29060642009-01-31 22:14:21 +00006462 i--;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006463 j = i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006464 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
6465 i--;
6466 if (j > i) {
6467 if (maxcount-- <= 0)
6468 break;
6469 SPLIT_APPEND(buf, i + 1, j + 1);
6470 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
6471 i--;
6472 j = i;
6473 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006474 }
6475 if (j >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006476 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006477 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006478 if (PyList_Reverse(list) < 0)
6479 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006480 return list;
6481
Benjamin Peterson29060642009-01-31 22:14:21 +00006482 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006483 Py_DECREF(list);
6484 return NULL;
6485}
6486
Benjamin Peterson14339b62009-01-31 16:36:08 +00006487static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006488PyObject *rsplit_char(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006489 PyObject *list,
6490 Py_UNICODE ch,
6491 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006492{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006493 register Py_ssize_t i;
6494 register Py_ssize_t j;
6495 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006496 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006497 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006498
6499 for (i = j = len - 1; i >= 0; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006500 if (buf[i] == ch) {
6501 if (maxcount-- <= 0)
6502 break;
6503 SPLIT_APPEND(buf, i + 1, j + 1);
6504 j = i = i - 1;
6505 } else
6506 i--;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006507 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00006508 if (j >= -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006509 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006510 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006511 if (PyList_Reverse(list) < 0)
6512 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006513 return list;
6514
Benjamin Peterson29060642009-01-31 22:14:21 +00006515 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006516 Py_DECREF(list);
6517 return NULL;
6518}
6519
Benjamin Peterson14339b62009-01-31 16:36:08 +00006520static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006521PyObject *rsplit_substring(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006522 PyObject *list,
6523 PyUnicodeObject *substring,
6524 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006525{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006526 register Py_ssize_t i;
6527 register Py_ssize_t j;
6528 Py_ssize_t len = self->length;
6529 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006530 PyObject *str;
6531
6532 for (i = len - sublen, j = len; i >= 0; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006533 if (Py_UNICODE_MATCH(self, i, substring)) {
6534 if (maxcount-- <= 0)
6535 break;
6536 SPLIT_APPEND(self->str, i + sublen, j);
6537 j = i;
6538 i -= sublen;
6539 } else
6540 i--;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006541 }
6542 if (j >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006543 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006544 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006545 if (PyList_Reverse(list) < 0)
6546 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006547 return list;
6548
Benjamin Peterson29060642009-01-31 22:14:21 +00006549 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006550 Py_DECREF(list);
6551 return NULL;
6552}
6553
Guido van Rossumd57fd912000-03-10 22:53:23 +00006554#undef SPLIT_APPEND
6555
6556static
6557PyObject *split(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006558 PyUnicodeObject *substring,
6559 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006560{
6561 PyObject *list;
6562
6563 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006564 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006565
6566 list = PyList_New(0);
6567 if (!list)
6568 return NULL;
6569
6570 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006571 return split_whitespace(self,list,maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006572
6573 else if (substring->length == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00006574 return split_char(self,list,substring->str[0],maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006575
6576 else if (substring->length == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006577 Py_DECREF(list);
6578 PyErr_SetString(PyExc_ValueError, "empty separator");
6579 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006580 }
6581 else
Benjamin Peterson29060642009-01-31 22:14:21 +00006582 return split_substring(self,list,substring,maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006583}
6584
Tim Petersced69f82003-09-16 20:30:58 +00006585static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006586PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006587 PyUnicodeObject *substring,
6588 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006589{
6590 PyObject *list;
6591
6592 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006593 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006594
6595 list = PyList_New(0);
6596 if (!list)
6597 return NULL;
6598
6599 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006600 return rsplit_whitespace(self,list,maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006601
6602 else if (substring->length == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00006603 return rsplit_char(self,list,substring->str[0],maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006604
6605 else if (substring->length == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006606 Py_DECREF(list);
6607 PyErr_SetString(PyExc_ValueError, "empty separator");
6608 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006609 }
6610 else
Benjamin Peterson29060642009-01-31 22:14:21 +00006611 return rsplit_substring(self,list,substring,maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006612}
6613
6614static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006615PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006616 PyUnicodeObject *str1,
6617 PyUnicodeObject *str2,
6618 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006619{
6620 PyUnicodeObject *u;
6621
6622 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006623 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006624
Thomas Wouters477c8d52006-05-27 19:21:47 +00006625 if (str1->length == str2->length) {
6626 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006627 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006628 if (str1->length == 1) {
6629 /* replace characters */
6630 Py_UNICODE u1, u2;
6631 if (!findchar(self->str, self->length, str1->str[0]))
6632 goto nothing;
6633 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6634 if (!u)
6635 return NULL;
6636 Py_UNICODE_COPY(u->str, self->str, self->length);
6637 u1 = str1->str[0];
6638 u2 = str2->str[0];
6639 for (i = 0; i < u->length; i++)
6640 if (u->str[i] == u1) {
6641 if (--maxcount < 0)
6642 break;
6643 u->str[i] = u2;
6644 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006645 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006646 i = fastsearch(
6647 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00006648 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006649 if (i < 0)
6650 goto nothing;
6651 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6652 if (!u)
6653 return NULL;
6654 Py_UNICODE_COPY(u->str, self->str, self->length);
6655 while (i <= self->length - str1->length)
6656 if (Py_UNICODE_MATCH(self, i, str1)) {
6657 if (--maxcount < 0)
6658 break;
6659 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6660 i += str1->length;
6661 } else
6662 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006663 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006664 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006665
6666 Py_ssize_t n, i, j, e;
6667 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006668 Py_UNICODE *p;
6669
6670 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006671 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006672 if (n > maxcount)
6673 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006674 if (n == 0)
6675 goto nothing;
6676 /* new_size = self->length + n * (str2->length - str1->length)); */
6677 delta = (str2->length - str1->length);
6678 if (delta == 0) {
6679 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006680 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006681 product = n * (str2->length - str1->length);
6682 if ((product / (str2->length - str1->length)) != n) {
6683 PyErr_SetString(PyExc_OverflowError,
6684 "replace string is too long");
6685 return NULL;
6686 }
6687 new_size = self->length + product;
6688 if (new_size < 0) {
6689 PyErr_SetString(PyExc_OverflowError,
6690 "replace string is too long");
6691 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006692 }
6693 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006694 u = _PyUnicode_New(new_size);
6695 if (!u)
6696 return NULL;
6697 i = 0;
6698 p = u->str;
6699 e = self->length - str1->length;
6700 if (str1->length > 0) {
6701 while (n-- > 0) {
6702 /* look for next match */
6703 j = i;
6704 while (j <= e) {
6705 if (Py_UNICODE_MATCH(self, j, str1))
6706 break;
6707 j++;
6708 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006709 if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006710 if (j > e)
6711 break;
6712 /* copy unchanged part [i:j] */
6713 Py_UNICODE_COPY(p, self->str+i, j-i);
6714 p += j - i;
6715 }
6716 /* copy substitution string */
6717 if (str2->length > 0) {
6718 Py_UNICODE_COPY(p, str2->str, str2->length);
6719 p += str2->length;
6720 }
6721 i = j + str1->length;
6722 }
6723 if (i < self->length)
6724 /* copy tail [i:] */
6725 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6726 } else {
6727 /* interleave */
6728 while (n > 0) {
6729 Py_UNICODE_COPY(p, str2->str, str2->length);
6730 p += str2->length;
6731 if (--n <= 0)
6732 break;
6733 *p++ = self->str[i++];
6734 }
6735 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6736 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006737 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006738 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006739
Benjamin Peterson29060642009-01-31 22:14:21 +00006740 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00006741 /* nothing to replace; return original string (when possible) */
6742 if (PyUnicode_CheckExact(self)) {
6743 Py_INCREF(self);
6744 return (PyObject *) self;
6745 }
6746 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006747}
6748
6749/* --- Unicode Object Methods --------------------------------------------- */
6750
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006751PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006752 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006753\n\
6754Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006755characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006756
6757static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006758unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006759{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006760 return fixup(self, fixtitle);
6761}
6762
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006763PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006764 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006765\n\
6766Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumaran74ceac22010-07-05 12:04:23 +00006767have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006768
6769static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006770unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006771{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006772 return fixup(self, fixcapitalize);
6773}
6774
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled: split self into words, capitalize every word in place in the
   list, then join the list back together.  Returns NULL if any step
   fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *item;
    Py_ssize_t pos = 0;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word */
    while (pos < PyList_GET_SIZE(words)) {
        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, pos),
                     fixcapitalize);
        if (item == NULL)
            goto onError;
        /* drop the old word, then store the new one in its slot */
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, item);
        pos++;
    }

    /* Join the words to form a new string */
    item = PyUnicode_Join(NULL, words);

  onError:
    /* item is NULL here when capitalizing a word failed */
    Py_DECREF(words);
    return (PyObject *)item;
}
#endif
6812
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006813/* Argument converter. Coerces to a single unicode character */
6814
6815static int
6816convert_uc(PyObject *obj, void *addr)
6817{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006818 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6819 PyObject *uniobj;
6820 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006821
Benjamin Peterson14339b62009-01-31 16:36:08 +00006822 uniobj = PyUnicode_FromObject(obj);
6823 if (uniobj == NULL) {
6824 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006825 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006826 return 0;
6827 }
6828 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6829 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006830 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006831 Py_DECREF(uniobj);
6832 return 0;
6833 }
6834 unistr = PyUnicode_AS_UNICODE(uniobj);
6835 *fillcharloc = unistr[0];
6836 Py_DECREF(uniobj);
6837 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006838}
6839
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006840PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006841 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006842\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006843Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006844done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006845
6846static PyObject *
6847unicode_center(PyUnicodeObject *self, PyObject *args)
6848{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006849 Py_ssize_t marg, left;
6850 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006851 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006852
Thomas Woutersde017742006-02-16 19:34:37 +00006853 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006854 return NULL;
6855
Tim Peters7a29bd52001-09-12 03:03:31 +00006856 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006857 Py_INCREF(self);
6858 return (PyObject*) self;
6859 }
6860
6861 marg = width - self->length;
6862 left = marg / 2 + (marg & width & 1);
6863
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006864 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006865}
6866
Marc-André Lemburge5034372000-08-08 08:04:29 +00006867#if 0
6868
6869/* This code should go into some future Unicode collation support
6870 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00006871 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006872
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006873/* speedy UTF-16 code point order comparison */
6874/* gleaned from: */
6875/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6876
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006877static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006878{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006879 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006880 0, 0, 0, 0, 0, 0, 0, 0,
6881 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006882 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006883};
6884
Guido van Rossumd57fd912000-03-10 22:53:23 +00006885static int
6886unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6887{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006888 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006889
Guido van Rossumd57fd912000-03-10 22:53:23 +00006890 Py_UNICODE *s1 = str1->str;
6891 Py_UNICODE *s2 = str2->str;
6892
6893 len1 = str1->length;
6894 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006895
Guido van Rossumd57fd912000-03-10 22:53:23 +00006896 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006897 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006898
6899 c1 = *s1++;
6900 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006901
Benjamin Peterson29060642009-01-31 22:14:21 +00006902 if (c1 > (1<<11) * 26)
6903 c1 += utf16Fixup[c1>>11];
6904 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006905 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006906 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006907
6908 if (c1 != c2)
6909 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006910
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006911 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006912 }
6913
6914 return (len1 < len2) ? -1 : (len1 != len2);
6915}
6916
Marc-André Lemburge5034372000-08-08 08:04:29 +00006917#else
6918
6919static int
6920unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6921{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006922 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006923
6924 Py_UNICODE *s1 = str1->str;
6925 Py_UNICODE *s2 = str2->str;
6926
6927 len1 = str1->length;
6928 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006929
Marc-André Lemburge5034372000-08-08 08:04:29 +00006930 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006931 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006932
Fredrik Lundh45714e92001-06-26 16:39:36 +00006933 c1 = *s1++;
6934 c2 = *s2++;
6935
6936 if (c1 != c2)
6937 return (c1 < c2) ? -1 : 1;
6938
Marc-André Lemburge5034372000-08-08 08:04:29 +00006939 len1--; len2--;
6940 }
6941
6942 return (len1 < len2) ? -1 : (len1 != len2);
6943}
6944
6945#endif
6946
Guido van Rossumd57fd912000-03-10 22:53:23 +00006947int PyUnicode_Compare(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00006948 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006949{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006950 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6951 return unicode_compare((PyUnicodeObject *)left,
6952 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006953 PyErr_Format(PyExc_TypeError,
6954 "Can't compare %.100s and %.100s",
6955 left->ob_type->tp_name,
6956 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006957 return -1;
6958}
6959
Martin v. Löwis5b222132007-06-10 09:51:05 +00006960int
6961PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6962{
6963 int i;
6964 Py_UNICODE *id;
6965 assert(PyUnicode_Check(uni));
6966 id = PyUnicode_AS_UNICODE(uni);
6967 /* Compare Unicode string and source character set string */
6968 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00006969 if (id[i] != str[i])
6970 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Benjamin Petersonbb81c8c2010-01-09 21:54:39 +00006971 /* This check keeps Python strings that end in '\0' from comparing equal
6972 to C strings identical up to that point. */
6973 if (PyUnicode_GET_SIZE(uni) != i)
6974 /* We'll say the Python string is longer. */
6975 return 1;
Martin v. Löwis5b222132007-06-10 09:51:05 +00006976 if (id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006977 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006978 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006979 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006980 return 0;
6981}
6982
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006983
Benjamin Peterson29060642009-01-31 22:14:21 +00006984#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00006985 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006986
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006987PyObject *PyUnicode_RichCompare(PyObject *left,
6988 PyObject *right,
6989 int op)
6990{
6991 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006992
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006993 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
6994 PyObject *v;
6995 if (((PyUnicodeObject *) left)->length !=
6996 ((PyUnicodeObject *) right)->length) {
6997 if (op == Py_EQ) {
6998 Py_INCREF(Py_False);
6999 return Py_False;
7000 }
7001 if (op == Py_NE) {
7002 Py_INCREF(Py_True);
7003 return Py_True;
7004 }
7005 }
7006 if (left == right)
7007 result = 0;
7008 else
7009 result = unicode_compare((PyUnicodeObject *)left,
7010 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007011
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007012 /* Convert the return value to a Boolean */
7013 switch (op) {
7014 case Py_EQ:
7015 v = TEST_COND(result == 0);
7016 break;
7017 case Py_NE:
7018 v = TEST_COND(result != 0);
7019 break;
7020 case Py_LE:
7021 v = TEST_COND(result <= 0);
7022 break;
7023 case Py_GE:
7024 v = TEST_COND(result >= 0);
7025 break;
7026 case Py_LT:
7027 v = TEST_COND(result == -1);
7028 break;
7029 case Py_GT:
7030 v = TEST_COND(result == 1);
7031 break;
7032 default:
7033 PyErr_BadArgument();
7034 return NULL;
7035 }
7036 Py_INCREF(v);
7037 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007038 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007039
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007040 Py_INCREF(Py_NotImplemented);
7041 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007042}
7043
Guido van Rossum403d68b2000-03-13 15:55:09 +00007044int PyUnicode_Contains(PyObject *container,
Benjamin Peterson29060642009-01-31 22:14:21 +00007045 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00007046{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007047 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007048 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007049
7050 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00007051 sub = PyUnicode_FromObject(element);
7052 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007053 PyErr_Format(PyExc_TypeError,
7054 "'in <string>' requires string as left operand, not %s",
7055 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007056 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007057 }
7058
Thomas Wouters477c8d52006-05-27 19:21:47 +00007059 str = PyUnicode_FromObject(container);
7060 if (!str) {
7061 Py_DECREF(sub);
7062 return -1;
7063 }
7064
7065 result = stringlib_contains_obj(str, sub);
7066
7067 Py_DECREF(str);
7068 Py_DECREF(sub);
7069
Guido van Rossum403d68b2000-03-13 15:55:09 +00007070 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007071}
7072
Guido van Rossumd57fd912000-03-10 22:53:23 +00007073/* Concat to string or Unicode object giving a new Unicode object. */
7074
7075PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00007076 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007077{
7078 PyUnicodeObject *u = NULL, *v = NULL, *w;
7079
7080 /* Coerce the two arguments */
7081 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
7082 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007083 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007084 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
7085 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007086 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007087
7088 /* Shortcuts */
7089 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007090 Py_DECREF(v);
7091 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007092 }
7093 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007094 Py_DECREF(u);
7095 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007096 }
7097
7098 /* Concat the two Unicode strings */
7099 w = _PyUnicode_New(u->length + v->length);
7100 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007101 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007102 Py_UNICODE_COPY(w->str, u->str, u->length);
7103 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
7104
7105 Py_DECREF(u);
7106 Py_DECREF(v);
7107 return (PyObject *)w;
7108
Benjamin Peterson29060642009-01-31 22:14:21 +00007109 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007110 Py_XDECREF(u);
7111 Py_XDECREF(v);
7112 return NULL;
7113}
7114
Walter Dörwald1ab83302007-05-18 17:15:44 +00007115void
7116PyUnicode_Append(PyObject **pleft, PyObject *right)
7117{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007118 PyObject *new;
7119 if (*pleft == NULL)
7120 return;
7121 if (right == NULL || !PyUnicode_Check(*pleft)) {
7122 Py_DECREF(*pleft);
7123 *pleft = NULL;
7124 return;
7125 }
7126 new = PyUnicode_Concat(*pleft, right);
7127 Py_DECREF(*pleft);
7128 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007129}
7130
7131void
7132PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
7133{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007134 PyUnicode_Append(pleft, right);
7135 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00007136}
7137
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007138PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007139 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007140\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007141Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007142string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007143interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007144
7145static PyObject *
7146unicode_count(PyUnicodeObject *self, PyObject *args)
7147{
7148 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007149 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007150 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007151 PyObject *result;
7152
Jesus Ceaac451502011-04-20 17:09:23 +02007153 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
7154 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00007155 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007156
Thomas Wouters477c8d52006-05-27 19:21:47 +00007157 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007158
Christian Heimes217cfd12007-12-02 14:31:20 +00007159 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007160 stringlib_count(self->str + start, end - start,
7161 substring->str, substring->length)
7162 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007163
7164 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007165
Guido van Rossumd57fd912000-03-10 22:53:23 +00007166 return result;
7167}
7168
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007169PyDoc_STRVAR(encode__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007170 "S.encode([encoding[, errors]]) -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007171\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00007172Encode S using the codec registered for encoding. encoding defaults\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007173to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00007174handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007175a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7176'xmlcharrefreplace' as well as any other name registered with\n\
7177codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007178
7179static PyObject *
7180unicode_encode(PyUnicodeObject *self, PyObject *args)
7181{
7182 char *encoding = NULL;
7183 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007184 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00007185
Guido van Rossumd57fd912000-03-10 22:53:23 +00007186 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
7187 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00007188 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007189 if (v == NULL)
7190 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00007191 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007192 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007193 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007194 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00007195 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007196 Py_DECREF(v);
7197 return NULL;
7198 }
7199 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007200
Benjamin Peterson29060642009-01-31 22:14:21 +00007201 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007202 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007203}
7204
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007205PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007206 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007207\n\
7208Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007209If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007210
7211static PyObject*
7212unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7213{
7214 Py_UNICODE *e;
7215 Py_UNICODE *p;
7216 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007217 Py_UNICODE *qe;
7218 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007219 PyUnicodeObject *u;
7220 int tabsize = 8;
7221
7222 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007223 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007224
Thomas Wouters7e474022000-07-16 12:04:32 +00007225 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007226 i = 0; /* chars up to and including most recent \n or \r */
7227 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7228 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007229 for (p = self->str; p < e; p++)
7230 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007231 if (tabsize > 0) {
7232 incr = tabsize - (j % tabsize); /* cannot overflow */
7233 if (j > PY_SSIZE_T_MAX - incr)
7234 goto overflow1;
7235 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007236 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007237 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007238 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007239 if (j > PY_SSIZE_T_MAX - 1)
7240 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007241 j++;
7242 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007243 if (i > PY_SSIZE_T_MAX - j)
7244 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007245 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007246 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007247 }
7248 }
7249
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007250 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007251 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007252
Guido van Rossumd57fd912000-03-10 22:53:23 +00007253 /* Second pass: create output string and fill it */
7254 u = _PyUnicode_New(i + j);
7255 if (!u)
7256 return NULL;
7257
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007258 j = 0; /* same as in first pass */
7259 q = u->str; /* next output char */
7260 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007261
7262 for (p = self->str; p < e; p++)
7263 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007264 if (tabsize > 0) {
7265 i = tabsize - (j % tabsize);
7266 j += i;
7267 while (i--) {
7268 if (q >= qe)
7269 goto overflow2;
7270 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007271 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007272 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007273 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007274 else {
7275 if (q >= qe)
7276 goto overflow2;
7277 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007278 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007279 if (*p == '\n' || *p == '\r')
7280 j = 0;
7281 }
7282
7283 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007284
7285 overflow2:
7286 Py_DECREF(u);
7287 overflow1:
7288 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7289 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007290}
7291
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007292PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007293 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007294\n\
7295Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007296such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007297arguments start and end are interpreted as in slice notation.\n\
7298\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007299Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007300
7301static PyObject *
7302unicode_find(PyUnicodeObject *self, PyObject *args)
7303{
Jesus Ceaac451502011-04-20 17:09:23 +02007304 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007305 Py_ssize_t start;
7306 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007307 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007308
Jesus Ceaac451502011-04-20 17:09:23 +02007309 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
7310 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007311 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007312
Thomas Wouters477c8d52006-05-27 19:21:47 +00007313 result = stringlib_find_slice(
7314 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7315 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7316 start, end
7317 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007318
7319 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007320
Christian Heimes217cfd12007-12-02 14:31:20 +00007321 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007322}
7323
7324static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007325unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007326{
7327 if (index < 0 || index >= self->length) {
7328 PyErr_SetString(PyExc_IndexError, "string index out of range");
7329 return NULL;
7330 }
7331
7332 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7333}
7334
Guido van Rossumc2504932007-09-18 19:42:40 +00007335/* Believe it or not, this produces the same value for ASCII strings
7336 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007337static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007338unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007339{
Guido van Rossumc2504932007-09-18 19:42:40 +00007340 Py_ssize_t len;
7341 Py_UNICODE *p;
7342 long x;
7343
Benjamin Petersonf6622c82012-04-09 14:53:07 -04007344#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -05007345 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -04007346#endif
Guido van Rossumc2504932007-09-18 19:42:40 +00007347 if (self->hash != -1)
7348 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007349 len = Py_SIZE(self);
Georg Brandl2daf6ae2012-02-20 19:54:16 +01007350 /*
7351 We make the hash of the empty string be 0, rather than using
7352 (prefix ^ suffix), since this slightly obfuscates the hash secret
7353 */
7354 if (len == 0) {
7355 self->hash = 0;
7356 return 0;
7357 }
Guido van Rossumc2504932007-09-18 19:42:40 +00007358 p = self->str;
Georg Brandl2daf6ae2012-02-20 19:54:16 +01007359 x = _Py_HashSecret.prefix;
7360 x ^= *p << 7;
Guido van Rossumc2504932007-09-18 19:42:40 +00007361 while (--len >= 0)
7362 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007363 x ^= Py_SIZE(self);
Georg Brandl2daf6ae2012-02-20 19:54:16 +01007364 x ^= _Py_HashSecret.suffix;
Guido van Rossumc2504932007-09-18 19:42:40 +00007365 if (x == -1)
7366 x = -2;
7367 self->hash = x;
7368 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007369}
7370
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007371PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007372 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007373\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007374Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007375
7376static PyObject *
7377unicode_index(PyUnicodeObject *self, PyObject *args)
7378{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007379 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +02007380 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007381 Py_ssize_t start;
7382 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007383
Jesus Ceaac451502011-04-20 17:09:23 +02007384 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
7385 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007386 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007387
Thomas Wouters477c8d52006-05-27 19:21:47 +00007388 result = stringlib_find_slice(
7389 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7390 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7391 start, end
7392 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007393
7394 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007395
Guido van Rossumd57fd912000-03-10 22:53:23 +00007396 if (result < 0) {
7397 PyErr_SetString(PyExc_ValueError, "substring not found");
7398 return NULL;
7399 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007400
Christian Heimes217cfd12007-12-02 14:31:20 +00007401 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007402}
7403
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007404PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007405 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007406\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007407Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007408at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007409
7410static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007411unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007412{
7413 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7414 register const Py_UNICODE *e;
7415 int cased;
7416
Guido van Rossumd57fd912000-03-10 22:53:23 +00007417 /* Shortcut for single character strings */
7418 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007419 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007420
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007421 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007422 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007423 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007424
Guido van Rossumd57fd912000-03-10 22:53:23 +00007425 e = p + PyUnicode_GET_SIZE(self);
7426 cased = 0;
7427 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007428 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007429
Benjamin Peterson29060642009-01-31 22:14:21 +00007430 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7431 return PyBool_FromLong(0);
7432 else if (!cased && Py_UNICODE_ISLOWER(ch))
7433 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007434 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007435 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007436}
7437
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007438PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007439 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007440\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007441Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007442at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007443
7444static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007445unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007446{
7447 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7448 register const Py_UNICODE *e;
7449 int cased;
7450
Guido van Rossumd57fd912000-03-10 22:53:23 +00007451 /* Shortcut for single character strings */
7452 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007453 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007454
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007455 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007456 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007457 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007458
Guido van Rossumd57fd912000-03-10 22:53:23 +00007459 e = p + PyUnicode_GET_SIZE(self);
7460 cased = 0;
7461 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007462 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007463
Benjamin Peterson29060642009-01-31 22:14:21 +00007464 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7465 return PyBool_FromLong(0);
7466 else if (!cased && Py_UNICODE_ISUPPER(ch))
7467 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007468 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007469 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007470}
7471
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007472PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007473 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007474\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007475Return True if S is a titlecased string and there is at least one\n\
7476character in S, i.e. upper- and titlecase characters may only\n\
7477follow uncased characters and lowercase characters only cased ones.\n\
7478Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007479
7480static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007481unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007482{
7483 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7484 register const Py_UNICODE *e;
7485 int cased, previous_is_cased;
7486
Guido van Rossumd57fd912000-03-10 22:53:23 +00007487 /* Shortcut for single character strings */
7488 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007489 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7490 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007491
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007492 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007493 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007494 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007495
Guido van Rossumd57fd912000-03-10 22:53:23 +00007496 e = p + PyUnicode_GET_SIZE(self);
7497 cased = 0;
7498 previous_is_cased = 0;
7499 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007500 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007501
Benjamin Peterson29060642009-01-31 22:14:21 +00007502 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7503 if (previous_is_cased)
7504 return PyBool_FromLong(0);
7505 previous_is_cased = 1;
7506 cased = 1;
7507 }
7508 else if (Py_UNICODE_ISLOWER(ch)) {
7509 if (!previous_is_cased)
7510 return PyBool_FromLong(0);
7511 previous_is_cased = 1;
7512 cased = 1;
7513 }
7514 else
7515 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007516 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007517 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007518}
7519
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007520PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007521 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007522\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007523Return True if all characters in S are whitespace\n\
7524and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007525
7526static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007527unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007528{
7529 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7530 register const Py_UNICODE *e;
7531
Guido van Rossumd57fd912000-03-10 22:53:23 +00007532 /* Shortcut for single character strings */
7533 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007534 Py_UNICODE_ISSPACE(*p))
7535 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007536
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007537 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007538 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007539 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007540
Guido van Rossumd57fd912000-03-10 22:53:23 +00007541 e = p + PyUnicode_GET_SIZE(self);
7542 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007543 if (!Py_UNICODE_ISSPACE(*p))
7544 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007545 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007546 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007547}
7548
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007549PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007550 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007551\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007552Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007553and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007554
7555static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007556unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007557{
7558 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7559 register const Py_UNICODE *e;
7560
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007561 /* Shortcut for single character strings */
7562 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007563 Py_UNICODE_ISALPHA(*p))
7564 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007565
7566 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007567 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007568 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007569
7570 e = p + PyUnicode_GET_SIZE(self);
7571 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007572 if (!Py_UNICODE_ISALPHA(*p))
7573 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007574 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007575 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007576}
7577
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007578PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007579 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007580\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007581Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007582and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007583
7584static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007585unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007586{
7587 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7588 register const Py_UNICODE *e;
7589
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007590 /* Shortcut for single character strings */
7591 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007592 Py_UNICODE_ISALNUM(*p))
7593 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007594
7595 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007596 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007597 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007598
7599 e = p + PyUnicode_GET_SIZE(self);
7600 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007601 if (!Py_UNICODE_ISALNUM(*p))
7602 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007603 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007604 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007605}
7606
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007607PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007608 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007609\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007610Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007611False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007612
7613static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007614unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007615{
7616 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7617 register const Py_UNICODE *e;
7618
Guido van Rossumd57fd912000-03-10 22:53:23 +00007619 /* Shortcut for single character strings */
7620 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007621 Py_UNICODE_ISDECIMAL(*p))
7622 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007623
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007624 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007625 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007626 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007627
Guido van Rossumd57fd912000-03-10 22:53:23 +00007628 e = p + PyUnicode_GET_SIZE(self);
7629 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007630 if (!Py_UNICODE_ISDECIMAL(*p))
7631 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007632 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007633 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007634}
7635
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007636PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007637 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007638\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007639Return True if all characters in S are digits\n\
7640and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007641
7642static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007643unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007644{
7645 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7646 register const Py_UNICODE *e;
7647
Guido van Rossumd57fd912000-03-10 22:53:23 +00007648 /* Shortcut for single character strings */
7649 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007650 Py_UNICODE_ISDIGIT(*p))
7651 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007652
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007653 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007654 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007655 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007656
Guido van Rossumd57fd912000-03-10 22:53:23 +00007657 e = p + PyUnicode_GET_SIZE(self);
7658 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007659 if (!Py_UNICODE_ISDIGIT(*p))
7660 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007661 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007662 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007663}
7664
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007665PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007666 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007667\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007668Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007669False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007670
7671static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007672unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007673{
7674 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7675 register const Py_UNICODE *e;
7676
Guido van Rossumd57fd912000-03-10 22:53:23 +00007677 /* Shortcut for single character strings */
7678 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007679 Py_UNICODE_ISNUMERIC(*p))
7680 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007681
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007682 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007683 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007684 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007685
Guido van Rossumd57fd912000-03-10 22:53:23 +00007686 e = p + PyUnicode_GET_SIZE(self);
7687 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007688 if (!Py_UNICODE_ISNUMERIC(*p))
7689 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007690 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007691 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007692}
7693
Martin v. Löwis47383402007-08-15 07:32:56 +00007694int
7695PyUnicode_IsIdentifier(PyObject *self)
7696{
7697 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7698 register const Py_UNICODE *e;
7699
7700 /* Special case for empty strings */
7701 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007702 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007703
7704 /* PEP 3131 says that the first character must be in
7705 XID_Start and subsequent characters in XID_Continue,
7706 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00007707 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00007708 letters, digits, underscore). However, given the current
7709 definition of XID_Start and XID_Continue, it is sufficient
7710 to check just for these, except that _ must be allowed
7711 as starting an identifier. */
7712 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7713 return 0;
7714
7715 e = p + PyUnicode_GET_SIZE(self);
7716 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007717 if (!_PyUnicode_IsXidContinue(*p))
7718 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007719 }
7720 return 1;
7721}
7722
7723PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007724 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00007725\n\
7726Return True if S is a valid identifier according\n\
7727to the language definition.");
7728
7729static PyObject*
7730unicode_isidentifier(PyObject *self)
7731{
7732 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7733}
7734
Georg Brandl559e5d72008-06-11 18:37:52 +00007735PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007736 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00007737\n\
7738Return True if all characters in S are considered\n\
7739printable in repr() or S is empty, False otherwise.");
7740
7741static PyObject*
7742unicode_isprintable(PyObject *self)
7743{
7744 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7745 register const Py_UNICODE *e;
7746
7747 /* Shortcut for single character strings */
7748 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7749 Py_RETURN_TRUE;
7750 }
7751
7752 e = p + PyUnicode_GET_SIZE(self);
7753 for (; p < e; p++) {
7754 if (!Py_UNICODE_ISPRINTABLE(*p)) {
7755 Py_RETURN_FALSE;
7756 }
7757 }
7758 Py_RETURN_TRUE;
7759}
7760
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007761PyDoc_STRVAR(join__doc__,
Georg Brandl628e6f92009-10-27 20:24:45 +00007762 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007763\n\
7764Return a string which is the concatenation of the strings in the\n\
Georg Brandl628e6f92009-10-27 20:24:45 +00007765iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007766
7767static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007768unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007769{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007770 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007771}
7772
Martin v. Löwis18e16552006-02-15 17:27:45 +00007773static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007774unicode_length(PyUnicodeObject *self)
7775{
7776 return self->length;
7777}
7778
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007779PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007780 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007781\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007782Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007783done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007784
7785static PyObject *
7786unicode_ljust(PyUnicodeObject *self, PyObject *args)
7787{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007788 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007789 Py_UNICODE fillchar = ' ';
7790
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007791 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007792 return NULL;
7793
Tim Peters7a29bd52001-09-12 03:03:31 +00007794 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007795 Py_INCREF(self);
7796 return (PyObject*) self;
7797 }
7798
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007799 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007800}
7801
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007802PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007803 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007804\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007805Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007806
7807static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007808unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007809{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007810 return fixup(self, fixlower);
7811}
7812
/* Strip modes; they double as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for a strip mode: the format string with its leading
   "|O:" (three characters) skipped. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
7821
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007822/* externally visible for str.strip(unicode) */
7823PyObject *
7824_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7825{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007826 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7827 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7828 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7829 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7830 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007831
Benjamin Peterson29060642009-01-31 22:14:21 +00007832 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007833
Benjamin Peterson14339b62009-01-31 16:36:08 +00007834 i = 0;
7835 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007836 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7837 i++;
7838 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007839 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007840
Benjamin Peterson14339b62009-01-31 16:36:08 +00007841 j = len;
7842 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007843 do {
7844 j--;
7845 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7846 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007847 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007848
Benjamin Peterson14339b62009-01-31 16:36:08 +00007849 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007850 Py_INCREF(self);
7851 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007852 }
7853 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007854 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007855}
7856
Guido van Rossumd57fd912000-03-10 22:53:23 +00007857
7858static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007859do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007860{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007861 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7862 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007863
Benjamin Peterson14339b62009-01-31 16:36:08 +00007864 i = 0;
7865 if (striptype != RIGHTSTRIP) {
7866 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7867 i++;
7868 }
7869 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007870
Benjamin Peterson14339b62009-01-31 16:36:08 +00007871 j = len;
7872 if (striptype != LEFTSTRIP) {
7873 do {
7874 j--;
7875 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7876 j++;
7877 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007878
Benjamin Peterson14339b62009-01-31 16:36:08 +00007879 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7880 Py_INCREF(self);
7881 return (PyObject*)self;
7882 }
7883 else
7884 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007885}
7886
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007887
7888static PyObject *
7889do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7890{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007891 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007892
Benjamin Peterson14339b62009-01-31 16:36:08 +00007893 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7894 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007895
Benjamin Peterson14339b62009-01-31 16:36:08 +00007896 if (sep != NULL && sep != Py_None) {
7897 if (PyUnicode_Check(sep))
7898 return _PyUnicode_XStrip(self, striptype, sep);
7899 else {
7900 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007901 "%s arg must be None or str",
7902 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00007903 return NULL;
7904 }
7905 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007906
Benjamin Peterson14339b62009-01-31 16:36:08 +00007907 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007908}
7909
7910
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007911PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007912 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007913\n\
7914Return a copy of the string S with leading and trailing\n\
7915whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007916If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007917
7918static PyObject *
7919unicode_strip(PyUnicodeObject *self, PyObject *args)
7920{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007921 if (PyTuple_GET_SIZE(args) == 0)
7922 return do_strip(self, BOTHSTRIP); /* Common case */
7923 else
7924 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007925}
7926
7927
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007928PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007929 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007930\n\
7931Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007932If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007933
7934static PyObject *
7935unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7936{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007937 if (PyTuple_GET_SIZE(args) == 0)
7938 return do_strip(self, LEFTSTRIP); /* Common case */
7939 else
7940 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007941}
7942
7943
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007944PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007945 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007946\n\
7947Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007948If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007949
7950static PyObject *
7951unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7952{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007953 if (PyTuple_GET_SIZE(args) == 0)
7954 return do_strip(self, RIGHTSTRIP); /* Common case */
7955 else
7956 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007957}
7958
7959
Guido van Rossumd57fd912000-03-10 22:53:23 +00007960static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007961unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007962{
7963 PyUnicodeObject *u;
7964 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007965 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007966 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007967
Georg Brandl222de0f2009-04-12 12:01:50 +00007968 if (len < 1) {
7969 Py_INCREF(unicode_empty);
7970 return (PyObject *)unicode_empty;
7971 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007972
Tim Peters7a29bd52001-09-12 03:03:31 +00007973 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007974 /* no repeat, return original string */
7975 Py_INCREF(str);
7976 return (PyObject*) str;
7977 }
Tim Peters8f422462000-09-09 06:13:41 +00007978
7979 /* ensure # of chars needed doesn't overflow int and # of bytes
7980 * needed doesn't overflow size_t
7981 */
7982 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00007983 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00007984 PyErr_SetString(PyExc_OverflowError,
7985 "repeated string is too long");
7986 return NULL;
7987 }
7988 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7989 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7990 PyErr_SetString(PyExc_OverflowError,
7991 "repeated string is too long");
7992 return NULL;
7993 }
7994 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007995 if (!u)
7996 return NULL;
7997
7998 p = u->str;
7999
Georg Brandl222de0f2009-04-12 12:01:50 +00008000 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008001 Py_UNICODE_FILL(p, str->str[0], len);
8002 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00008003 Py_ssize_t done = str->length; /* number of characters copied this far */
8004 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00008005 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00008006 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008007 Py_UNICODE_COPY(p+done, p, n);
8008 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00008009 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008010 }
8011
8012 return (PyObject*) u;
8013}
8014
8015PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008016 PyObject *subobj,
8017 PyObject *replobj,
8018 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008019{
8020 PyObject *self;
8021 PyObject *str1;
8022 PyObject *str2;
8023 PyObject *result;
8024
8025 self = PyUnicode_FromObject(obj);
8026 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008027 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008028 str1 = PyUnicode_FromObject(subobj);
8029 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008030 Py_DECREF(self);
8031 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008032 }
8033 str2 = PyUnicode_FromObject(replobj);
8034 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008035 Py_DECREF(self);
8036 Py_DECREF(str1);
8037 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008038 }
Tim Petersced69f82003-09-16 20:30:58 +00008039 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008040 (PyUnicodeObject *)str1,
8041 (PyUnicodeObject *)str2,
8042 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008043 Py_DECREF(self);
8044 Py_DECREF(str1);
8045 Py_DECREF(str2);
8046 return result;
8047}
8048
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008049PyDoc_STRVAR(replace__doc__,
Ezio Melotti415f3402010-06-26 18:52:26 +00008050 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008051\n\
8052Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00008053old replaced by new. If the optional argument count is\n\
8054given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008055
8056static PyObject*
8057unicode_replace(PyUnicodeObject *self, PyObject *args)
8058{
8059 PyUnicodeObject *str1;
8060 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008061 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008062 PyObject *result;
8063
Martin v. Löwis18e16552006-02-15 17:27:45 +00008064 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008065 return NULL;
8066 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
8067 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008068 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008069 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008070 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008071 Py_DECREF(str1);
8072 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008073 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008074
8075 result = replace(self, str1, str2, maxcount);
8076
8077 Py_DECREF(str1);
8078 Py_DECREF(str2);
8079 return result;
8080}
8081
8082static
8083PyObject *unicode_repr(PyObject *unicode)
8084{
Walter Dörwald79e913e2007-05-12 11:08:06 +00008085 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00008086 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008087 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
8088 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
8089
8090 /* XXX(nnorwitz): rather than over-allocating, it would be
8091 better to choose a different scheme. Perhaps scan the
8092 first N-chars of the string and allocate based on that size.
8093 */
8094 /* Initial allocation is based on the longest-possible unichr
8095 escape.
8096
8097 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
8098 unichr, so in this case it's the longest unichr escape. In
8099 narrow (UTF-16) builds this is five chars per source unichr
8100 since there are two unichrs in the surrogate pair, so in narrow
8101 (UTF-16) builds it's not the longest unichr escape.
8102
8103 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
8104 so in the narrow (UTF-16) build case it's the longest unichr
8105 escape.
8106 */
8107
Walter Dörwald1ab83302007-05-18 17:15:44 +00008108 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00008109 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00008110#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00008111 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008112#else
Benjamin Peterson29060642009-01-31 22:14:21 +00008113 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008114#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00008115 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008116 if (repr == NULL)
8117 return NULL;
8118
Walter Dörwald1ab83302007-05-18 17:15:44 +00008119 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008120
8121 /* Add quote */
8122 *p++ = (findchar(s, size, '\'') &&
8123 !findchar(s, size, '"')) ? '"' : '\'';
8124 while (size-- > 0) {
8125 Py_UNICODE ch = *s++;
8126
8127 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008128 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008129 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00008130 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008131 continue;
8132 }
8133
Benjamin Peterson29060642009-01-31 22:14:21 +00008134 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008135 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008136 *p++ = '\\';
8137 *p++ = 't';
8138 }
8139 else if (ch == '\n') {
8140 *p++ = '\\';
8141 *p++ = 'n';
8142 }
8143 else if (ch == '\r') {
8144 *p++ = '\\';
8145 *p++ = 'r';
8146 }
8147
8148 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008149 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008150 *p++ = '\\';
8151 *p++ = 'x';
8152 *p++ = hexdigits[(ch >> 4) & 0x000F];
8153 *p++ = hexdigits[ch & 0x000F];
8154 }
8155
Georg Brandl559e5d72008-06-11 18:37:52 +00008156 /* Copy ASCII characters as-is */
8157 else if (ch < 0x7F) {
8158 *p++ = ch;
8159 }
8160
Benjamin Peterson29060642009-01-31 22:14:21 +00008161 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00008162 else {
8163 Py_UCS4 ucs = ch;
8164
8165#ifndef Py_UNICODE_WIDE
8166 Py_UNICODE ch2 = 0;
8167 /* Get code point from surrogate pair */
8168 if (size > 0) {
8169 ch2 = *s;
8170 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00008171 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008172 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00008173 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008174 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00008175 size--;
8176 }
8177 }
8178#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00008179 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00008180 (categories Z* and C* except ASCII space)
8181 */
8182 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8183 /* Map 8-bit characters to '\xhh' */
8184 if (ucs <= 0xff) {
8185 *p++ = '\\';
8186 *p++ = 'x';
8187 *p++ = hexdigits[(ch >> 4) & 0x000F];
8188 *p++ = hexdigits[ch & 0x000F];
8189 }
8190 /* Map 21-bit characters to '\U00xxxxxx' */
8191 else if (ucs >= 0x10000) {
8192 *p++ = '\\';
8193 *p++ = 'U';
8194 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8195 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8196 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8197 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8198 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8199 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8200 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8201 *p++ = hexdigits[ucs & 0x0000000F];
8202 }
8203 /* Map 16-bit characters to '\uxxxx' */
8204 else {
8205 *p++ = '\\';
8206 *p++ = 'u';
8207 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8208 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8209 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8210 *p++ = hexdigits[ucs & 0x000F];
8211 }
8212 }
8213 /* Copy characters as-is */
8214 else {
8215 *p++ = ch;
8216#ifndef Py_UNICODE_WIDE
8217 if (ucs >= 0x10000)
8218 *p++ = ch2;
8219#endif
8220 }
8221 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008222 }
8223 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008224 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008225
8226 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008227 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008228 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008229}
8230
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008231PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008232 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008233\n\
8234Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00008235such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008236arguments start and end are interpreted as in slice notation.\n\
8237\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008238Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008239
8240static PyObject *
8241unicode_rfind(PyUnicodeObject *self, PyObject *args)
8242{
Jesus Ceaac451502011-04-20 17:09:23 +02008243 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008244 Py_ssize_t start;
8245 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008246 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008247
Jesus Ceaac451502011-04-20 17:09:23 +02008248 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
8249 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008250 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008251
Thomas Wouters477c8d52006-05-27 19:21:47 +00008252 result = stringlib_rfind_slice(
8253 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8254 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8255 start, end
8256 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008257
8258 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008259
Christian Heimes217cfd12007-12-02 14:31:20 +00008260 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008261}
8262
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008263PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008264 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008265\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008266Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008267
8268static PyObject *
8269unicode_rindex(PyUnicodeObject *self, PyObject *args)
8270{
Jesus Ceaac451502011-04-20 17:09:23 +02008271 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008272 Py_ssize_t start;
8273 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008274 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008275
Jesus Ceaac451502011-04-20 17:09:23 +02008276 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
8277 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008278 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008279
Thomas Wouters477c8d52006-05-27 19:21:47 +00008280 result = stringlib_rfind_slice(
8281 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8282 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8283 start, end
8284 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008285
8286 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008287
Guido van Rossumd57fd912000-03-10 22:53:23 +00008288 if (result < 0) {
8289 PyErr_SetString(PyExc_ValueError, "substring not found");
8290 return NULL;
8291 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008292 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008293}
8294
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008295PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008296 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008297\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008298Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008299done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008300
8301static PyObject *
8302unicode_rjust(PyUnicodeObject *self, PyObject *args)
8303{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008304 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008305 Py_UNICODE fillchar = ' ';
8306
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008307 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008308 return NULL;
8309
Tim Peters7a29bd52001-09-12 03:03:31 +00008310 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008311 Py_INCREF(self);
8312 return (PyObject*) self;
8313 }
8314
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008315 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008316}
8317
Guido van Rossumd57fd912000-03-10 22:53:23 +00008318PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008319 PyObject *sep,
8320 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008321{
8322 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008323
Guido van Rossumd57fd912000-03-10 22:53:23 +00008324 s = PyUnicode_FromObject(s);
8325 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008326 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008327 if (sep != NULL) {
8328 sep = PyUnicode_FromObject(sep);
8329 if (sep == NULL) {
8330 Py_DECREF(s);
8331 return NULL;
8332 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008333 }
8334
8335 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8336
8337 Py_DECREF(s);
8338 Py_XDECREF(sep);
8339 return result;
8340}
8341
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008342PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008343 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008344\n\
8345Return a list of the words in S, using sep as the\n\
8346delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008347splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008348whitespace string is a separator and empty strings are\n\
8349removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008350
8351static PyObject*
8352unicode_split(PyUnicodeObject *self, PyObject *args)
8353{
8354 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008355 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008356
Martin v. Löwis18e16552006-02-15 17:27:45 +00008357 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008358 return NULL;
8359
8360 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008361 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008362 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008363 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008364 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008365 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008366}
8367
Thomas Wouters477c8d52006-05-27 19:21:47 +00008368PyObject *
8369PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8370{
8371 PyObject* str_obj;
8372 PyObject* sep_obj;
8373 PyObject* out;
8374
8375 str_obj = PyUnicode_FromObject(str_in);
8376 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008377 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008378 sep_obj = PyUnicode_FromObject(sep_in);
8379 if (!sep_obj) {
8380 Py_DECREF(str_obj);
8381 return NULL;
8382 }
8383
8384 out = stringlib_partition(
8385 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8386 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8387 );
8388
8389 Py_DECREF(sep_obj);
8390 Py_DECREF(str_obj);
8391
8392 return out;
8393}
8394
8395
8396PyObject *
8397PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8398{
8399 PyObject* str_obj;
8400 PyObject* sep_obj;
8401 PyObject* out;
8402
8403 str_obj = PyUnicode_FromObject(str_in);
8404 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008405 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008406 sep_obj = PyUnicode_FromObject(sep_in);
8407 if (!sep_obj) {
8408 Py_DECREF(str_obj);
8409 return NULL;
8410 }
8411
8412 out = stringlib_rpartition(
8413 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8414 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8415 );
8416
8417 Py_DECREF(sep_obj);
8418 Py_DECREF(str_obj);
8419
8420 return out;
8421}
8422
8423PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008424 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008425\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008426Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008427the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008428found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008429
8430static PyObject*
8431unicode_partition(PyUnicodeObject *self, PyObject *separator)
8432{
8433 return PyUnicode_Partition((PyObject *)self, separator);
8434}
8435
8436PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti4c81fbb2010-01-25 12:02:24 +00008437 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008438\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008439Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008440the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008441separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008442
8443static PyObject*
8444unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8445{
8446 return PyUnicode_RPartition((PyObject *)self, separator);
8447}
8448
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008449PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008450 PyObject *sep,
8451 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008452{
8453 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008454
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008455 s = PyUnicode_FromObject(s);
8456 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008457 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008458 if (sep != NULL) {
8459 sep = PyUnicode_FromObject(sep);
8460 if (sep == NULL) {
8461 Py_DECREF(s);
8462 return NULL;
8463 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008464 }
8465
8466 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8467
8468 Py_DECREF(s);
8469 Py_XDECREF(sep);
8470 return result;
8471}
8472
8473PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008474 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008475\n\
8476Return a list of the words in S, using sep as the\n\
8477delimiter string, starting at the end of the string and\n\
8478working to the front. If maxsplit is given, at most maxsplit\n\
8479splits are done. If sep is not specified, any whitespace string\n\
8480is a separator.");
8481
8482static PyObject*
8483unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8484{
8485 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008486 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008487
Martin v. Löwis18e16552006-02-15 17:27:45 +00008488 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008489 return NULL;
8490
8491 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008492 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008493 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008494 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008495 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008496 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008497}
8498
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008499PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008500 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008501\n\
8502Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008503Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008504is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008505
8506static PyObject*
8507unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8508{
Guido van Rossum86662912000-04-11 15:38:46 +00008509 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008510
Guido van Rossum86662912000-04-11 15:38:46 +00008511 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008512 return NULL;
8513
Guido van Rossum86662912000-04-11 15:38:46 +00008514 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008515}
8516
8517static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008518PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008519{
Walter Dörwald346737f2007-05-31 10:44:43 +00008520 if (PyUnicode_CheckExact(self)) {
8521 Py_INCREF(self);
8522 return self;
8523 } else
8524 /* Subtype -- return genuine unicode string with the same value. */
8525 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8526 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008527}
8528
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008529PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008530 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008531\n\
8532Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008533and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008534
8535static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008536unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008537{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008538 return fixup(self, fixswapcase);
8539}
8540
Georg Brandlceee0772007-11-27 23:48:05 +00008541PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008542 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008543\n\
8544Return a translation table usable for str.translate().\n\
8545If there is only one argument, it must be a dictionary mapping Unicode\n\
8546ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008547Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008548If there are two arguments, they must be strings of equal length, and\n\
8549in the resulting dictionary, each character in x will be mapped to the\n\
8550character at the same position in y. If there is a third argument, it\n\
8551must be a string, whose characters will be mapped to None in the result.");
8552
8553static PyObject*
8554unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8555{
8556 PyObject *x, *y = NULL, *z = NULL;
8557 PyObject *new = NULL, *key, *value;
8558 Py_ssize_t i = 0;
8559 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008560
Georg Brandlceee0772007-11-27 23:48:05 +00008561 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8562 return NULL;
8563 new = PyDict_New();
8564 if (!new)
8565 return NULL;
8566 if (y != NULL) {
8567 /* x must be a string too, of equal length */
8568 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8569 if (!PyUnicode_Check(x)) {
8570 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8571 "be a string if there is a second argument");
8572 goto err;
8573 }
8574 if (PyUnicode_GET_SIZE(x) != ylen) {
8575 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8576 "arguments must have equal length");
8577 goto err;
8578 }
8579 /* create entries for translating chars in x to those in y */
8580 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008581 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8582 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008583 if (!key || !value)
8584 goto err;
8585 res = PyDict_SetItem(new, key, value);
8586 Py_DECREF(key);
8587 Py_DECREF(value);
8588 if (res < 0)
8589 goto err;
8590 }
8591 /* create entries for deleting chars in z */
8592 if (z != NULL) {
8593 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008594 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008595 if (!key)
8596 goto err;
8597 res = PyDict_SetItem(new, key, Py_None);
8598 Py_DECREF(key);
8599 if (res < 0)
8600 goto err;
8601 }
8602 }
8603 } else {
8604 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008605 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008606 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8607 "to maketrans it must be a dict");
8608 goto err;
8609 }
8610 /* copy entries into the new dict, converting string keys to int keys */
8611 while (PyDict_Next(x, &i, &key, &value)) {
8612 if (PyUnicode_Check(key)) {
8613 /* convert string keys to integer keys */
8614 PyObject *newkey;
8615 if (PyUnicode_GET_SIZE(key) != 1) {
8616 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8617 "table must be of length 1");
8618 goto err;
8619 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008620 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008621 if (!newkey)
8622 goto err;
8623 res = PyDict_SetItem(new, newkey, value);
8624 Py_DECREF(newkey);
8625 if (res < 0)
8626 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008627 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008628 /* just keep integer keys */
8629 if (PyDict_SetItem(new, key, value) < 0)
8630 goto err;
8631 } else {
8632 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8633 "be strings or integers");
8634 goto err;
8635 }
8636 }
8637 }
8638 return new;
8639 err:
8640 Py_DECREF(new);
8641 return NULL;
8642}
8643
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008644PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008645 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008646\n\
8647Return a copy of the string S, where all characters have been mapped\n\
8648through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008649Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008650Unmapped characters are left untouched. Characters mapped to None\n\
8651are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008652
8653static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008654unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008655{
Georg Brandlceee0772007-11-27 23:48:05 +00008656 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008657}
8658
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008659PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008660 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008661\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008662Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008663
8664static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008665unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008666{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008667 return fixup(self, fixupper);
8668}
8669
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008670PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008671 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008672\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008673Pad a numeric string S with zeros on the left, to fill a field\n\
8674of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008675
8676static PyObject *
8677unicode_zfill(PyUnicodeObject *self, PyObject *args)
8678{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008679 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008680 PyUnicodeObject *u;
8681
Martin v. Löwis18e16552006-02-15 17:27:45 +00008682 Py_ssize_t width;
8683 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008684 return NULL;
8685
8686 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008687 if (PyUnicode_CheckExact(self)) {
8688 Py_INCREF(self);
8689 return (PyObject*) self;
8690 }
8691 else
8692 return PyUnicode_FromUnicode(
8693 PyUnicode_AS_UNICODE(self),
8694 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00008695 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008696 }
8697
8698 fill = width - self->length;
8699
8700 u = pad(self, fill, 0, '0');
8701
Walter Dörwald068325e2002-04-15 13:36:47 +00008702 if (u == NULL)
8703 return NULL;
8704
Guido van Rossumd57fd912000-03-10 22:53:23 +00008705 if (u->str[fill] == '+' || u->str[fill] == '-') {
8706 /* move sign to beginning of string */
8707 u->str[0] = u->str[fill];
8708 u->str[fill] = '0';
8709 }
8710
8711 return (PyObject*) u;
8712}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008713
#if 0
/* Debugging helper, compiled out: returns the file-level counter
   numfree as a Python int. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    return PyLong_FromLong(numfree);
}
#endif
8721
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008722PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008723 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008724\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008725Return True if S starts with the specified prefix, False otherwise.\n\
8726With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008727With optional end, stop comparing S at that position.\n\
8728prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008729
8730static PyObject *
8731unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008732 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008733{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008734 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008735 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008736 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008737 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008738 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008739
Jesus Ceaac451502011-04-20 17:09:23 +02008740 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00008741 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008742 if (PyTuple_Check(subobj)) {
8743 Py_ssize_t i;
8744 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8745 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008746 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008747 if (substring == NULL)
8748 return NULL;
8749 result = tailmatch(self, substring, start, end, -1);
8750 Py_DECREF(substring);
8751 if (result) {
8752 Py_RETURN_TRUE;
8753 }
8754 }
8755 /* nothing matched */
8756 Py_RETURN_FALSE;
8757 }
8758 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +03008759 if (substring == NULL) {
8760 if (PyErr_ExceptionMatches(PyExc_TypeError))
8761 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
8762 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00008763 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +03008764 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008765 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008766 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008767 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008768}
8769
8770
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008771PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008772 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008773\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008774Return True if S ends with the specified suffix, False otherwise.\n\
8775With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008776With optional end, stop comparing S at that position.\n\
8777suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008778
8779static PyObject *
8780unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008781 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008782{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008783 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008784 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008785 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008786 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008787 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008788
Jesus Ceaac451502011-04-20 17:09:23 +02008789 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00008790 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008791 if (PyTuple_Check(subobj)) {
8792 Py_ssize_t i;
8793 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8794 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008795 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008796 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008797 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008798 result = tailmatch(self, substring, start, end, +1);
8799 Py_DECREF(substring);
8800 if (result) {
8801 Py_RETURN_TRUE;
8802 }
8803 }
8804 Py_RETURN_FALSE;
8805 }
8806 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +03008807 if (substring == NULL) {
8808 if (PyErr_ExceptionMatches(PyExc_TypeError))
8809 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
8810 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00008811 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +03008812 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008813 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008814 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008815 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008816}
8817
Eric Smith8c663262007-08-25 02:26:07 +00008818#include "stringlib/string_format.h"
8819
8820PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008821 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008822\n\
Eric Smith16562f42010-11-06 19:29:45 +00008823Return a formatted version of S, using substitutions from args and kwargs.\n\
8824The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +00008825
Eric Smith4a7d76d2008-05-30 18:10:19 +00008826static PyObject *
8827unicode__format__(PyObject* self, PyObject* args)
8828{
8829 PyObject *format_spec;
8830
8831 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8832 return NULL;
8833
8834 return _PyUnicode_FormatAdvanced(self,
8835 PyUnicode_AS_UNICODE(format_spec),
8836 PyUnicode_GET_SIZE(format_spec));
8837}
8838
Eric Smith8c663262007-08-25 02:26:07 +00008839PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008840 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008841\n\
Eric Smith16562f42010-11-06 19:29:45 +00008842Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +00008843
8844static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008845unicode__sizeof__(PyUnicodeObject *v)
8846{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00008847 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8848 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008849}
8850
8851PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008852 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008853
8854static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008855unicode_getnewargs(PyUnicodeObject *v)
8856{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008857 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008858}
8859
8860
Guido van Rossumd57fd912000-03-10 22:53:23 +00008861static PyMethodDef unicode_methods[] = {
8862
8863 /* Order is according to common usage: often used methods should
8864 appear first, since lookup is done sequentially. */
8865
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008866 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
8867 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8868 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008869 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008870 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8871 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8872 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8873 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8874 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8875 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8876 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008877 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008878 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8879 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8880 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008881 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008882 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8883 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8884 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008885 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008886 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008887 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008888 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008889 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8890 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8891 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8892 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8893 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8894 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8895 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8896 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8897 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8898 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8899 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8900 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8901 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8902 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008903 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00008904 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008905 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008906 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00008907 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008908 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8909 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Georg Brandlceee0772007-11-27 23:48:05 +00008910 {"maketrans", (PyCFunction) unicode_maketrans,
8911 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008912 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008913#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008914 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008915#endif
8916
8917#if 0
8918 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008919 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008920#endif
8921
Benjamin Peterson14339b62009-01-31 16:36:08 +00008922 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008923 {NULL, NULL}
8924};
8925
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008926static PyObject *
8927unicode_mod(PyObject *v, PyObject *w)
8928{
Benjamin Peterson29060642009-01-31 22:14:21 +00008929 if (!PyUnicode_Check(v)) {
8930 Py_INCREF(Py_NotImplemented);
8931 return Py_NotImplemented;
8932 }
8933 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008934}
8935
8936static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008937 0, /*nb_add*/
8938 0, /*nb_subtract*/
8939 0, /*nb_multiply*/
8940 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008941};
8942
Guido van Rossumd57fd912000-03-10 22:53:23 +00008943static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008944 (lenfunc) unicode_length, /* sq_length */
8945 PyUnicode_Concat, /* sq_concat */
8946 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8947 (ssizeargfunc) unicode_getitem, /* sq_item */
8948 0, /* sq_slice */
8949 0, /* sq_ass_item */
8950 0, /* sq_ass_slice */
8951 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008952};
8953
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008954static PyObject*
8955unicode_subscript(PyUnicodeObject* self, PyObject* item)
8956{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008957 if (PyIndex_Check(item)) {
8958 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008959 if (i == -1 && PyErr_Occurred())
8960 return NULL;
8961 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008962 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008963 return unicode_getitem(self, i);
8964 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008965 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008966 Py_UNICODE* source_buf;
8967 Py_UNICODE* result_buf;
8968 PyObject* result;
8969
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008970 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00008971 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008972 return NULL;
8973 }
8974
8975 if (slicelength <= 0) {
8976 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008977 } else if (start == 0 && step == 1 && slicelength == self->length &&
8978 PyUnicode_CheckExact(self)) {
8979 Py_INCREF(self);
8980 return (PyObject *)self;
8981 } else if (step == 1) {
8982 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008983 } else {
8984 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00008985 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8986 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008987
Benjamin Peterson29060642009-01-31 22:14:21 +00008988 if (result_buf == NULL)
8989 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008990
8991 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8992 result_buf[i] = source_buf[cur];
8993 }
Tim Petersced69f82003-09-16 20:30:58 +00008994
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008995 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00008996 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008997 return result;
8998 }
8999 } else {
9000 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
9001 return NULL;
9002 }
9003}
9004
9005static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009006 (lenfunc)unicode_length, /* mp_length */
9007 (binaryfunc)unicode_subscript, /* mp_subscript */
9008 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009009};
9010
Guido van Rossumd57fd912000-03-10 22:53:23 +00009011
Guido van Rossumd57fd912000-03-10 22:53:23 +00009012/* Helpers for PyUnicode_Format() */
9013
9014static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00009015getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009016{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009017 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009018 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009019 (*p_argidx)++;
9020 if (arglen < 0)
9021 return args;
9022 else
9023 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009024 }
9025 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009026 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009027 return NULL;
9028}
9029
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009030/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009031
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009032static PyObject *
9033formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009034{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009035 char *p;
9036 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009037 double x;
Tim Petersced69f82003-09-16 20:30:58 +00009038
Guido van Rossumd57fd912000-03-10 22:53:23 +00009039 x = PyFloat_AsDouble(v);
9040 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009041 return NULL;
9042
Guido van Rossumd57fd912000-03-10 22:53:23 +00009043 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009044 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00009045
Eric Smith0923d1d2009-04-16 20:16:10 +00009046 p = PyOS_double_to_string(x, type, prec,
9047 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009048 if (p == NULL)
9049 return NULL;
9050 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00009051 PyMem_Free(p);
9052 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009053}
9054
Tim Peters38fd5b62000-09-21 05:43:11 +00009055static PyObject*
9056formatlong(PyObject *val, int flags, int prec, int type)
9057{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009058 char *buf;
9059 int len;
9060 PyObject *str; /* temporary string object. */
9061 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009062
Benjamin Peterson14339b62009-01-31 16:36:08 +00009063 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
9064 if (!str)
9065 return NULL;
9066 result = PyUnicode_FromStringAndSize(buf, len);
9067 Py_DECREF(str);
9068 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009069}
9070
Guido van Rossumd57fd912000-03-10 22:53:23 +00009071static int
9072formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009073 size_t buflen,
9074 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009075{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009076 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009077 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009078 if (PyUnicode_GET_SIZE(v) == 1) {
9079 buf[0] = PyUnicode_AS_UNICODE(v)[0];
9080 buf[1] = '\0';
9081 return 1;
9082 }
9083#ifndef Py_UNICODE_WIDE
9084 if (PyUnicode_GET_SIZE(v) == 2) {
9085 /* Decode a valid surrogate pair */
9086 int c0 = PyUnicode_AS_UNICODE(v)[0];
9087 int c1 = PyUnicode_AS_UNICODE(v)[1];
9088 if (0xD800 <= c0 && c0 <= 0xDBFF &&
9089 0xDC00 <= c1 && c1 <= 0xDFFF) {
9090 buf[0] = c0;
9091 buf[1] = c1;
9092 buf[2] = '\0';
9093 return 2;
9094 }
9095 }
9096#endif
9097 goto onError;
9098 }
9099 else {
9100 /* Integer input truncated to a character */
9101 long x;
9102 x = PyLong_AsLong(v);
9103 if (x == -1 && PyErr_Occurred())
9104 goto onError;
9105
9106 if (x < 0 || x > 0x10ffff) {
9107 PyErr_SetString(PyExc_OverflowError,
9108 "%c arg not in range(0x110000)");
9109 return -1;
9110 }
9111
9112#ifndef Py_UNICODE_WIDE
9113 if (x > 0xffff) {
9114 x -= 0x10000;
9115 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
9116 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
9117 return 2;
9118 }
9119#endif
9120 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009121 buf[1] = '\0';
9122 return 1;
9123 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009124
Benjamin Peterson29060642009-01-31 22:14:21 +00009125 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009126 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009127 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009128 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009129}
9130
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009131/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009132 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009133*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009134#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009135
Guido van Rossumd57fd912000-03-10 22:53:23 +00009136PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson29060642009-01-31 22:14:21 +00009137 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009138{
9139 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009140 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009141 int args_owned = 0;
9142 PyUnicodeObject *result = NULL;
9143 PyObject *dict = NULL;
9144 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00009145
Guido van Rossumd57fd912000-03-10 22:53:23 +00009146 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009147 PyErr_BadInternalCall();
9148 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009149 }
9150 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00009151 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009152 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009153 fmt = PyUnicode_AS_UNICODE(uformat);
9154 fmtcnt = PyUnicode_GET_SIZE(uformat);
9155
9156 reslen = rescnt = fmtcnt + 100;
9157 result = _PyUnicode_New(reslen);
9158 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009159 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009160 res = PyUnicode_AS_UNICODE(result);
9161
9162 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009163 arglen = PyTuple_Size(args);
9164 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009165 }
9166 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009167 arglen = -1;
9168 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009169 }
Christian Heimes90aa7642007-12-19 02:45:37 +00009170 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00009171 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00009172 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009173
9174 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009175 if (*fmt != '%') {
9176 if (--rescnt < 0) {
9177 rescnt = fmtcnt + 100;
9178 reslen += rescnt;
9179 if (_PyUnicode_Resize(&result, reslen) < 0)
9180 goto onError;
9181 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9182 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009183 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009184 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009185 }
9186 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009187 /* Got a format specifier */
9188 int flags = 0;
9189 Py_ssize_t width = -1;
9190 int prec = -1;
9191 Py_UNICODE c = '\0';
9192 Py_UNICODE fill;
9193 int isnumok;
9194 PyObject *v = NULL;
9195 PyObject *temp = NULL;
9196 Py_UNICODE *pbuf;
9197 Py_UNICODE sign;
9198 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009199 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009200
Benjamin Peterson29060642009-01-31 22:14:21 +00009201 fmt++;
9202 if (*fmt == '(') {
9203 Py_UNICODE *keystart;
9204 Py_ssize_t keylen;
9205 PyObject *key;
9206 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009207
Benjamin Peterson29060642009-01-31 22:14:21 +00009208 if (dict == NULL) {
9209 PyErr_SetString(PyExc_TypeError,
9210 "format requires a mapping");
9211 goto onError;
9212 }
9213 ++fmt;
9214 --fmtcnt;
9215 keystart = fmt;
9216 /* Skip over balanced parentheses */
9217 while (pcount > 0 && --fmtcnt >= 0) {
9218 if (*fmt == ')')
9219 --pcount;
9220 else if (*fmt == '(')
9221 ++pcount;
9222 fmt++;
9223 }
9224 keylen = fmt - keystart - 1;
9225 if (fmtcnt < 0 || pcount > 0) {
9226 PyErr_SetString(PyExc_ValueError,
9227 "incomplete format key");
9228 goto onError;
9229 }
9230#if 0
9231 /* keys are converted to strings using UTF-8 and
9232 then looked up since Python uses strings to hold
9233 variables names etc. in its namespaces and we
9234 wouldn't want to break common idioms. */
9235 key = PyUnicode_EncodeUTF8(keystart,
9236 keylen,
9237 NULL);
9238#else
9239 key = PyUnicode_FromUnicode(keystart, keylen);
9240#endif
9241 if (key == NULL)
9242 goto onError;
9243 if (args_owned) {
9244 Py_DECREF(args);
9245 args_owned = 0;
9246 }
9247 args = PyObject_GetItem(dict, key);
9248 Py_DECREF(key);
9249 if (args == NULL) {
9250 goto onError;
9251 }
9252 args_owned = 1;
9253 arglen = -1;
9254 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009255 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009256 while (--fmtcnt >= 0) {
9257 switch (c = *fmt++) {
9258 case '-': flags |= F_LJUST; continue;
9259 case '+': flags |= F_SIGN; continue;
9260 case ' ': flags |= F_BLANK; continue;
9261 case '#': flags |= F_ALT; continue;
9262 case '0': flags |= F_ZERO; continue;
9263 }
9264 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009265 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009266 if (c == '*') {
9267 v = getnextarg(args, arglen, &argidx);
9268 if (v == NULL)
9269 goto onError;
9270 if (!PyLong_Check(v)) {
9271 PyErr_SetString(PyExc_TypeError,
9272 "* wants int");
9273 goto onError;
9274 }
9275 width = PyLong_AsLong(v);
9276 if (width == -1 && PyErr_Occurred())
9277 goto onError;
9278 if (width < 0) {
9279 flags |= F_LJUST;
9280 width = -width;
9281 }
9282 if (--fmtcnt >= 0)
9283 c = *fmt++;
9284 }
9285 else if (c >= '0' && c <= '9') {
9286 width = c - '0';
9287 while (--fmtcnt >= 0) {
9288 c = *fmt++;
9289 if (c < '0' || c > '9')
9290 break;
9291 if ((width*10) / 10 != width) {
9292 PyErr_SetString(PyExc_ValueError,
9293 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009294 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009295 }
9296 width = width*10 + (c - '0');
9297 }
9298 }
9299 if (c == '.') {
9300 prec = 0;
9301 if (--fmtcnt >= 0)
9302 c = *fmt++;
9303 if (c == '*') {
9304 v = getnextarg(args, arglen, &argidx);
9305 if (v == NULL)
9306 goto onError;
9307 if (!PyLong_Check(v)) {
9308 PyErr_SetString(PyExc_TypeError,
9309 "* wants int");
9310 goto onError;
9311 }
9312 prec = PyLong_AsLong(v);
9313 if (prec == -1 && PyErr_Occurred())
9314 goto onError;
9315 if (prec < 0)
9316 prec = 0;
9317 if (--fmtcnt >= 0)
9318 c = *fmt++;
9319 }
9320 else if (c >= '0' && c <= '9') {
9321 prec = c - '0';
9322 while (--fmtcnt >= 0) {
Stefan Krahaebd6f42010-07-19 18:01:13 +00009323 c = *fmt++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009324 if (c < '0' || c > '9')
9325 break;
9326 if ((prec*10) / 10 != prec) {
9327 PyErr_SetString(PyExc_ValueError,
9328 "prec too big");
9329 goto onError;
9330 }
9331 prec = prec*10 + (c - '0');
9332 }
9333 }
9334 } /* prec */
9335 if (fmtcnt >= 0) {
9336 if (c == 'h' || c == 'l' || c == 'L') {
9337 if (--fmtcnt >= 0)
9338 c = *fmt++;
9339 }
9340 }
9341 if (fmtcnt < 0) {
9342 PyErr_SetString(PyExc_ValueError,
9343 "incomplete format");
9344 goto onError;
9345 }
9346 if (c != '%') {
9347 v = getnextarg(args, arglen, &argidx);
9348 if (v == NULL)
9349 goto onError;
9350 }
9351 sign = 0;
9352 fill = ' ';
9353 switch (c) {
9354
9355 case '%':
9356 pbuf = formatbuf;
9357 /* presume that buffer length is at least 1 */
9358 pbuf[0] = '%';
9359 len = 1;
9360 break;
9361
9362 case 's':
9363 case 'r':
9364 case 'a':
Victor Stinnerabdb21a2010-03-22 12:53:14 +00009365 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009366 temp = v;
9367 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009368 }
9369 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009370 if (c == 's')
9371 temp = PyObject_Str(v);
9372 else if (c == 'r')
9373 temp = PyObject_Repr(v);
9374 else
9375 temp = PyObject_ASCII(v);
9376 if (temp == NULL)
9377 goto onError;
9378 if (PyUnicode_Check(temp))
9379 /* nothing to do */;
9380 else {
9381 Py_DECREF(temp);
9382 PyErr_SetString(PyExc_TypeError,
9383 "%s argument has non-string str()");
9384 goto onError;
9385 }
9386 }
9387 pbuf = PyUnicode_AS_UNICODE(temp);
9388 len = PyUnicode_GET_SIZE(temp);
9389 if (prec >= 0 && len > prec)
9390 len = prec;
9391 break;
9392
9393 case 'i':
9394 case 'd':
9395 case 'u':
9396 case 'o':
9397 case 'x':
9398 case 'X':
9399 if (c == 'i')
9400 c = 'd';
9401 isnumok = 0;
9402 if (PyNumber_Check(v)) {
9403 PyObject *iobj=NULL;
9404
9405 if (PyLong_Check(v)) {
9406 iobj = v;
9407 Py_INCREF(iobj);
9408 }
9409 else {
9410 iobj = PyNumber_Long(v);
9411 }
9412 if (iobj!=NULL) {
9413 if (PyLong_Check(iobj)) {
9414 isnumok = 1;
9415 temp = formatlong(iobj, flags, prec, c);
9416 Py_DECREF(iobj);
9417 if (!temp)
9418 goto onError;
9419 pbuf = PyUnicode_AS_UNICODE(temp);
9420 len = PyUnicode_GET_SIZE(temp);
9421 sign = 1;
9422 }
9423 else {
9424 Py_DECREF(iobj);
9425 }
9426 }
9427 }
9428 if (!isnumok) {
9429 PyErr_Format(PyExc_TypeError,
9430 "%%%c format: a number is required, "
9431 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9432 goto onError;
9433 }
9434 if (flags & F_ZERO)
9435 fill = '0';
9436 break;
9437
9438 case 'e':
9439 case 'E':
9440 case 'f':
9441 case 'F':
9442 case 'g':
9443 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009444 temp = formatfloat(v, flags, prec, c);
9445 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009446 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009447 pbuf = PyUnicode_AS_UNICODE(temp);
9448 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009449 sign = 1;
9450 if (flags & F_ZERO)
9451 fill = '0';
9452 break;
9453
9454 case 'c':
9455 pbuf = formatbuf;
9456 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9457 if (len < 0)
9458 goto onError;
9459 break;
9460
9461 default:
9462 PyErr_Format(PyExc_ValueError,
9463 "unsupported format character '%c' (0x%x) "
9464 "at index %zd",
9465 (31<=c && c<=126) ? (char)c : '?',
9466 (int)c,
9467 (Py_ssize_t)(fmt - 1 -
9468 PyUnicode_AS_UNICODE(uformat)));
9469 goto onError;
9470 }
9471 if (sign) {
9472 if (*pbuf == '-' || *pbuf == '+') {
9473 sign = *pbuf++;
9474 len--;
9475 }
9476 else if (flags & F_SIGN)
9477 sign = '+';
9478 else if (flags & F_BLANK)
9479 sign = ' ';
9480 else
9481 sign = 0;
9482 }
9483 if (width < len)
9484 width = len;
9485 if (rescnt - (sign != 0) < width) {
9486 reslen -= rescnt;
9487 rescnt = width + fmtcnt + 100;
9488 reslen += rescnt;
9489 if (reslen < 0) {
9490 Py_XDECREF(temp);
9491 PyErr_NoMemory();
9492 goto onError;
9493 }
9494 if (_PyUnicode_Resize(&result, reslen) < 0) {
9495 Py_XDECREF(temp);
9496 goto onError;
9497 }
9498 res = PyUnicode_AS_UNICODE(result)
9499 + reslen - rescnt;
9500 }
9501 if (sign) {
9502 if (fill != ' ')
9503 *res++ = sign;
9504 rescnt--;
9505 if (width > len)
9506 width--;
9507 }
9508 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9509 assert(pbuf[0] == '0');
9510 assert(pbuf[1] == c);
9511 if (fill != ' ') {
9512 *res++ = *pbuf++;
9513 *res++ = *pbuf++;
9514 }
9515 rescnt -= 2;
9516 width -= 2;
9517 if (width < 0)
9518 width = 0;
9519 len -= 2;
9520 }
9521 if (width > len && !(flags & F_LJUST)) {
9522 do {
9523 --rescnt;
9524 *res++ = fill;
9525 } while (--width > len);
9526 }
9527 if (fill == ' ') {
9528 if (sign)
9529 *res++ = sign;
9530 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9531 assert(pbuf[0] == '0');
9532 assert(pbuf[1] == c);
9533 *res++ = *pbuf++;
9534 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009535 }
9536 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009537 Py_UNICODE_COPY(res, pbuf, len);
9538 res += len;
9539 rescnt -= len;
9540 while (--width >= len) {
9541 --rescnt;
9542 *res++ = ' ';
9543 }
9544 if (dict && (argidx < arglen) && c != '%') {
9545 PyErr_SetString(PyExc_TypeError,
9546 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009547 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009548 goto onError;
9549 }
9550 Py_XDECREF(temp);
9551 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009552 } /* until end */
9553 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009554 PyErr_SetString(PyExc_TypeError,
9555 "not all arguments converted during string formatting");
9556 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009557 }
9558
Thomas Woutersa96affe2006-03-12 00:29:36 +00009559 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009560 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009561 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009562 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009563 }
9564 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009565 return (PyObject *)result;
9566
Benjamin Peterson29060642009-01-31 22:14:21 +00009567 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009568 Py_XDECREF(result);
9569 Py_DECREF(uformat);
9570 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009571 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009572 }
9573 return NULL;
9574}
9575
Jeremy Hylton938ace62002-07-17 16:30:39 +00009576static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009577unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9578
Tim Peters6d6c1a32001-08-02 04:15:00 +00009579static PyObject *
9580unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9581{
Benjamin Peterson29060642009-01-31 22:14:21 +00009582 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009583 static char *kwlist[] = {"object", "encoding", "errors", 0};
9584 char *encoding = NULL;
9585 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009586
Benjamin Peterson14339b62009-01-31 16:36:08 +00009587 if (type != &PyUnicode_Type)
9588 return unicode_subtype_new(type, args, kwds);
9589 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009590 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009591 return NULL;
9592 if (x == NULL)
9593 return (PyObject *)_PyUnicode_New(0);
9594 if (encoding == NULL && errors == NULL)
9595 return PyObject_Str(x);
9596 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009597 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009598}
9599
Guido van Rossume023fe02001-08-30 03:12:59 +00009600static PyObject *
9601unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9602{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009603 PyUnicodeObject *tmp, *pnew;
9604 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009605
Benjamin Peterson14339b62009-01-31 16:36:08 +00009606 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9607 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9608 if (tmp == NULL)
9609 return NULL;
9610 assert(PyUnicode_Check(tmp));
9611 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9612 if (pnew == NULL) {
9613 Py_DECREF(tmp);
9614 return NULL;
9615 }
9616 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9617 if (pnew->str == NULL) {
9618 _Py_ForgetReference((PyObject *)pnew);
9619 PyObject_Del(pnew);
9620 Py_DECREF(tmp);
9621 return PyErr_NoMemory();
9622 }
9623 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9624 pnew->length = n;
9625 pnew->hash = tmp->hash;
9626 Py_DECREF(tmp);
9627 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009628}
9629
/* Docstring text for the str type; referenced from the tp_doc slot of
   PyUnicode_Type. */
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009630PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +00009631 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009632\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009633Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009634encoding defaults to the current default string encoding.\n\
9635errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009636
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009637static PyObject *unicode_iter(PyObject *seq);
9638
/* Type object for "str".  Instances are created through unicode_new and
   released with PyObject_Del; the type may be subclassed
   (Py_TPFLAGS_BASETYPE) and is flagged Py_TPFLAGS_UNICODE_SUBCLASS.
   Iteration is provided by unicode_iter. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009639PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009640 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +00009641 "str", /* tp_name */
9642 sizeof(PyUnicodeObject), /* tp_basicsize */
9643 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009644 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009645 (destructor)unicode_dealloc, /* tp_dealloc */
9646 0, /* tp_print */
9647 0, /* tp_getattr */
9648 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009649 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009650 unicode_repr, /* tp_repr */
9651 &unicode_as_number, /* tp_as_number */
9652 &unicode_as_sequence, /* tp_as_sequence */
9653 &unicode_as_mapping, /* tp_as_mapping */
9654 (hashfunc) unicode_hash, /* tp_hash*/
9655 0, /* tp_call*/
9656 (reprfunc) unicode_str, /* tp_str */
9657 PyObject_GenericGetAttr, /* tp_getattro */
9658 0, /* tp_setattro */
9659 0, /* tp_as_buffer */
9660 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +00009661 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009662 unicode_doc, /* tp_doc */
9663 0, /* tp_traverse */
9664 0, /* tp_clear */
9665 PyUnicode_RichCompare, /* tp_richcompare */
9666 0, /* tp_weaklistoffset */
9667 unicode_iter, /* tp_iter */
9668 0, /* tp_iternext */
9669 unicode_methods, /* tp_methods */
9670 0, /* tp_members */
9671 0, /* tp_getset */
9672 &PyBaseObject_Type, /* tp_base */
9673 0, /* tp_dict */
9674 0, /* tp_descr_get */
9675 0, /* tp_descr_set */
9676 0, /* tp_dictoffset */
9677 0, /* tp_init */
9678 0, /* tp_alloc */
9679 unicode_new, /* tp_new */
9680 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009681};
9682
9683/* Initialize the Unicode implementation */
9684
Thomas Wouters78890102000-07-22 19:25:51 +00009685void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009686{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009687 int i;
9688
Thomas Wouters477c8d52006-05-27 19:21:47 +00009689 /* XXX - move this array to unicodectype.c ? */
9690 Py_UNICODE linebreak[] = {
9691 0x000A, /* LINE FEED */
9692 0x000D, /* CARRIAGE RETURN */
9693 0x001C, /* FILE SEPARATOR */
9694 0x001D, /* GROUP SEPARATOR */
9695 0x001E, /* RECORD SEPARATOR */
9696 0x0085, /* NEXT LINE */
9697 0x2028, /* LINE SEPARATOR */
9698 0x2029, /* PARAGRAPH SEPARATOR */
9699 };
9700
Fred Drakee4315f52000-05-09 19:53:39 +00009701 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009702 free_list = NULL;
9703 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009704 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009705 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00009706 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009707
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009708 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00009709 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009710 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009711 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009712
9713 /* initialize the linebreak bloom filter */
9714 bloom_linebreak = make_bloom_mask(
9715 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9716 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009717
9718 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009719}
9720
9721/* Finalize the Unicode implementation */
9722
Christian Heimesa156e092008-02-16 07:38:31 +00009723int
9724PyUnicode_ClearFreeList(void)
9725{
9726 int freelist_size = numfree;
9727 PyUnicodeObject *u;
9728
9729 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009730 PyUnicodeObject *v = u;
9731 u = *(PyUnicodeObject **)u;
9732 if (v->str)
9733 PyObject_DEL(v->str);
9734 Py_XDECREF(v->defenc);
9735 PyObject_Del(v);
9736 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +00009737 }
9738 free_list = NULL;
9739 assert(numfree == 0);
9740 return freelist_size;
9741}
9742
Guido van Rossumd57fd912000-03-10 22:53:23 +00009743void
Thomas Wouters78890102000-07-22 19:25:51 +00009744_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009745{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009746 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009747
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009748 Py_XDECREF(unicode_empty);
9749 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009750
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009751 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009752 if (unicode_latin1[i]) {
9753 Py_DECREF(unicode_latin1[i]);
9754 unicode_latin1[i] = NULL;
9755 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009756 }
Christian Heimesa156e092008-02-16 07:38:31 +00009757 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009758}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009759
Walter Dörwald16807132007-05-25 13:52:07 +00009760void
9761PyUnicode_InternInPlace(PyObject **p)
9762{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009763 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9764 PyObject *t;
9765 if (s == NULL || !PyUnicode_Check(s))
9766 Py_FatalError(
9767 "PyUnicode_InternInPlace: unicode strings only please!");
9768 /* If it's a subclass, we don't really know what putting
9769 it in the interned dict might do. */
9770 if (!PyUnicode_CheckExact(s))
9771 return;
9772 if (PyUnicode_CHECK_INTERNED(s))
9773 return;
9774 if (interned == NULL) {
9775 interned = PyDict_New();
9776 if (interned == NULL) {
9777 PyErr_Clear(); /* Don't leave an exception */
9778 return;
9779 }
9780 }
9781 /* It might be that the GetItem call fails even
9782 though the key is present in the dictionary,
9783 namely when this happens during a stack overflow. */
9784 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +00009785 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009786 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +00009787
Benjamin Peterson29060642009-01-31 22:14:21 +00009788 if (t) {
9789 Py_INCREF(t);
9790 Py_DECREF(*p);
9791 *p = t;
9792 return;
9793 }
Walter Dörwald16807132007-05-25 13:52:07 +00009794
Benjamin Peterson14339b62009-01-31 16:36:08 +00009795 PyThreadState_GET()->recursion_critical = 1;
9796 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9797 PyErr_Clear();
9798 PyThreadState_GET()->recursion_critical = 0;
9799 return;
9800 }
9801 PyThreadState_GET()->recursion_critical = 0;
9802 /* The two references in interned are not counted by refcnt.
9803 The deallocator will take care of this */
9804 Py_REFCNT(s) -= 2;
9805 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +00009806}
9807
9808void
9809PyUnicode_InternImmortal(PyObject **p)
9810{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009811 PyUnicode_InternInPlace(p);
9812 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9813 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9814 Py_INCREF(*p);
9815 }
Walter Dörwald16807132007-05-25 13:52:07 +00009816}
9817
9818PyObject *
9819PyUnicode_InternFromString(const char *cp)
9820{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009821 PyObject *s = PyUnicode_FromString(cp);
9822 if (s == NULL)
9823 return NULL;
9824 PyUnicode_InternInPlace(&s);
9825 return s;
Walter Dörwald16807132007-05-25 13:52:07 +00009826}
9827
9828void _Py_ReleaseInternedUnicodeStrings(void)
9829{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009830 PyObject *keys;
9831 PyUnicodeObject *s;
9832 Py_ssize_t i, n;
9833 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009834
Benjamin Peterson14339b62009-01-31 16:36:08 +00009835 if (interned == NULL || !PyDict_Check(interned))
9836 return;
9837 keys = PyDict_Keys(interned);
9838 if (keys == NULL || !PyList_Check(keys)) {
9839 PyErr_Clear();
9840 return;
9841 }
Walter Dörwald16807132007-05-25 13:52:07 +00009842
Benjamin Peterson14339b62009-01-31 16:36:08 +00009843 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9844 detector, interned unicode strings are not forcibly deallocated;
9845 rather, we give them their stolen references back, and then clear
9846 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +00009847
Benjamin Peterson14339b62009-01-31 16:36:08 +00009848 n = PyList_GET_SIZE(keys);
9849 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +00009850 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009851 for (i = 0; i < n; i++) {
9852 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9853 switch (s->state) {
9854 case SSTATE_NOT_INTERNED:
9855 /* XXX Shouldn't happen */
9856 break;
9857 case SSTATE_INTERNED_IMMORTAL:
9858 Py_REFCNT(s) += 1;
9859 immortal_size += s->length;
9860 break;
9861 case SSTATE_INTERNED_MORTAL:
9862 Py_REFCNT(s) += 2;
9863 mortal_size += s->length;
9864 break;
9865 default:
9866 Py_FatalError("Inconsistent interned string state.");
9867 }
9868 s->state = SSTATE_NOT_INTERNED;
9869 }
9870 fprintf(stderr, "total size of all interned strings: "
9871 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9872 "mortal/immortal\n", mortal_size, immortal_size);
9873 Py_DECREF(keys);
9874 PyDict_Clear(interned);
9875 Py_DECREF(interned);
9876 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +00009877}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009878
9879
9880/********************* Unicode Iterator **************************/
9881
/* State of an iterator over a unicode string: the string being iterated
   and the index of the next character to hand out. */
9882typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009883 PyObject_HEAD
9884 Py_ssize_t it_index; /* Index of the next character to return */
9885 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009886} unicodeiterobject;
9887
9888static void
9889unicodeiter_dealloc(unicodeiterobject *it)
9890{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009891 _PyObject_GC_UNTRACK(it);
9892 Py_XDECREF(it->it_seq);
9893 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009894}
9895
9896static int
9897unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9898{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009899 Py_VISIT(it->it_seq);
9900 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009901}
9902
9903static PyObject *
9904unicodeiter_next(unicodeiterobject *it)
9905{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009906 PyUnicodeObject *seq;
9907 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009908
Benjamin Peterson14339b62009-01-31 16:36:08 +00009909 assert(it != NULL);
9910 seq = it->it_seq;
9911 if (seq == NULL)
9912 return NULL;
9913 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009914
Benjamin Peterson14339b62009-01-31 16:36:08 +00009915 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
9916 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +00009917 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009918 if (item != NULL)
9919 ++it->it_index;
9920 return item;
9921 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009922
Benjamin Peterson14339b62009-01-31 16:36:08 +00009923 Py_DECREF(seq);
9924 it->it_seq = NULL;
9925 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009926}
9927
9928static PyObject *
9929unicodeiter_len(unicodeiterobject *it)
9930{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009931 Py_ssize_t len = 0;
9932 if (it->it_seq)
9933 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9934 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009935}
9936
/* Docstring and method table for unicode iterators; the only method is
   __length_hint__, implemented by unicodeiter_len and taking no
   arguments. */
9937PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9938
9939static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009940 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00009941 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +00009942 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009943};
9944
/* Type object for "str_iterator".  Instances take part in garbage
   collection (Py_TPFLAGS_HAVE_GC, unicodeiter_traverse), return themselves
   from tp_iter and produce items through unicodeiter_next. */
9945PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009946 PyVarObject_HEAD_INIT(&PyType_Type, 0)
9947 "str_iterator", /* tp_name */
9948 sizeof(unicodeiterobject), /* tp_basicsize */
9949 0, /* tp_itemsize */
9950 /* methods */
9951 (destructor)unicodeiter_dealloc, /* tp_dealloc */
9952 0, /* tp_print */
9953 0, /* tp_getattr */
9954 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009955 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009956 0, /* tp_repr */
9957 0, /* tp_as_number */
9958 0, /* tp_as_sequence */
9959 0, /* tp_as_mapping */
9960 0, /* tp_hash */
9961 0, /* tp_call */
9962 0, /* tp_str */
9963 PyObject_GenericGetAttr, /* tp_getattro */
9964 0, /* tp_setattro */
9965 0, /* tp_as_buffer */
9966 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9967 0, /* tp_doc */
9968 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9969 0, /* tp_clear */
9970 0, /* tp_richcompare */
9971 0, /* tp_weaklistoffset */
9972 PyObject_SelfIter, /* tp_iter */
9973 (iternextfunc)unicodeiter_next, /* tp_iternext */
9974 unicodeiter_methods, /* tp_methods */
9975 0, /* tp_members */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009976};
9977
9978static PyObject *
9979unicode_iter(PyObject *seq)
9980{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009981 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009982
Benjamin Peterson14339b62009-01-31 16:36:08 +00009983 if (!PyUnicode_Check(seq)) {
9984 PyErr_BadInternalCall();
9985 return NULL;
9986 }
9987 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9988 if (it == NULL)
9989 return NULL;
9990 it->it_index = 0;
9991 Py_INCREF(seq);
9992 it->it_seq = (PyUnicodeObject *)seq;
9993 _PyObject_GC_TRACK(it);
9994 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009995}
9996
Martin v. Löwis5b222132007-06-10 09:51:05 +00009997size_t
9998Py_UNICODE_strlen(const Py_UNICODE *u)
9999{
10000 int res = 0;
10001 while(*u++)
10002 res++;
10003 return res;
10004}
10005
10006Py_UNICODE*
10007Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
10008{
10009 Py_UNICODE *u = s1;
10010 while ((*u++ = *s2++));
10011 return s1;
10012}
10013
10014Py_UNICODE*
10015Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10016{
10017 Py_UNICODE *u = s1;
10018 while ((*u++ = *s2++))
10019 if (n-- == 0)
10020 break;
10021 return s1;
10022}
10023
10024int
10025Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
10026{
10027 while (*s1 && *s2 && *s1 == *s2)
10028 s1++, s2++;
10029 if (*s1 && *s2)
10030 return (*s1 < *s2) ? -1 : +1;
10031 if (*s1)
10032 return 1;
10033 if (*s2)
10034 return -1;
10035 return 0;
10036}
10037
10038Py_UNICODE*
10039Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
10040{
10041 const Py_UNICODE *p;
10042 for (p = s; *p; p++)
10043 if (*p == c)
10044 return (Py_UNICODE*)p;
10045 return NULL;
10046}
10047
10048
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010049#ifdef __cplusplus
10050}
10051#endif
10052
10053
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000010054/*
Benjamin Peterson29060642009-01-31 22:14:21 +000010055 Local variables:
10056 c-basic-offset: 4
10057 indent-tabs-mode: nil
10058 End:
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000010059*/