blob: ae401a6b62777d07a6a687ca03e35ca4176007d7 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000049#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000050#include <windows.h>
51#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000052
Guido van Rossumd57fd912000-03-10 22:53:23 +000053/* Limit for the Unicode object free list */
54
Christian Heimes2202f872008-02-06 14:31:34 +000055#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000056
57/* Limit for the Unicode object free list stay alive optimization.
58
59 The implementation will keep allocated Unicode memory intact for
60 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000061 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Christian Heimes2202f872008-02-06 14:31:34 +000063 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000065 malloc()-overhead) bytes of unused garbage.
66
67 Setting the limit to 0 effectively turns the feature off.
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069 Note: This is an experimental feature ! If you get core dumps when
70 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
72*/
73
Guido van Rossumfd4b9572000-04-10 13:51:10 +000074#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000075
76/* Endianness switches; defaults to little endian */
77
78#ifdef WORDS_BIGENDIAN
79# define BYTEORDER_IS_BIG_ENDIAN
80#else
81# define BYTEORDER_IS_LITTLE_ENDIAN
82#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Walter Dörwald16807132007-05-25 13:52:07 +000096/* This dictionary holds all interned unicode strings. Note that references
97 to strings in this dictionary are *not* counted in the string's ob_refcnt.
98 When the interned string reaches a refcnt of 0 the string deallocation
99 function will delete the reference from this dictionary.
100
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000103*/
104static PyObject *interned;
105
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000107static PyUnicodeObject *free_list;
108static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000109
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000110/* The empty Unicode object is shared to improve performance. */
111static PyUnicodeObject *unicode_empty;
112
113/* Single character Unicode strings in the Latin-1 range are being
114 shared as well. */
115static PyUnicodeObject *unicode_latin1[256];
116
Fred Drakee4315f52000-05-09 19:53:39 +0000117/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000118 parameter; it is fixed to "utf-8". Always use the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000119 PyUnicode_GetDefaultEncoding() API to access this global.
120
Alexandre Vassalotti3d2fd7f2007-10-16 00:26:33 +0000121 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000122 hard coded default!
123*/
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000124static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000125
Christian Heimes190d79e2008-01-30 11:58:22 +0000126/* Fast detection of the most frequent whitespace characters */
/* Lookup table indexed by ASCII code point (0x00-0x7F): an entry is 1 when
   the character is one of the frequent whitespace characters, 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
156
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000157static PyObject *unicode_encode_call_errorhandler(const char *errors,
158 PyObject **errorHandler,const char *encoding, const char *reason,
159 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
160 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
161
Victor Stinner31be90b2010-04-22 19:38:16 +0000162static void raise_encode_exception(PyObject **exceptionObject,
163 const char *encoding,
164 const Py_UNICODE *unicode, Py_ssize_t size,
165 Py_ssize_t startpos, Py_ssize_t endpos,
166 const char *reason);
167
Christian Heimes190d79e2008-01-30 11:58:22 +0000168/* Same for linebreaks */
/* Lookup table indexed by ASCII code point (0x00-0x7F): an entry is 1 when
   the character is treated as a line break, 0 otherwise.  The table is only
   ever read (see BLOOM_LINEBREAK), so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x2F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
195
196
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000197Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000198PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000199{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000200#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000201 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000202#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000203 /* This is actually an illegal character, so it should
204 not be passed to unichr. */
205 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000206#endif
207}
208
Thomas Wouters477c8d52006-05-27 19:21:47 +0000209/* --- Bloom Filters ----------------------------------------------------- */
210
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low-order
   bits of each Unicode character (ch & (BLOOM_WIDTH - 1)) as the bit
   index. */
214
215/* the linebreak mask is set up by Unicode_Init below */
216
Antoine Pitrouf068f942010-01-13 14:19:12 +0000217#if LONG_BIT >= 128
218#define BLOOM_WIDTH 128
219#elif LONG_BIT >= 64
220#define BLOOM_WIDTH 64
221#elif LONG_BIT >= 32
222#define BLOOM_WIDTH 32
223#else
224#error "LONG_BIT is smaller than 32"
225#endif
226
Thomas Wouters477c8d52006-05-27 19:21:47 +0000227#define BLOOM_MASK unsigned long
228
229static BLOOM_MASK bloom_linebreak;
230
Antoine Pitrouf068f942010-01-13 14:19:12 +0000231#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
232#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000233
Benjamin Peterson29060642009-01-31 22:14:21 +0000234#define BLOOM_LINEBREAK(ch) \
235 ((ch) < 128U ? ascii_linebreak[(ch)] : \
236 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000237
238Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
239{
240 /* calculate simple bloom-style bitmask for a given unicode string */
241
Antoine Pitrouf068f942010-01-13 14:19:12 +0000242 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000243 Py_ssize_t i;
244
245 mask = 0;
246 for (i = 0; i < len; i++)
Antoine Pitrouf2c54842010-01-13 08:07:53 +0000247 BLOOM_ADD(mask, ptr[i]);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000248
249 return mask;
250}
251
252Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
253{
254 Py_ssize_t i;
255
256 for (i = 0; i < setlen; i++)
257 if (set[i] == chr)
258 return 1;
259
260 return 0;
261}
262
/* Membership test: the cheap bloom pre-check first, then the exact linear
   search.  The whole expansion is parenthesized so the macro behaves as a
   single operand next to !, || or ?: in the caller. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
265
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266/* --- Unicode Object ----------------------------------------------------- */
267
268static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000269int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +0000270 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000271{
272 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000273
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000274 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000275 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000276 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000278 /* Resizing shared object (unicode_empty or single character
279 objects) in-place is not allowed. Use PyUnicode_Resize()
280 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000281
Benjamin Peterson14339b62009-01-31 16:36:08 +0000282 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000283 (unicode->length == 1 &&
284 unicode->str[0] < 256U &&
285 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000287 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 return -1;
289 }
290
Thomas Wouters477c8d52006-05-27 19:21:47 +0000291 /* We allocate one more byte to make sure the string is Ux0000 terminated.
292 The overallocation is also used by fastsearch, which assumes that it's
293 safe to look at str[length] (without making any assumptions about what
294 it contains). */
295
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000297 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000298 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000300 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000301 PyErr_NoMemory();
302 return -1;
303 }
304 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000305 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000306
Benjamin Peterson29060642009-01-31 22:14:21 +0000307 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000308 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000309 if (unicode->defenc) {
310 Py_DECREF(unicode->defenc);
311 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000312 }
313 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000314
Guido van Rossumd57fd912000-03-10 22:53:23 +0000315 return 0;
316}
317
/* We allocate one extra Py_UNICODE element to make sure the string is
   Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000321
322 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000323 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000324
325*/
326
327static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000328PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000329{
330 register PyUnicodeObject *unicode;
331
Thomas Wouters477c8d52006-05-27 19:21:47 +0000332 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000333 if (length == 0 && unicode_empty != NULL) {
334 Py_INCREF(unicode_empty);
335 return unicode_empty;
336 }
337
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000338 /* Ensure we won't overflow the size. */
339 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
340 return (PyUnicodeObject *)PyErr_NoMemory();
341 }
342
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000344 if (free_list) {
345 unicode = free_list;
346 free_list = *(PyUnicodeObject **)unicode;
347 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000348 if (unicode->str) {
349 /* Keep-Alive optimization: we only upsize the buffer,
350 never downsize it. */
351 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000352 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000353 PyObject_DEL(unicode->str);
354 unicode->str = NULL;
355 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000356 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000357 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000358 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
359 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000360 }
361 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000362 }
363 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000364 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000365 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000366 if (unicode == NULL)
367 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000368 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
369 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000370 }
371
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000372 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000373 PyErr_NoMemory();
374 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000375 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000376 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000377 * the caller fails before initializing str -- unicode_resize()
378 * reads str[0], and the Keep-Alive optimization can keep memory
379 * allocated for str alive across a call to unicode_dealloc(unicode).
380 * We don't want unicode_resize to read uninitialized memory in
381 * that case.
382 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000383 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000384 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000385 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000386 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000387 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000388 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000390
Benjamin Peterson29060642009-01-31 22:14:21 +0000391 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000392 /* XXX UNREF/NEWREF interface should be more symmetrical */
393 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000394 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000395 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000396 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000397}
398
399static
Guido van Rossum9475a232001-10-05 20:51:39 +0000400void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000401{
Walter Dörwald16807132007-05-25 13:52:07 +0000402 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000403 case SSTATE_NOT_INTERNED:
404 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000405
Benjamin Peterson29060642009-01-31 22:14:21 +0000406 case SSTATE_INTERNED_MORTAL:
407 /* revive dead object temporarily for DelItem */
408 Py_REFCNT(unicode) = 3;
409 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
410 Py_FatalError(
411 "deletion of interned string failed");
412 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000413
Benjamin Peterson29060642009-01-31 22:14:21 +0000414 case SSTATE_INTERNED_IMMORTAL:
415 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000416
Benjamin Peterson29060642009-01-31 22:14:21 +0000417 default:
418 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000419 }
420
Guido van Rossum604ddf82001-12-06 20:03:56 +0000421 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000422 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000423 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000424 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
425 PyObject_DEL(unicode->str);
426 unicode->str = NULL;
427 unicode->length = 0;
428 }
429 if (unicode->defenc) {
430 Py_DECREF(unicode->defenc);
431 unicode->defenc = NULL;
432 }
433 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000434 *(PyUnicodeObject **)unicode = free_list;
435 free_list = unicode;
436 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000437 }
438 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000439 PyObject_DEL(unicode->str);
440 Py_XDECREF(unicode->defenc);
441 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000442 }
443}
444
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000445static
446int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000447{
448 register PyUnicodeObject *v;
449
450 /* Argument checks */
451 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000452 PyErr_BadInternalCall();
453 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000454 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000455 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000456 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000457 PyErr_BadInternalCall();
458 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000459 }
460
461 /* Resizing unicode_empty and single character objects is not
462 possible since these are being shared. We simply return a fresh
463 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000464 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000465 (v == unicode_empty || v->length == 1)) {
466 PyUnicodeObject *w = _PyUnicode_New(length);
467 if (w == NULL)
468 return -1;
469 Py_UNICODE_COPY(w->str, v->str,
470 length < v->length ? length : v->length);
471 Py_DECREF(*unicode);
472 *unicode = w;
473 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000474 }
475
476 /* Note that we don't have to modify *unicode for unshared Unicode
477 objects, since we can modify them in-place. */
478 return unicode_resize(v, length);
479}
480
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000481int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
482{
483 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
484}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000485
Guido van Rossumd57fd912000-03-10 22:53:23 +0000486PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson29060642009-01-31 22:14:21 +0000487 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000488{
489 PyUnicodeObject *unicode;
490
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000491 /* If the Unicode data is known at construction time, we can apply
492 some optimizations which share commonly used objects. */
493 if (u != NULL) {
494
Benjamin Peterson29060642009-01-31 22:14:21 +0000495 /* Optimization for empty strings */
496 if (size == 0 && unicode_empty != NULL) {
497 Py_INCREF(unicode_empty);
498 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000499 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000500
501 /* Single character Unicode objects in the Latin-1 range are
502 shared when using this constructor */
503 if (size == 1 && *u < 256) {
504 unicode = unicode_latin1[*u];
505 if (!unicode) {
506 unicode = _PyUnicode_New(1);
507 if (!unicode)
508 return NULL;
509 unicode->str[0] = *u;
510 unicode_latin1[*u] = unicode;
511 }
512 Py_INCREF(unicode);
513 return (PyObject *)unicode;
514 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000515 }
Tim Petersced69f82003-09-16 20:30:58 +0000516
Guido van Rossumd57fd912000-03-10 22:53:23 +0000517 unicode = _PyUnicode_New(size);
518 if (!unicode)
519 return NULL;
520
521 /* Copy the Unicode data into the new object */
522 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000523 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000524
525 return (PyObject *)unicode;
526}
527
Walter Dörwaldd2034312007-05-18 16:29:38 +0000528PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000529{
530 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000531
Benjamin Peterson14339b62009-01-31 16:36:08 +0000532 if (size < 0) {
533 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000534 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000535 return NULL;
536 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000537
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000538 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000539 some optimizations which share commonly used objects.
540 Also, this means the input must be UTF-8, so fall back to the
541 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000542 if (u != NULL) {
543
Benjamin Peterson29060642009-01-31 22:14:21 +0000544 /* Optimization for empty strings */
545 if (size == 0 && unicode_empty != NULL) {
546 Py_INCREF(unicode_empty);
547 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000548 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000549
550 /* Single characters are shared when using this constructor.
551 Restrict to ASCII, since the input must be UTF-8. */
552 if (size == 1 && Py_CHARMASK(*u) < 128) {
553 unicode = unicode_latin1[Py_CHARMASK(*u)];
554 if (!unicode) {
555 unicode = _PyUnicode_New(1);
556 if (!unicode)
557 return NULL;
558 unicode->str[0] = Py_CHARMASK(*u);
559 unicode_latin1[Py_CHARMASK(*u)] = unicode;
560 }
561 Py_INCREF(unicode);
562 return (PyObject *)unicode;
563 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000564
565 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000566 }
567
Walter Dörwald55507312007-05-18 13:12:10 +0000568 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000569 if (!unicode)
570 return NULL;
571
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000572 return (PyObject *)unicode;
573}
574
Walter Dörwaldd2034312007-05-18 16:29:38 +0000575PyObject *PyUnicode_FromString(const char *u)
576{
577 size_t size = strlen(u);
578 if (size > PY_SSIZE_T_MAX) {
579 PyErr_SetString(PyExc_OverflowError, "input too long");
580 return NULL;
581 }
582
583 return PyUnicode_FromStringAndSize(u, size);
584}
585
Guido van Rossumd57fd912000-03-10 22:53:23 +0000586#ifdef HAVE_WCHAR_H
587
Mark Dickinson081dfee2009-03-18 14:47:41 +0000588#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
589# define CONVERT_WCHAR_TO_SURROGATES
590#endif
591
592#ifdef CONVERT_WCHAR_TO_SURROGATES
593
594/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
595 to convert from UTF32 to UTF16. */
596
597PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
598 Py_ssize_t size)
599{
600 PyUnicodeObject *unicode;
601 register Py_ssize_t i;
602 Py_ssize_t alloc;
603 const wchar_t *orig_w;
604
605 if (w == NULL) {
606 if (size == 0)
607 return PyUnicode_FromStringAndSize(NULL, 0);
608 PyErr_BadInternalCall();
609 return NULL;
610 }
611
612 if (size == -1) {
613 size = wcslen(w);
614 }
615
616 alloc = size;
617 orig_w = w;
618 for (i = size; i > 0; i--) {
619 if (*w > 0xFFFF)
620 alloc++;
621 w++;
622 }
623 w = orig_w;
624 unicode = _PyUnicode_New(alloc);
625 if (!unicode)
626 return NULL;
627
628 /* Copy the wchar_t data into the new object */
629 {
630 register Py_UNICODE *u;
631 u = PyUnicode_AS_UNICODE(unicode);
632 for (i = size; i > 0; i--) {
633 if (*w > 0xFFFF) {
634 wchar_t ordinal = *w++;
635 ordinal -= 0x10000;
636 *u++ = 0xD800 | (ordinal >> 10);
637 *u++ = 0xDC00 | (ordinal & 0x3FF);
638 }
639 else
640 *u++ = *w++;
641 }
642 }
643 return (PyObject *)unicode;
644}
645
646#else
647
Guido van Rossumd57fd912000-03-10 22:53:23 +0000648PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson29060642009-01-31 22:14:21 +0000649 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000650{
651 PyUnicodeObject *unicode;
652
653 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000654 if (size == 0)
655 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000656 PyErr_BadInternalCall();
657 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000658 }
659
Martin v. Löwis790465f2008-04-05 20:41:37 +0000660 if (size == -1) {
661 size = wcslen(w);
662 }
663
Guido van Rossumd57fd912000-03-10 22:53:23 +0000664 unicode = _PyUnicode_New(size);
665 if (!unicode)
666 return NULL;
667
668 /* Copy the wchar_t data into the new object */
669#ifdef HAVE_USABLE_WCHAR_T
670 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000671#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000672 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000673 register Py_UNICODE *u;
674 register Py_ssize_t i;
675 u = PyUnicode_AS_UNICODE(unicode);
676 for (i = size; i > 0; i--)
677 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000678 }
679#endif
680
681 return (PyObject *)unicode;
682}
683
Mark Dickinson081dfee2009-03-18 14:47:41 +0000684#endif /* CONVERT_WCHAR_TO_SURROGATES */
685
686#undef CONVERT_WCHAR_TO_SURROGATES
687
Walter Dörwald346737f2007-05-31 10:44:43 +0000688static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000689makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
690 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +0000691{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000692 *fmt++ = '%';
693 if (width) {
694 if (zeropad)
695 *fmt++ = '0';
696 fmt += sprintf(fmt, "%d", width);
697 }
698 if (precision)
699 fmt += sprintf(fmt, ".%d", precision);
700 if (longflag)
701 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000702 else if (longlongflag) {
703 /* longlongflag should only ever be nonzero on machines with
704 HAVE_LONG_LONG defined */
705#ifdef HAVE_LONG_LONG
706 char *f = PY_FORMAT_LONG_LONG;
707 while (*f)
708 *fmt++ = *f++;
709#else
710 /* we shouldn't ever get here */
711 assert(0);
712 *fmt++ = 'l';
713#endif
714 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000715 else if (size_tflag) {
716 char *f = PY_FORMAT_SIZE_T;
717 while (*f)
718 *fmt++ = *f++;
719 }
720 *fmt++ = c;
721 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000722}
723
/* Append the NUL-terminated char string `string` to the output, one
   element per char.  The macro relies on two names being in scope at the
   point of use: `copy` (a const char * scratch variable) and `s` (the
   output cursor, which is advanced past the appended characters).
   Wrapped in do/while(0) so that `appendstring(x);` is a single statement
   and is safe as the body of an if/else. */
#define appendstring(string) \
    do { for (copy = (string); *copy;) *s++ = *copy++; } while (0)

/* size of fixed-size buffer for formatting single arguments */
#define ITEM_BUFFER_LEN 21
/* maximum number of characters required for output of %ld.  21 characters
   allows for 64-bit integers (in decimal) and an optional sign. */
#define MAX_LONG_CHARS 21
/* maximum number of characters required for output of %lld.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
735
Walter Dörwaldd2034312007-05-18 16:29:38 +0000736PyObject *
737PyUnicode_FromFormatV(const char *format, va_list vargs)
738{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000739 va_list count;
740 Py_ssize_t callcount = 0;
741 PyObject **callresults = NULL;
742 PyObject **callresult = NULL;
743 Py_ssize_t n = 0;
744 int width = 0;
745 int precision = 0;
746 int zeropad;
747 const char* f;
748 Py_UNICODE *s;
749 PyObject *string;
750 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000751 char buffer[ITEM_BUFFER_LEN+1];
Benjamin Peterson14339b62009-01-31 16:36:08 +0000752 /* use abuffer instead of buffer, if we need more space
753 * (which can happen if there's a format specifier with width). */
754 char *abuffer = NULL;
755 char *realbuffer;
756 Py_ssize_t abuffersize = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000757 char fmt[61]; /* should be enough for %0width.precisionlld */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000758 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000759
760#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson14339b62009-01-31 16:36:08 +0000761 Py_MEMCPY(count, vargs, sizeof(va_list));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000762#else
763#ifdef __va_copy
Benjamin Peterson14339b62009-01-31 16:36:08 +0000764 __va_copy(count, vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000765#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000766 count = vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000767#endif
768#endif
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000769 /* step 1: count the number of %S/%R/%A/%s format specifications
770 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
771 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
772 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000773 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000774 if (*f == '%') {
775 if (*(f+1)=='%')
776 continue;
777 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')
778 ++callcount;
779 while (ISDIGIT((unsigned)*f))
780 width = (width*10) + *f++ - '0';
781 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
782 ;
783 if (*f == 's')
784 ++callcount;
785 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000786 }
787 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000788 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000789 if (callcount) {
790 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
791 if (!callresults) {
792 PyErr_NoMemory();
793 return NULL;
794 }
795 callresult = callresults;
796 }
797 /* step 3: figure out how large a buffer we need */
798 for (f = format; *f; f++) {
799 if (*f == '%') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000800#ifdef HAVE_LONG_LONG
801 int longlongflag = 0;
802#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000803 const char* p = f;
804 width = 0;
805 while (ISDIGIT((unsigned)*f))
806 width = (width*10) + *f++ - '0';
807 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
808 ;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000809
Benjamin Peterson14339b62009-01-31 16:36:08 +0000810 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
811 * they don't affect the amount of space we reserve.
812 */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000813 if (*f == 'l') {
814 if (f[1] == 'd' || f[1] == 'u') {
815 ++f;
816 }
817#ifdef HAVE_LONG_LONG
818 else if (f[1] == 'l' &&
819 (f[2] == 'd' || f[2] == 'u')) {
820 longlongflag = 1;
821 f += 2;
822 }
823#endif
824 }
825 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000826 ++f;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000827 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000828
Benjamin Peterson14339b62009-01-31 16:36:08 +0000829 switch (*f) {
830 case 'c':
831 (void)va_arg(count, int);
832 /* fall through... */
833 case '%':
834 n++;
835 break;
836 case 'd': case 'u': case 'i': case 'x':
837 (void) va_arg(count, int);
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000838#ifdef HAVE_LONG_LONG
839 if (longlongflag) {
840 if (width < MAX_LONG_LONG_CHARS)
841 width = MAX_LONG_LONG_CHARS;
842 }
843 else
844#endif
845 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
846 including sign. Decimal takes the most space. This
847 isn't enough for octal. If a width is specified we
848 need more (which we allocate later). */
849 if (width < MAX_LONG_CHARS)
850 width = MAX_LONG_CHARS;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000851 n += width;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000852 /* XXX should allow for large precision here too. */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000853 if (abuffersize < width)
854 abuffersize = width;
855 break;
856 case 's':
857 {
858 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000859 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000860 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
861 if (!str)
862 goto fail;
863 n += PyUnicode_GET_SIZE(str);
864 /* Remember the str and switch to the next slot */
865 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000866 break;
867 }
868 case 'U':
869 {
870 PyObject *obj = va_arg(count, PyObject *);
871 assert(obj && PyUnicode_Check(obj));
872 n += PyUnicode_GET_SIZE(obj);
873 break;
874 }
875 case 'V':
876 {
877 PyObject *obj = va_arg(count, PyObject *);
878 const char *str = va_arg(count, const char *);
879 assert(obj || str);
880 assert(!obj || PyUnicode_Check(obj));
881 if (obj)
882 n += PyUnicode_GET_SIZE(obj);
883 else
884 n += strlen(str);
885 break;
886 }
887 case 'S':
888 {
889 PyObject *obj = va_arg(count, PyObject *);
890 PyObject *str;
891 assert(obj);
892 str = PyObject_Str(obj);
893 if (!str)
894 goto fail;
895 n += PyUnicode_GET_SIZE(str);
896 /* Remember the str and switch to the next slot */
897 *callresult++ = str;
898 break;
899 }
900 case 'R':
901 {
902 PyObject *obj = va_arg(count, PyObject *);
903 PyObject *repr;
904 assert(obj);
905 repr = PyObject_Repr(obj);
906 if (!repr)
907 goto fail;
908 n += PyUnicode_GET_SIZE(repr);
909 /* Remember the repr and switch to the next slot */
910 *callresult++ = repr;
911 break;
912 }
913 case 'A':
914 {
915 PyObject *obj = va_arg(count, PyObject *);
916 PyObject *ascii;
917 assert(obj);
918 ascii = PyObject_ASCII(obj);
919 if (!ascii)
920 goto fail;
921 n += PyUnicode_GET_SIZE(ascii);
922 /* Remember the repr and switch to the next slot */
923 *callresult++ = ascii;
924 break;
925 }
926 case 'p':
927 (void) va_arg(count, int);
928 /* maximum 64-bit pointer representation:
929 * 0xffffffffffffffff
930 * so 19 characters is enough.
931 * XXX I count 18 -- what's the extra for?
932 */
933 n += 19;
934 break;
935 default:
936 /* if we stumble upon an unknown
937 formatting code, copy the rest of
938 the format string to the output
939 string. (we cannot just skip the
940 code, since there's no way to know
941 what's in the argument list) */
942 n += strlen(p);
943 goto expand;
944 }
945 } else
946 n++;
947 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000948 expand:
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000949 if (abuffersize > ITEM_BUFFER_LEN) {
950 /* add 1 for sprintf's trailing null byte */
951 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +0000952 if (!abuffer) {
953 PyErr_NoMemory();
954 goto fail;
955 }
956 realbuffer = abuffer;
957 }
958 else
959 realbuffer = buffer;
960 /* step 4: fill the buffer */
961 /* Since we've analyzed how much space we need for the worst case,
962 we don't have to resize the string.
963 There can be no errors beyond this point. */
964 string = PyUnicode_FromUnicode(NULL, n);
965 if (!string)
966 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000967
Benjamin Peterson14339b62009-01-31 16:36:08 +0000968 s = PyUnicode_AS_UNICODE(string);
969 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000970
Benjamin Peterson14339b62009-01-31 16:36:08 +0000971 for (f = format; *f; f++) {
972 if (*f == '%') {
973 const char* p = f++;
974 int longflag = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000975 int longlongflag = 0;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000976 int size_tflag = 0;
977 zeropad = (*f == '0');
978 /* parse the width.precision part */
979 width = 0;
980 while (ISDIGIT((unsigned)*f))
981 width = (width*10) + *f++ - '0';
982 precision = 0;
983 if (*f == '.') {
984 f++;
985 while (ISDIGIT((unsigned)*f))
986 precision = (precision*10) + *f++ - '0';
987 }
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000988 /* Handle %ld, %lu, %lld and %llu. */
989 if (*f == 'l') {
990 if (f[1] == 'd' || f[1] == 'u') {
991 longflag = 1;
992 ++f;
993 }
994#ifdef HAVE_LONG_LONG
995 else if (f[1] == 'l' &&
996 (f[2] == 'd' || f[2] == 'u')) {
997 longlongflag = 1;
998 f += 2;
999 }
1000#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001001 }
1002 /* handle the size_t flag. */
1003 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
1004 size_tflag = 1;
1005 ++f;
1006 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001007
Benjamin Peterson14339b62009-01-31 16:36:08 +00001008 switch (*f) {
1009 case 'c':
1010 *s++ = va_arg(vargs, int);
1011 break;
1012 case 'd':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001013 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1014 width, precision, 'd');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001015 if (longflag)
1016 sprintf(realbuffer, fmt, va_arg(vargs, long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001017#ifdef HAVE_LONG_LONG
1018 else if (longlongflag)
1019 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1020#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001021 else if (size_tflag)
1022 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1023 else
1024 sprintf(realbuffer, fmt, va_arg(vargs, int));
1025 appendstring(realbuffer);
1026 break;
1027 case 'u':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001028 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1029 width, precision, 'u');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001030 if (longflag)
1031 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001032#ifdef HAVE_LONG_LONG
1033 else if (longlongflag)
1034 sprintf(realbuffer, fmt, va_arg(vargs,
1035 unsigned PY_LONG_LONG));
1036#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001037 else if (size_tflag)
1038 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1039 else
1040 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1041 appendstring(realbuffer);
1042 break;
1043 case 'i':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001044 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001045 sprintf(realbuffer, fmt, va_arg(vargs, int));
1046 appendstring(realbuffer);
1047 break;
1048 case 'x':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001049 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001050 sprintf(realbuffer, fmt, va_arg(vargs, int));
1051 appendstring(realbuffer);
1052 break;
1053 case 's':
1054 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001055 /* unused, since we already have the result */
1056 (void) va_arg(vargs, char *);
1057 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1058 PyUnicode_GET_SIZE(*callresult));
1059 s += PyUnicode_GET_SIZE(*callresult);
1060 /* We're done with the unicode()/repr() => forget it */
1061 Py_DECREF(*callresult);
1062 /* switch to next unicode()/repr() result */
1063 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001064 break;
1065 }
1066 case 'U':
1067 {
1068 PyObject *obj = va_arg(vargs, PyObject *);
1069 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1070 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1071 s += size;
1072 break;
1073 }
1074 case 'V':
1075 {
1076 PyObject *obj = va_arg(vargs, PyObject *);
1077 const char *str = va_arg(vargs, const char *);
1078 if (obj) {
1079 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1080 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1081 s += size;
1082 } else {
1083 appendstring(str);
1084 }
1085 break;
1086 }
1087 case 'S':
1088 case 'R':
1089 {
1090 Py_UNICODE *ucopy;
1091 Py_ssize_t usize;
1092 Py_ssize_t upos;
1093 /* unused, since we already have the result */
1094 (void) va_arg(vargs, PyObject *);
1095 ucopy = PyUnicode_AS_UNICODE(*callresult);
1096 usize = PyUnicode_GET_SIZE(*callresult);
1097 for (upos = 0; upos<usize;)
1098 *s++ = ucopy[upos++];
1099 /* We're done with the unicode()/repr() => forget it */
1100 Py_DECREF(*callresult);
1101 /* switch to next unicode()/repr() result */
1102 ++callresult;
1103 break;
1104 }
1105 case 'p':
1106 sprintf(buffer, "%p", va_arg(vargs, void*));
1107 /* %p is ill-defined: ensure leading 0x. */
1108 if (buffer[1] == 'X')
1109 buffer[1] = 'x';
1110 else if (buffer[1] != 'x') {
1111 memmove(buffer+2, buffer, strlen(buffer)+1);
1112 buffer[0] = '0';
1113 buffer[1] = 'x';
1114 }
1115 appendstring(buffer);
1116 break;
1117 case '%':
1118 *s++ = '%';
1119 break;
1120 default:
1121 appendstring(p);
1122 goto end;
1123 }
1124 } else
1125 *s++ = *f;
1126 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001127
Benjamin Peterson29060642009-01-31 22:14:21 +00001128 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001129 if (callresults)
1130 PyObject_Free(callresults);
1131 if (abuffer)
1132 PyObject_Free(abuffer);
1133 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1134 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001135 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001136 if (callresults) {
1137 PyObject **callresult2 = callresults;
1138 while (callresult2 < callresult) {
1139 Py_DECREF(*callresult2);
1140 ++callresult2;
1141 }
1142 PyObject_Free(callresults);
1143 }
1144 if (abuffer)
1145 PyObject_Free(abuffer);
1146 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001147}
1148
1149#undef appendstring
1150
1151PyObject *
1152PyUnicode_FromFormat(const char *format, ...)
1153{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001154 PyObject* ret;
1155 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001156
1157#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001158 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001159#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001160 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001161#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001162 ret = PyUnicode_FromFormatV(format, vargs);
1163 va_end(vargs);
1164 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001165}
1166
Martin v. Löwis18e16552006-02-15 17:27:45 +00001167Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001168 wchar_t *w,
1169 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001170{
1171 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001172 PyErr_BadInternalCall();
1173 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001174 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001175
1176 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001177 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00001178 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001179
Guido van Rossumd57fd912000-03-10 22:53:23 +00001180#ifdef HAVE_USABLE_WCHAR_T
1181 memcpy(w, unicode->str, size * sizeof(wchar_t));
1182#else
1183 {
Benjamin Peterson29060642009-01-31 22:14:21 +00001184 register Py_UNICODE *u;
1185 register Py_ssize_t i;
1186 u = PyUnicode_AS_UNICODE(unicode);
1187 for (i = size; i > 0; i--)
1188 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001189 }
1190#endif
1191
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001192 if (size > PyUnicode_GET_SIZE(unicode))
1193 return PyUnicode_GET_SIZE(unicode);
1194 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001195 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001196}
1197
1198#endif
1199
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001200PyObject *PyUnicode_FromOrdinal(int ordinal)
1201{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001202 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001203
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001204 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001205 PyErr_SetString(PyExc_ValueError,
1206 "chr() arg not in range(0x110000)");
1207 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001208 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001209
1210#ifndef Py_UNICODE_WIDE
1211 if (ordinal > 0xffff) {
1212 ordinal -= 0x10000;
1213 s[0] = 0xD800 | (ordinal >> 10);
1214 s[1] = 0xDC00 | (ordinal & 0x3FF);
1215 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001216 }
1217#endif
1218
Hye-Shik Chang40574832004-04-06 07:24:51 +00001219 s[0] = (Py_UNICODE)ordinal;
1220 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001221}
1222
Guido van Rossumd57fd912000-03-10 22:53:23 +00001223PyObject *PyUnicode_FromObject(register PyObject *obj)
1224{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001225 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001226 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001227 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001228 Py_INCREF(obj);
1229 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001230 }
1231 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001232 /* For a Unicode subtype that's not a Unicode object,
1233 return a true Unicode object with the same data. */
1234 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1235 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001236 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001237 PyErr_Format(PyExc_TypeError,
1238 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001239 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001240 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001241}
1242
1243PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00001244 const char *encoding,
1245 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001246{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001247 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001248 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001249 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001250
Guido van Rossumd57fd912000-03-10 22:53:23 +00001251 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001252 PyErr_BadInternalCall();
1253 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001254 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001255
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001256 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001257 PyErr_SetString(PyExc_TypeError,
1258 "decoding str is not supported");
1259 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001260 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001261
1262 /* Coerce object */
Christian Heimes72b710a2008-05-26 13:28:38 +00001263 if (PyBytes_Check(obj)) {
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001264 s = PyBytes_AS_STRING(obj);
1265 len = PyBytes_GET_SIZE(obj);
1266 }
1267 else if (PyByteArray_Check(obj)) {
1268 s = PyByteArray_AS_STRING(obj);
1269 len = PyByteArray_GET_SIZE(obj);
1270 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001271 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001272 /* Overwrite the error message with something more useful in
1273 case of a TypeError. */
1274 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001275 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001276 "coercing to str: need string or buffer, "
1277 "%.80s found",
1278 Py_TYPE(obj)->tp_name);
1279 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001280 }
Tim Petersced69f82003-09-16 20:30:58 +00001281
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001282 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001283 if (len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001284 Py_INCREF(unicode_empty);
1285 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001286 }
Tim Petersced69f82003-09-16 20:30:58 +00001287 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001288 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001289
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001290 return v;
1291
Benjamin Peterson29060642009-01-31 22:14:21 +00001292 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001293 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001294}
1295
1296PyObject *PyUnicode_Decode(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001297 Py_ssize_t size,
1298 const char *encoding,
1299 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001300{
1301 PyObject *buffer = NULL, *unicode;
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001302 Py_buffer info;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001303 char lower[20]; /* Enough for any encoding name we recognize */
1304 char *l;
1305 const char *e;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001306
1307 if (encoding == NULL)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001308 encoding = PyUnicode_GetDefaultEncoding();
1309
1310 /* Convert encoding to lower case and replace '_' with '-' in order to
1311 catch e.g. UTF_8 */
1312 e = encoding;
1313 l = lower;
1314 while (*e && l < &lower[(sizeof lower) - 2]) {
1315 if (ISUPPER(*e)) {
1316 *l++ = TOLOWER(*e++);
1317 }
1318 else if (*e == '_') {
1319 *l++ = '-';
1320 e++;
1321 }
1322 else {
1323 *l++ = *e++;
1324 }
1325 }
1326 *l = '\0';
Fred Drakee4315f52000-05-09 19:53:39 +00001327
1328 /* Shortcuts for common default encodings */
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001329 if (strcmp(lower, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001330 return PyUnicode_DecodeUTF8(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001331 else if ((strcmp(lower, "latin-1") == 0) ||
1332 (strcmp(lower, "iso-8859-1") == 0))
Fred Drakee4315f52000-05-09 19:53:39 +00001333 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001334#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001335 else if (strcmp(lower, "mbcs") == 0)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001336 return PyUnicode_DecodeMBCS(s, size, errors);
1337#endif
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001338 else if (strcmp(lower, "ascii") == 0)
Fred Drakee4315f52000-05-09 19:53:39 +00001339 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001340 else if (strcmp(lower, "utf-16") == 0)
1341 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1342 else if (strcmp(lower, "utf-32") == 0)
1343 return PyUnicode_DecodeUTF32(s, size, errors, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001344
1345 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001346 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001347 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001348 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001349 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001350 if (buffer == NULL)
1351 goto onError;
1352 unicode = PyCodec_Decode(buffer, encoding, errors);
1353 if (unicode == NULL)
1354 goto onError;
1355 if (!PyUnicode_Check(unicode)) {
1356 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001357 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001358 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001359 Py_DECREF(unicode);
1360 goto onError;
1361 }
1362 Py_DECREF(buffer);
1363 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001364
Benjamin Peterson29060642009-01-31 22:14:21 +00001365 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001366 Py_XDECREF(buffer);
1367 return NULL;
1368}
1369
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001370PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1371 const char *encoding,
1372 const char *errors)
1373{
1374 PyObject *v;
1375
1376 if (!PyUnicode_Check(unicode)) {
1377 PyErr_BadArgument();
1378 goto onError;
1379 }
1380
1381 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001382 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001383
1384 /* Decode via the codec registry */
1385 v = PyCodec_Decode(unicode, encoding, errors);
1386 if (v == NULL)
1387 goto onError;
1388 return v;
1389
Benjamin Peterson29060642009-01-31 22:14:21 +00001390 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001391 return NULL;
1392}
1393
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001394PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1395 const char *encoding,
1396 const char *errors)
1397{
1398 PyObject *v;
1399
1400 if (!PyUnicode_Check(unicode)) {
1401 PyErr_BadArgument();
1402 goto onError;
1403 }
1404
1405 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001406 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001407
1408 /* Decode via the codec registry */
1409 v = PyCodec_Decode(unicode, encoding, errors);
1410 if (v == NULL)
1411 goto onError;
1412 if (!PyUnicode_Check(v)) {
1413 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001414 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001415 Py_TYPE(v)->tp_name);
1416 Py_DECREF(v);
1417 goto onError;
1418 }
1419 return v;
1420
Benjamin Peterson29060642009-01-31 22:14:21 +00001421 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001422 return NULL;
1423}
1424
Guido van Rossumd57fd912000-03-10 22:53:23 +00001425PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001426 Py_ssize_t size,
1427 const char *encoding,
1428 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001429{
1430 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001431
Guido van Rossumd57fd912000-03-10 22:53:23 +00001432 unicode = PyUnicode_FromUnicode(s, size);
1433 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001434 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001435 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1436 Py_DECREF(unicode);
1437 return v;
1438}
1439
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001440PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1441 const char *encoding,
1442 const char *errors)
1443{
1444 PyObject *v;
1445
1446 if (!PyUnicode_Check(unicode)) {
1447 PyErr_BadArgument();
1448 goto onError;
1449 }
1450
1451 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001452 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001453
1454 /* Encode via the codec registry */
1455 v = PyCodec_Encode(unicode, encoding, errors);
1456 if (v == NULL)
1457 goto onError;
1458 return v;
1459
Benjamin Peterson29060642009-01-31 22:14:21 +00001460 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001461 return NULL;
1462}
1463
Guido van Rossumd57fd912000-03-10 22:53:23 +00001464PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1465 const char *encoding,
1466 const char *errors)
1467{
1468 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001469
Guido van Rossumd57fd912000-03-10 22:53:23 +00001470 if (!PyUnicode_Check(unicode)) {
1471 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001472 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001473 }
Fred Drakee4315f52000-05-09 19:53:39 +00001474
Tim Petersced69f82003-09-16 20:30:58 +00001475 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001476 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001477
1478 /* Shortcuts for common default encodings */
1479 if (errors == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001480 if (strcmp(encoding, "utf-8") == 0)
1481 return PyUnicode_AsUTF8String(unicode);
1482 else if (strcmp(encoding, "latin-1") == 0)
1483 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001484#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Benjamin Peterson29060642009-01-31 22:14:21 +00001485 else if (strcmp(encoding, "mbcs") == 0)
1486 return PyUnicode_AsMBCSString(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001487#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00001488 else if (strcmp(encoding, "ascii") == 0)
1489 return PyUnicode_AsASCIIString(unicode);
Christian Heimes6a27efa2008-10-30 21:48:26 +00001490 /* During bootstrap, we may need to find the encodings
1491 package, to load the file system encoding, and require the
1492 file system encoding in order to load the encodings
1493 package.
1494
1495 Break out of this dependency by assuming that the path to
1496 the encodings module is ASCII-only. XXX could try wcstombs
1497 instead, if the file system encoding is the locale's
1498 encoding. */
1499 else if (Py_FileSystemDefaultEncoding &&
1500 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1501 !PyThreadState_GET()->interp->codecs_initialized)
Benjamin Peterson29060642009-01-31 22:14:21 +00001502 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001503 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001504
1505 /* Encode via the codec registry */
1506 v = PyCodec_Encode(unicode, encoding, errors);
1507 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001508 return NULL;
1509
1510 /* The normal path */
1511 if (PyBytes_Check(v))
1512 return v;
1513
1514 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001515 if (PyByteArray_Check(v)) {
1516 char msg[100];
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001517 PyObject *b;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001518 PyOS_snprintf(msg, sizeof(msg),
1519 "encoder %s returned buffer instead of bytes",
1520 encoding);
1521 if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001522 Py_DECREF(v);
1523 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001524 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001525
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001526 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1527 Py_DECREF(v);
1528 return b;
1529 }
1530
1531 PyErr_Format(PyExc_TypeError,
1532 "encoder did not return a bytes object (type=%.400s)",
1533 Py_TYPE(v)->tp_name);
1534 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001535 return NULL;
1536}
1537
1538PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1539 const char *encoding,
1540 const char *errors)
1541{
1542 PyObject *v;
1543
1544 if (!PyUnicode_Check(unicode)) {
1545 PyErr_BadArgument();
1546 goto onError;
1547 }
1548
1549 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001550 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001551
1552 /* Encode via the codec registry */
1553 v = PyCodec_Encode(unicode, encoding, errors);
1554 if (v == NULL)
1555 goto onError;
1556 if (!PyUnicode_Check(v)) {
1557 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001558 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001559 Py_TYPE(v)->tp_name);
1560 Py_DECREF(v);
1561 goto onError;
1562 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001563 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001564
Benjamin Peterson29060642009-01-31 22:14:21 +00001565 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001566 return NULL;
1567}
1568
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001569PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001570 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001571{
1572 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001573 if (v)
1574 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001575 if (errors != NULL)
1576 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001577 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001578 PyUnicode_GET_SIZE(unicode),
1579 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001580 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001581 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001582 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001583 return v;
1584}
1585
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001586PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001587PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001588 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001589 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1590}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001591
Christian Heimes5894ba72007-11-04 11:43:14 +00001592PyObject*
1593PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1594{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001595 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1596 can be undefined. If it is case, decode using UTF-8. The following assumes
1597 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1598 bootstrapping process where the codecs aren't ready yet.
1599 */
1600 if (Py_FileSystemDefaultEncoding) {
1601#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001602 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001603 return PyUnicode_DecodeMBCS(s, size, "replace");
1604 }
1605#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001606 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001607 return PyUnicode_DecodeUTF8(s, size, "replace");
1608 }
1609#endif
1610 return PyUnicode_Decode(s, size,
1611 Py_FileSystemDefaultEncoding,
1612 "replace");
1613 }
1614 else {
1615 return PyUnicode_DecodeUTF8(s, size, "replace");
1616 }
1617}
1618
Martin v. Löwis011e8422009-05-05 04:43:17 +00001619/* Convert the argument to a bytes object, according to the file
Gregory P. Smithcc47d8c2010-02-27 08:33:11 +00001620 system encoding. The addr param must be a PyObject**.
1621 This is designed to be used with "O&" in PyArg_Parse APIs. */
Martin v. Löwis011e8422009-05-05 04:43:17 +00001622
1623int
1624PyUnicode_FSConverter(PyObject* arg, void* addr)
1625{
1626 PyObject *output = NULL;
1627 Py_ssize_t size;
1628 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001629 if (arg == NULL) {
1630 Py_DECREF(*(PyObject**)addr);
1631 return 1;
1632 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00001633 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00001634 output = arg;
1635 Py_INCREF(output);
1636 }
1637 else {
1638 arg = PyUnicode_FromObject(arg);
1639 if (!arg)
1640 return 0;
1641 output = PyUnicode_AsEncodedObject(arg,
1642 Py_FileSystemDefaultEncoding,
Martin v. Löwis43c57782009-05-10 08:15:24 +00001643 "surrogateescape");
Martin v. Löwis011e8422009-05-05 04:43:17 +00001644 Py_DECREF(arg);
1645 if (!output)
1646 return 0;
1647 if (!PyBytes_Check(output)) {
1648 Py_DECREF(output);
1649 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1650 return 0;
1651 }
1652 }
1653 if (PyBytes_Check(output)) {
1654 size = PyBytes_GET_SIZE(output);
1655 data = PyBytes_AS_STRING(output);
1656 }
1657 else {
1658 size = PyByteArray_GET_SIZE(output);
1659 data = PyByteArray_AS_STRING(output);
1660 }
1661 if (size != strlen(data)) {
1662 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1663 Py_DECREF(output);
1664 return 0;
1665 }
1666 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001667 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001668}
1669
1670
Martin v. Löwis5b222132007-06-10 09:51:05 +00001671char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001672_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001673{
Christian Heimesf3863112007-11-22 07:46:41 +00001674 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001675 if (!PyUnicode_Check(unicode)) {
1676 PyErr_BadArgument();
1677 return NULL;
1678 }
Christian Heimesf3863112007-11-22 07:46:41 +00001679 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1680 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001681 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001682 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001683 *psize = PyBytes_GET_SIZE(bytes);
1684 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001685}
1686
/* Convenience form of _PyUnicode_AsStringAndSize() for callers that do
   not need the length: passes NULL for psize and returns its result
   unchanged (NULL on failure). */
char*
_PyUnicode_AsString(PyObject *unicode)
{
    return _PyUnicode_AsStringAndSize(unicode, NULL);
}
1692
Guido van Rossumd57fd912000-03-10 22:53:23 +00001693Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1694{
1695 if (!PyUnicode_Check(unicode)) {
1696 PyErr_BadArgument();
1697 goto onError;
1698 }
1699 return PyUnicode_AS_UNICODE(unicode);
1700
Benjamin Peterson29060642009-01-31 22:14:21 +00001701 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001702 return NULL;
1703}
1704
Martin v. Löwis18e16552006-02-15 17:27:45 +00001705Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001706{
1707 if (!PyUnicode_Check(unicode)) {
1708 PyErr_BadArgument();
1709 goto onError;
1710 }
1711 return PyUnicode_GET_SIZE(unicode);
1712
Benjamin Peterson29060642009-01-31 22:14:21 +00001713 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001714 return -1;
1715}
1716
/* Return the name of the default encoding, i.e. the file-level
   unicode_default_encoding value.  The returned pointer is that
   variable's value, not a copy. */
const char *PyUnicode_GetDefaultEncoding(void)
{
    return unicode_default_encoding;
}
1721
1722int PyUnicode_SetDefaultEncoding(const char *encoding)
1723{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001724 if (strcmp(encoding, unicode_default_encoding) != 0) {
1725 PyErr_Format(PyExc_ValueError,
1726 "Can only set default encoding to %s",
1727 unicode_default_encoding);
1728 return -1;
1729 }
Fred Drakee4315f52000-05-09 19:53:39 +00001730 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001731}
1732
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001733/* error handling callback helper:
1734 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001735 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001736 and adjust various state variables.
1737 return 0 on success, -1 on error
1738*/
1739
1740static
1741int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00001742 const char *encoding, const char *reason,
1743 const char **input, const char **inend, Py_ssize_t *startinpos,
1744 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1745 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001746{
Benjamin Peterson142957c2008-07-04 19:55:29 +00001747 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001748
1749 PyObject *restuple = NULL;
1750 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001751 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001752 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001753 Py_ssize_t requiredsize;
1754 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001755 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001756 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001757 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001758 int res = -1;
1759
1760 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001761 *errorHandler = PyCodec_LookupError(errors);
1762 if (*errorHandler == NULL)
1763 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001764 }
1765
1766 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001767 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00001768 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
1769 if (*exceptionObject == NULL)
1770 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001771 }
1772 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00001773 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1774 goto onError;
1775 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1776 goto onError;
1777 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1778 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001779 }
1780
1781 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1782 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001783 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001784 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00001785 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00001786 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001787 }
1788 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00001789 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001790
1791 /* Copy back the bytes variables, which might have been modified by the
1792 callback */
1793 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1794 if (!inputobj)
1795 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00001796 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001797 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00001798 }
Christian Heimes72b710a2008-05-26 13:28:38 +00001799 *input = PyBytes_AS_STRING(inputobj);
1800 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001801 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001802 /* we can DECREF safely, as the exception has another reference,
1803 so the object won't go away. */
1804 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001805
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001806 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001807 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001808 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001809 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1810 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001811 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001812
1813 /* need more space? (at least enough for what we
1814 have+the replacement+the rest of the string (starting
1815 at the new input position), so we won't have to check space
1816 when there are no errors in the rest of the string) */
1817 repptr = PyUnicode_AS_UNICODE(repunicode);
1818 repsize = PyUnicode_GET_SIZE(repunicode);
1819 requiredsize = *outpos + repsize + insize-newpos;
1820 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001821 if (requiredsize<2*outsize)
1822 requiredsize = 2*outsize;
1823 if (_PyUnicode_Resize(output, requiredsize) < 0)
1824 goto onError;
1825 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001826 }
1827 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001828 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001829 Py_UNICODE_COPY(*outptr, repptr, repsize);
1830 *outptr += repsize;
1831 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001832
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001833 /* we made it! */
1834 res = 0;
1835
Benjamin Peterson29060642009-01-31 22:14:21 +00001836 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001837 Py_XDECREF(restuple);
1838 return res;
1839}
1840
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001841/* --- UTF-7 Codec -------------------------------------------------------- */
1842
Antoine Pitrou244651a2009-05-04 18:56:13 +00001843/* See RFC2152 for details. We encode conservatively and decode liberally. */
1844
/* Three simple macros defining base-64.
 *
 * Note: IS_BASE64 and FROM_BASE64 expand their argument several times,
 * so the argument must not have side effects. */

/* Is c a base-64 character?  (A-Z, a-z, 0-9, '+' or '/') */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value?
 * A-Z map to 0-25, a-z to 26-51, 0-9 to 52-61, '+' to 62; every other
 * value (expected to be '/') yields 63, so the caller has to check
 * IS_BASE64(c) first. */

#define FROM_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n?
 * Higher bits of n are masked off before the table lookup. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string.  Values above 127 never decode directly. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
1875
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is only ever
 * read, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
1909
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c:        character code; only 1..127 can ever be direct (NUL and
 *           everything from 128 up is rejected before the table lookup).
 * directO:  non-zero to emit Set O characters (category 1) as themselves.
 * directWS: non-zero to emit whitespace (category 2) as itself.
 *
 * Set D characters (category 0) are always direct.  c is expanded
 * several times, and directO/directWS are expanded without
 * parentheses, so pass simple, side-effect free expressions. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001921
/* Decode size bytes of UTF-7 data starting at s.  This is the
   non-incremental entry point: it calls PyUnicode_DecodeUTF7Stateful()
   with consumed == NULL and returns its result unchanged. */
PyObject *PyUnicode_DecodeUTF7(const char *s,
                               Py_ssize_t size,
                               const char *errors)
{
    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
}
1928
Antoine Pitrou244651a2009-05-04 18:56:13 +00001929/* The decoder. The only state we preserve is our read position,
1930 * i.e. how many characters we have consumed. So if we end in the
1931 * middle of a shift sequence we have to back off the read position
1932 * and the output to the beginning of the sequence, otherwise we lose
1933 * all the shift state (seen bits, number of bits seen, high
1934 * surrogate). */
1935
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001936PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001937 Py_ssize_t size,
1938 const char *errors,
1939 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001940{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001941 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001942 Py_ssize_t startinpos;
1943 Py_ssize_t endinpos;
1944 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001945 const char *e;
1946 PyUnicodeObject *unicode;
1947 Py_UNICODE *p;
1948 const char *errmsg = "";
1949 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001950 Py_UNICODE *shiftOutStart;
1951 unsigned int base64bits = 0;
1952 unsigned long base64buffer = 0;
1953 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001954 PyObject *errorHandler = NULL;
1955 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001956
1957 unicode = _PyUnicode_New(size);
1958 if (!unicode)
1959 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001960 if (size == 0) {
1961 if (consumed)
1962 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001963 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001964 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001965
1966 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001967 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001968 e = s + size;
1969
1970 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001971 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00001972 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00001973 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001974
Antoine Pitrou244651a2009-05-04 18:56:13 +00001975 if (inShift) { /* in a base-64 section */
1976 if (IS_BASE64(ch)) { /* consume a base-64 character */
1977 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1978 base64bits += 6;
1979 s++;
1980 if (base64bits >= 16) {
1981 /* we have enough bits for a UTF-16 value */
1982 Py_UNICODE outCh = (Py_UNICODE)
1983 (base64buffer >> (base64bits-16));
1984 base64bits -= 16;
1985 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
1986 if (surrogate) {
1987 /* expecting a second surrogate */
1988 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1989#ifdef Py_UNICODE_WIDE
1990 *p++ = (((surrogate & 0x3FF)<<10)
1991 | (outCh & 0x3FF)) + 0x10000;
1992#else
1993 *p++ = surrogate;
1994 *p++ = outCh;
1995#endif
1996 surrogate = 0;
1997 }
1998 else {
1999 surrogate = 0;
2000 errmsg = "second surrogate missing";
2001 goto utf7Error;
2002 }
2003 }
2004 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
2005 /* first surrogate */
2006 surrogate = outCh;
2007 }
2008 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2009 errmsg = "unexpected second surrogate";
2010 goto utf7Error;
2011 }
2012 else {
2013 *p++ = outCh;
2014 }
2015 }
2016 }
2017 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002018 inShift = 0;
2019 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002020 if (surrogate) {
2021 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00002022 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002023 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002024 if (base64bits > 0) { /* left-over bits */
2025 if (base64bits >= 6) {
2026 /* We've seen at least one base-64 character */
2027 errmsg = "partial character in shift sequence";
2028 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002029 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002030 else {
2031 /* Some bits remain; they should be zero */
2032 if (base64buffer != 0) {
2033 errmsg = "non-zero padding bits in shift sequence";
2034 goto utf7Error;
2035 }
2036 }
2037 }
2038 if (ch != '-') {
2039 /* '-' is absorbed; other terminating
2040 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002041 *p++ = ch;
2042 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002043 }
2044 }
2045 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002046 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002047 s++; /* consume '+' */
2048 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002049 s++;
2050 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00002051 }
2052 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002053 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002054 shiftOutStart = p;
2055 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002056 }
2057 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002058 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002059 *p++ = ch;
2060 s++;
2061 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002062 else {
2063 startinpos = s-starts;
2064 s++;
2065 errmsg = "unexpected special character";
2066 goto utf7Error;
2067 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002068 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002069utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002070 outpos = p-PyUnicode_AS_UNICODE(unicode);
2071 endinpos = s-starts;
2072 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00002073 errors, &errorHandler,
2074 "utf7", errmsg,
2075 &starts, &e, &startinpos, &endinpos, &exc, &s,
2076 &unicode, &outpos, &p))
2077 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002078 }
2079
Antoine Pitrou244651a2009-05-04 18:56:13 +00002080 /* end of string */
2081
2082 if (inShift && !consumed) { /* in shift sequence, no more to follow */
2083 /* if we're in an inconsistent state, that's an error */
2084 if (surrogate ||
2085 (base64bits >= 6) ||
2086 (base64bits > 0 && base64buffer != 0)) {
2087 outpos = p-PyUnicode_AS_UNICODE(unicode);
2088 endinpos = size;
2089 if (unicode_decode_call_errorhandler(
2090 errors, &errorHandler,
2091 "utf7", "unterminated shift sequence",
2092 &starts, &e, &startinpos, &endinpos, &exc, &s,
2093 &unicode, &outpos, &p))
2094 goto onError;
2095 if (s < e)
2096 goto restart;
2097 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002098 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002099
2100 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002101 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002102 if (inShift) {
2103 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002104 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002105 }
2106 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002107 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002108 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002109 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002110
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002111 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002112 goto onError;
2113
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002114 Py_XDECREF(errorHandler);
2115 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002116 return (PyObject *)unicode;
2117
Benjamin Peterson29060642009-01-31 22:14:21 +00002118 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002119 Py_XDECREF(errorHandler);
2120 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002121 Py_DECREF(unicode);
2122 return NULL;
2123}
2124
2125
2126PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002127 Py_ssize_t size,
Antoine Pitrou244651a2009-05-04 18:56:13 +00002128 int base64SetO,
2129 int base64WhiteSpace,
Benjamin Peterson29060642009-01-31 22:14:21 +00002130 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002131{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002132 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002133 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002134 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002135 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002136 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002137 unsigned int base64bits = 0;
2138 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002139 char * out;
2140 char * start;
2141
2142 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002143 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002144
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002145 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002146 return PyErr_NoMemory();
2147
Antoine Pitrou244651a2009-05-04 18:56:13 +00002148 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002149 if (v == NULL)
2150 return NULL;
2151
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002152 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002153 for (;i < size; ++i) {
2154 Py_UNICODE ch = s[i];
2155
Antoine Pitrou244651a2009-05-04 18:56:13 +00002156 if (inShift) {
2157 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2158 /* shifting out */
2159 if (base64bits) { /* output remaining bits */
2160 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2161 base64buffer = 0;
2162 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002163 }
2164 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002165 /* Characters not in the BASE64 set implicitly unshift the sequence
2166 so no '-' is required, except if the character is itself a '-' */
2167 if (IS_BASE64(ch) || ch == '-') {
2168 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002169 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002170 *out++ = (char) ch;
2171 }
2172 else {
2173 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002174 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002175 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002176 else { /* not in a shift sequence */
2177 if (ch == '+') {
2178 *out++ = '+';
2179 *out++ = '-';
2180 }
2181 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2182 *out++ = (char) ch;
2183 }
2184 else {
2185 *out++ = '+';
2186 inShift = 1;
2187 goto encode_char;
2188 }
2189 }
2190 continue;
2191encode_char:
2192#ifdef Py_UNICODE_WIDE
2193 if (ch >= 0x10000) {
2194 /* code first surrogate */
2195 base64bits += 16;
2196 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2197 while (base64bits >= 6) {
2198 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2199 base64bits -= 6;
2200 }
2201 /* prepare second surrogate */
2202 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2203 }
2204#endif
2205 base64bits += 16;
2206 base64buffer = (base64buffer << 16) | ch;
2207 while (base64bits >= 6) {
2208 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2209 base64bits -= 6;
2210 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002211 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002212 if (base64bits)
2213 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2214 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002215 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002216 if (_PyBytes_Resize(&v, out - start) < 0)
2217 return NULL;
2218 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002219}
2220
/* The UTF-7 helper macros are not needed past this point; undefine
   them so the names do not leak into the rest of the file. */
#undef IS_BASE64
#undef FROM_BASE64
#undef TO_BASE64
#undef DECODE_DIRECT
#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002226
Guido van Rossumd57fd912000-03-10 22:53:23 +00002227/* --- UTF-8 Codec -------------------------------------------------------- */
2228
/* Sequence length announced by each possible UTF-8 lead byte, indexed by the
   byte value.  A zero entry marks a byte that is not a legal prefix.
   See RFC 2279 for details. */
static char utf8_code_length[256] = {
    /* 0x00 - 0x7F: one-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xF7: four bytes; 0xF8 - 0xFB: five; 0xFC - 0xFD: six;
       0xFE - 0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
2250
Guido van Rossumd57fd912000-03-10 22:53:23 +00002251PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002252 Py_ssize_t size,
2253 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002254{
Walter Dörwald69652032004-09-07 20:24:22 +00002255 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2256}
2257
Antoine Pitrouab868312009-01-10 15:40:25 +00002258/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2259#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2260
2261/* Mask to quickly check whether a C 'long' contains a
2262 non-ASCII, UTF8-encoded char. */
2263#if (SIZEOF_LONG == 8)
2264# define ASCII_CHAR_MASK 0x8080808080808080L
2265#elif (SIZEOF_LONG == 4)
2266# define ASCII_CHAR_MASK 0x80808080L
2267#else
2268# error C 'long' size should be either 4 or 8!
2269#endif
2270
Walter Dörwald69652032004-09-07 20:24:22 +00002271PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002272 Py_ssize_t size,
2273 const char *errors,
2274 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002275{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002276 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002277 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002278 Py_ssize_t startinpos;
2279 Py_ssize_t endinpos;
2280 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002281 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002282 PyUnicodeObject *unicode;
2283 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002284 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002285 PyObject *errorHandler = NULL;
2286 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002287
2288 /* Note: size will always be longer than the resulting Unicode
2289 character count */
2290 unicode = _PyUnicode_New(size);
2291 if (!unicode)
2292 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002293 if (size == 0) {
2294 if (consumed)
2295 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002296 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002297 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002298
2299 /* Unpack UTF-8 encoded data */
2300 p = unicode->str;
2301 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002302 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002303
2304 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002305 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002306
2307 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002308 /* Fast path for runs of ASCII characters. Given that common UTF-8
2309 input will consist of an overwhelming majority of ASCII
2310 characters, we try to optimize for this case by checking
2311 as many characters as a C 'long' can contain.
2312 First, check if we can do an aligned read, as most CPUs have
2313 a penalty for unaligned reads.
2314 */
2315 if (!((size_t) s & LONG_PTR_MASK)) {
2316 /* Help register allocation */
2317 register const char *_s = s;
2318 register Py_UNICODE *_p = p;
2319 while (_s < aligned_end) {
2320 /* Read a whole long at a time (either 4 or 8 bytes),
2321 and do a fast unrolled copy if it only contains ASCII
2322 characters. */
2323 unsigned long data = *(unsigned long *) _s;
2324 if (data & ASCII_CHAR_MASK)
2325 break;
2326 _p[0] = (unsigned char) _s[0];
2327 _p[1] = (unsigned char) _s[1];
2328 _p[2] = (unsigned char) _s[2];
2329 _p[3] = (unsigned char) _s[3];
2330#if (SIZEOF_LONG == 8)
2331 _p[4] = (unsigned char) _s[4];
2332 _p[5] = (unsigned char) _s[5];
2333 _p[6] = (unsigned char) _s[6];
2334 _p[7] = (unsigned char) _s[7];
2335#endif
2336 _s += SIZEOF_LONG;
2337 _p += SIZEOF_LONG;
2338 }
2339 s = _s;
2340 p = _p;
2341 if (s == e)
2342 break;
2343 ch = (unsigned char)*s;
2344 }
2345 }
2346
2347 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002348 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002349 s++;
2350 continue;
2351 }
2352
2353 n = utf8_code_length[ch];
2354
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002355 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002356 if (consumed)
2357 break;
2358 else {
2359 errmsg = "unexpected end of data";
2360 startinpos = s-starts;
2361 endinpos = size;
2362 goto utf8Error;
2363 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002364 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002365
2366 switch (n) {
2367
2368 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002369 errmsg = "unexpected code byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002370 startinpos = s-starts;
2371 endinpos = startinpos+1;
2372 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002373
2374 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002375 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002376 startinpos = s-starts;
2377 endinpos = startinpos+1;
2378 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002379
2380 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002381 if ((s[1] & 0xc0) != 0x80) {
2382 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002383 startinpos = s-starts;
2384 endinpos = startinpos+2;
2385 goto utf8Error;
2386 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002387 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002388 if (ch < 0x80) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002389 startinpos = s-starts;
2390 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002391 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002392 goto utf8Error;
2393 }
2394 else
2395 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002396 break;
2397
2398 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00002399 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002400 (s[2] & 0xc0) != 0x80) {
2401 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002402 startinpos = s-starts;
2403 endinpos = startinpos+3;
2404 goto utf8Error;
2405 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002406 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002407 if (ch < 0x0800 || (ch >= 0xd800 && ch <= 0xDFFF)) {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002408 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002409 startinpos = s-starts;
2410 endinpos = startinpos+3;
2411 goto utf8Error;
2412 }
2413 else
2414 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002415 break;
2416
2417 case 4:
2418 if ((s[1] & 0xc0) != 0x80 ||
2419 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002420 (s[3] & 0xc0) != 0x80) {
2421 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002422 startinpos = s-starts;
2423 endinpos = startinpos+4;
2424 goto utf8Error;
2425 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002426 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Benjamin Peterson29060642009-01-31 22:14:21 +00002427 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002428 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002429 if ((ch < 0x10000) /* minimum value allowed for 4
Benjamin Peterson29060642009-01-31 22:14:21 +00002430 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002431 || (ch > 0x10ffff)) /* maximum value allowed for
Benjamin Peterson29060642009-01-31 22:14:21 +00002432 UTF-16 */
2433 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002434 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002435 startinpos = s-starts;
2436 endinpos = startinpos+4;
2437 goto utf8Error;
2438 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00002439#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002440 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002441#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002442 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002443
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002444 /* translate from 10000..10FFFF to 0..FFFF */
2445 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002446
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002447 /* high surrogate = top 10 bits added to D800 */
2448 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002449
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002450 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002451 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002452#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002453 break;
2454
2455 default:
2456 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002457 errmsg = "unsupported Unicode code range";
Benjamin Peterson29060642009-01-31 22:14:21 +00002458 startinpos = s-starts;
2459 endinpos = startinpos+n;
2460 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002461 }
2462 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002463 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002464
Benjamin Peterson29060642009-01-31 22:14:21 +00002465 utf8Error:
2466 outpos = p-PyUnicode_AS_UNICODE(unicode);
2467 if (unicode_decode_call_errorhandler(
2468 errors, &errorHandler,
2469 "utf8", errmsg,
2470 &starts, &e, &startinpos, &endinpos, &exc, &s,
2471 &unicode, &outpos, &p))
2472 goto onError;
2473 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002474 }
Walter Dörwald69652032004-09-07 20:24:22 +00002475 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002476 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002477
2478 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002479 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002480 goto onError;
2481
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002482 Py_XDECREF(errorHandler);
2483 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002484 return (PyObject *)unicode;
2485
Benjamin Peterson29060642009-01-31 22:14:21 +00002486 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002487 Py_XDECREF(errorHandler);
2488 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002489 Py_DECREF(unicode);
2490 return NULL;
2491}
2492
Antoine Pitrouab868312009-01-10 15:40:25 +00002493#undef ASCII_CHAR_MASK
2494
2495
Tim Peters602f7402002-04-27 18:03:26 +00002496/* Allocation strategy: if the string is short, convert into a stack buffer
2497 and allocate exactly as much space needed at the end. Else allocate the
2498 maximum possible needed (4 result bytes per Unicode character), and return
2499 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002500*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002501PyObject *
2502PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002503 Py_ssize_t size,
2504 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002505{
Tim Peters602f7402002-04-27 18:03:26 +00002506#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002507
Guido van Rossum98297ee2007-11-06 21:34:58 +00002508 Py_ssize_t i; /* index into s of next input byte */
2509 PyObject *result; /* result string object */
2510 char *p; /* next free byte in output buffer */
2511 Py_ssize_t nallocated; /* number of result bytes allocated */
2512 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002513 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002514 PyObject *errorHandler = NULL;
2515 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002516
Tim Peters602f7402002-04-27 18:03:26 +00002517 assert(s != NULL);
2518 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002519
Tim Peters602f7402002-04-27 18:03:26 +00002520 if (size <= MAX_SHORT_UNICHARS) {
2521 /* Write into the stack buffer; nallocated can't overflow.
2522 * At the end, we'll allocate exactly as much heap space as it
2523 * turns out we need.
2524 */
2525 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002526 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002527 p = stackbuf;
2528 }
2529 else {
2530 /* Overallocate on the heap, and give the excess back at the end. */
2531 nallocated = size * 4;
2532 if (nallocated / 4 != size) /* overflow! */
2533 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002534 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002535 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002536 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002537 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002538 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002539
Tim Peters602f7402002-04-27 18:03:26 +00002540 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002541 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002542
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002543 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002544 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002545 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002546
Guido van Rossumd57fd912000-03-10 22:53:23 +00002547 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002548 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002549 *p++ = (char)(0xc0 | (ch >> 6));
2550 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002551 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002552#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002553 /* Special case: check for high and low surrogate */
2554 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2555 Py_UCS4 ch2 = s[i];
2556 /* Combine the two surrogates to form a UCS4 value */
2557 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2558 i++;
2559
2560 /* Encode UCS4 Unicode ordinals */
2561 *p++ = (char)(0xf0 | (ch >> 18));
2562 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Tim Peters602f7402002-04-27 18:03:26 +00002563 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2564 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002565 } else {
Victor Stinner445a6232010-04-22 20:01:57 +00002566#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002567 Py_ssize_t newpos;
2568 PyObject *rep;
2569 Py_ssize_t repsize, k;
2570 rep = unicode_encode_call_errorhandler
2571 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2572 s, size, &exc, i-1, i, &newpos);
2573 if (!rep)
2574 goto error;
2575
2576 if (PyBytes_Check(rep))
2577 repsize = PyBytes_GET_SIZE(rep);
2578 else
2579 repsize = PyUnicode_GET_SIZE(rep);
2580
2581 if (repsize > 4) {
2582 Py_ssize_t offset;
2583
2584 if (result == NULL)
2585 offset = p - stackbuf;
2586 else
2587 offset = p - PyBytes_AS_STRING(result);
2588
2589 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
2590 /* integer overflow */
2591 PyErr_NoMemory();
2592 goto error;
2593 }
2594 nallocated += repsize - 4;
2595 if (result != NULL) {
2596 if (_PyBytes_Resize(&result, nallocated) < 0)
2597 goto error;
2598 } else {
2599 result = PyBytes_FromStringAndSize(NULL, nallocated);
2600 if (result == NULL)
2601 goto error;
2602 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
2603 }
2604 p = PyBytes_AS_STRING(result) + offset;
2605 }
2606
2607 if (PyBytes_Check(rep)) {
2608 char *prep = PyBytes_AS_STRING(rep);
2609 for(k = repsize; k > 0; k--)
2610 *p++ = *prep++;
2611 } else /* rep is unicode */ {
2612 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
2613 Py_UNICODE c;
2614
2615 for(k=0; k<repsize; k++) {
2616 c = prep[k];
2617 if (0x80 <= c) {
2618 raise_encode_exception(&exc, "utf-8", s, size,
2619 i-1, i, "surrogates not allowed");
2620 goto error;
2621 }
2622 *p++ = (char)prep[k];
2623 }
2624 }
2625 Py_DECREF(rep);
Victor Stinner445a6232010-04-22 20:01:57 +00002626#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002627 }
Victor Stinner445a6232010-04-22 20:01:57 +00002628#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002629 } else if (ch < 0x10000) {
2630 *p++ = (char)(0xe0 | (ch >> 12));
2631 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2632 *p++ = (char)(0x80 | (ch & 0x3f));
2633 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00002634 /* Encode UCS4 Unicode ordinals */
2635 *p++ = (char)(0xf0 | (ch >> 18));
2636 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2637 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2638 *p++ = (char)(0x80 | (ch & 0x3f));
2639 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002640 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002641
Guido van Rossum98297ee2007-11-06 21:34:58 +00002642 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002643 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002644 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002645 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002646 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002647 }
2648 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002649 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00002650 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002651 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002652 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002653 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002654 Py_XDECREF(errorHandler);
2655 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002656 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002657 error:
2658 Py_XDECREF(errorHandler);
2659 Py_XDECREF(exc);
2660 Py_XDECREF(result);
2661 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002662
Tim Peters602f7402002-04-27 18:03:26 +00002663#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002664}
2665
Guido van Rossumd57fd912000-03-10 22:53:23 +00002666PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2667{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002668 if (!PyUnicode_Check(unicode)) {
2669 PyErr_BadArgument();
2670 return NULL;
2671 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002672 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002673 PyUnicode_GET_SIZE(unicode),
2674 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002675}
2676
Walter Dörwald41980ca2007-08-16 21:55:45 +00002677/* --- UTF-32 Codec ------------------------------------------------------- */
2678
2679PyObject *
2680PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002681 Py_ssize_t size,
2682 const char *errors,
2683 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002684{
2685 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2686}
2687
2688PyObject *
2689PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002690 Py_ssize_t size,
2691 const char *errors,
2692 int *byteorder,
2693 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002694{
2695 const char *starts = s;
2696 Py_ssize_t startinpos;
2697 Py_ssize_t endinpos;
2698 Py_ssize_t outpos;
2699 PyUnicodeObject *unicode;
2700 Py_UNICODE *p;
2701#ifndef Py_UNICODE_WIDE
2702 int i, pairs;
2703#else
2704 const int pairs = 0;
2705#endif
2706 const unsigned char *q, *e;
2707 int bo = 0; /* assume native ordering by default */
2708 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002709 /* Offsets from q for retrieving bytes in the right order. */
2710#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2711 int iorder[] = {0, 1, 2, 3};
2712#else
2713 int iorder[] = {3, 2, 1, 0};
2714#endif
2715 PyObject *errorHandler = NULL;
2716 PyObject *exc = NULL;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002717 /* On narrow builds we split characters outside the BMP into two
2718 codepoints => count how much extra space we need. */
2719#ifndef Py_UNICODE_WIDE
2720 for (i = pairs = 0; i < size/4; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002721 if (((Py_UCS4 *)s)[i] >= 0x10000)
2722 pairs++;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002723#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00002724
2725 /* This might be one to much, because of a BOM */
2726 unicode = _PyUnicode_New((size+3)/4+pairs);
2727 if (!unicode)
2728 return NULL;
2729 if (size == 0)
2730 return (PyObject *)unicode;
2731
2732 /* Unpack UTF-32 encoded data */
2733 p = unicode->str;
2734 q = (unsigned char *)s;
2735 e = q + size;
2736
2737 if (byteorder)
2738 bo = *byteorder;
2739
2740 /* Check for BOM marks (U+FEFF) in the input and adjust current
2741 byte order setting accordingly. In native mode, the leading BOM
2742 mark is skipped, in all other modes, it is copied to the output
2743 stream as-is (giving a ZWNBSP character). */
2744 if (bo == 0) {
2745 if (size >= 4) {
2746 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00002747 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002748#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002749 if (bom == 0x0000FEFF) {
2750 q += 4;
2751 bo = -1;
2752 }
2753 else if (bom == 0xFFFE0000) {
2754 q += 4;
2755 bo = 1;
2756 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002757#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002758 if (bom == 0x0000FEFF) {
2759 q += 4;
2760 bo = 1;
2761 }
2762 else if (bom == 0xFFFE0000) {
2763 q += 4;
2764 bo = -1;
2765 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002766#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002767 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002768 }
2769
2770 if (bo == -1) {
2771 /* force LE */
2772 iorder[0] = 0;
2773 iorder[1] = 1;
2774 iorder[2] = 2;
2775 iorder[3] = 3;
2776 }
2777 else if (bo == 1) {
2778 /* force BE */
2779 iorder[0] = 3;
2780 iorder[1] = 2;
2781 iorder[2] = 1;
2782 iorder[3] = 0;
2783 }
2784
2785 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002786 Py_UCS4 ch;
2787 /* remaining bytes at the end? (size should be divisible by 4) */
2788 if (e-q<4) {
2789 if (consumed)
2790 break;
2791 errmsg = "truncated data";
2792 startinpos = ((const char *)q)-starts;
2793 endinpos = ((const char *)e)-starts;
2794 goto utf32Error;
2795 /* The remaining input chars are ignored if the callback
2796 chooses to skip the input */
2797 }
2798 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2799 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002800
Benjamin Peterson29060642009-01-31 22:14:21 +00002801 if (ch >= 0x110000)
2802 {
2803 errmsg = "codepoint not in range(0x110000)";
2804 startinpos = ((const char *)q)-starts;
2805 endinpos = startinpos+4;
2806 goto utf32Error;
2807 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002808#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002809 if (ch >= 0x10000)
2810 {
2811 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2812 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2813 }
2814 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00002815#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002816 *p++ = ch;
2817 q += 4;
2818 continue;
2819 utf32Error:
2820 outpos = p-PyUnicode_AS_UNICODE(unicode);
2821 if (unicode_decode_call_errorhandler(
2822 errors, &errorHandler,
2823 "utf32", errmsg,
2824 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2825 &unicode, &outpos, &p))
2826 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002827 }
2828
2829 if (byteorder)
2830 *byteorder = bo;
2831
2832 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002833 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002834
2835 /* Adjust length */
2836 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2837 goto onError;
2838
2839 Py_XDECREF(errorHandler);
2840 Py_XDECREF(exc);
2841 return (PyObject *)unicode;
2842
Benjamin Peterson29060642009-01-31 22:14:21 +00002843 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00002844 Py_DECREF(unicode);
2845 Py_XDECREF(errorHandler);
2846 Py_XDECREF(exc);
2847 return NULL;
2848}
2849
2850PyObject *
2851PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002852 Py_ssize_t size,
2853 const char *errors,
2854 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002855{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002856 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002857 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002858 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002859#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002860 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002861#else
2862 const int pairs = 0;
2863#endif
2864 /* Offsets from p for storing byte pairs in the right order. */
2865#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2866 int iorder[] = {0, 1, 2, 3};
2867#else
2868 int iorder[] = {3, 2, 1, 0};
2869#endif
2870
Benjamin Peterson29060642009-01-31 22:14:21 +00002871#define STORECHAR(CH) \
2872 do { \
2873 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2874 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2875 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2876 p[iorder[0]] = (CH) & 0xff; \
2877 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00002878 } while(0)
2879
2880 /* In narrow builds we can output surrogate pairs as one codepoint,
2881 so we need less space. */
2882#ifndef Py_UNICODE_WIDE
2883 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002884 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2885 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2886 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002887#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002888 nsize = (size - pairs + (byteorder == 0));
2889 bytesize = nsize * 4;
2890 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00002891 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002892 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002893 if (v == NULL)
2894 return NULL;
2895
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002896 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002897 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002898 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002899 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002900 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002901
2902 if (byteorder == -1) {
2903 /* force LE */
2904 iorder[0] = 0;
2905 iorder[1] = 1;
2906 iorder[2] = 2;
2907 iorder[3] = 3;
2908 }
2909 else if (byteorder == 1) {
2910 /* force BE */
2911 iorder[0] = 3;
2912 iorder[1] = 2;
2913 iorder[2] = 1;
2914 iorder[3] = 0;
2915 }
2916
2917 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002918 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002919#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002920 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2921 Py_UCS4 ch2 = *s;
2922 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2923 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2924 s++;
2925 size--;
2926 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002927 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002928#endif
2929 STORECHAR(ch);
2930 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002931
2932 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002933 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002934#undef STORECHAR
2935}
2936
2937PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2938{
2939 if (!PyUnicode_Check(unicode)) {
2940 PyErr_BadArgument();
2941 return NULL;
2942 }
2943 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002944 PyUnicode_GET_SIZE(unicode),
2945 NULL,
2946 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002947}
2948
Guido van Rossumd57fd912000-03-10 22:53:23 +00002949/* --- UTF-16 Codec ------------------------------------------------------- */
2950
Tim Peters772747b2001-08-09 22:21:55 +00002951PyObject *
2952PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002953 Py_ssize_t size,
2954 const char *errors,
2955 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002956{
Walter Dörwald69652032004-09-07 20:24:22 +00002957 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2958}
2959
Antoine Pitrouab868312009-01-10 15:40:25 +00002960/* Two masks for fast checking of whether a C 'long' may contain
2961 UTF16-encoded surrogate characters. This is an efficient heuristic,
2962 assuming that non-surrogate characters with a code point >= 0x8000 are
2963 rare in most input.
2964 FAST_CHAR_MASK is used when the input is in native byte ordering,
2965 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
   Both masks select one bit in every 16-bit unit of the long: bit 15
   (0x8000) for FAST_CHAR_MASK and bit 7 (0x0080) for
   SWAPPED_FAST_CHAR_MASK. Only 4-byte and 8-byte longs are supported;
   any other SIZEOF_LONG stops the build with #error.
Benjamin Peterson29060642009-01-31 22:14:21 +00002966*/
Antoine Pitrouab868312009-01-10 15:40:25 +00002967#if (SIZEOF_LONG == 8)
2968# define FAST_CHAR_MASK 0x8000800080008000L
2969# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
2970#elif (SIZEOF_LONG == 4)
2971# define FAST_CHAR_MASK 0x80008000L
2972# define SWAPPED_FAST_CHAR_MASK 0x00800080L
2973#else
2974# error C 'long' size should be either 4 or 8!
2975#endif
2976
Walter Dörwald69652032004-09-07 20:24:22 +00002977PyObject *
2978PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002979 Py_ssize_t size,
2980 const char *errors,
2981 int *byteorder,
2982 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002983{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002984 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002985 Py_ssize_t startinpos;
2986 Py_ssize_t endinpos;
2987 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002988 PyUnicodeObject *unicode;
2989 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00002990 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00002991 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00002992 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002993 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002994 /* Offsets from q for retrieving byte pairs in the right order. */
2995#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2996 int ihi = 1, ilo = 0;
2997#else
2998 int ihi = 0, ilo = 1;
2999#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003000 PyObject *errorHandler = NULL;
3001 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003002
3003 /* Note: size will always be longer than the resulting Unicode
3004 character count */
3005 unicode = _PyUnicode_New(size);
3006 if (!unicode)
3007 return NULL;
3008 if (size == 0)
3009 return (PyObject *)unicode;
3010
3011 /* Unpack UTF-16 encoded data */
3012 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00003013 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00003014 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003015
3016 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00003017 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003018
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003019 /* Check for BOM marks (U+FEFF) in the input and adjust current
3020 byte order setting accordingly. In native mode, the leading BOM
3021 mark is skipped, in all other modes, it is copied to the output
3022 stream as-is (giving a ZWNBSP character). */
3023 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00003024 if (size >= 2) {
3025 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003026#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003027 if (bom == 0xFEFF) {
3028 q += 2;
3029 bo = -1;
3030 }
3031 else if (bom == 0xFFFE) {
3032 q += 2;
3033 bo = 1;
3034 }
Tim Petersced69f82003-09-16 20:30:58 +00003035#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003036 if (bom == 0xFEFF) {
3037 q += 2;
3038 bo = 1;
3039 }
3040 else if (bom == 0xFFFE) {
3041 q += 2;
3042 bo = -1;
3043 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003044#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003045 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003046 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003047
Tim Peters772747b2001-08-09 22:21:55 +00003048 if (bo == -1) {
3049 /* force LE */
3050 ihi = 1;
3051 ilo = 0;
3052 }
3053 else if (bo == 1) {
3054 /* force BE */
3055 ihi = 0;
3056 ilo = 1;
3057 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003058#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3059 native_ordering = ilo < ihi;
3060#else
3061 native_ordering = ilo > ihi;
3062#endif
Tim Peters772747b2001-08-09 22:21:55 +00003063
Antoine Pitrouab868312009-01-10 15:40:25 +00003064 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00003065 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003066 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00003067 /* First check for possible aligned read of a C 'long'. Unaligned
3068 reads are more expensive, better to defer to another iteration. */
3069 if (!((size_t) q & LONG_PTR_MASK)) {
3070 /* Fast path for runs of non-surrogate chars. */
3071 register const unsigned char *_q = q;
3072 Py_UNICODE *_p = p;
3073 if (native_ordering) {
3074 /* Native ordering is simple: as long as the input cannot
3075 possibly contain a surrogate char, do an unrolled copy
3076 of several 16-bit code points to the target object.
3077 The non-surrogate check is done on several input bytes
3078 at a time (as many as a C 'long' can contain). */
3079 while (_q < aligned_end) {
3080 unsigned long data = * (unsigned long *) _q;
3081 if (data & FAST_CHAR_MASK)
3082 break;
3083 _p[0] = ((unsigned short *) _q)[0];
3084 _p[1] = ((unsigned short *) _q)[1];
3085#if (SIZEOF_LONG == 8)
3086 _p[2] = ((unsigned short *) _q)[2];
3087 _p[3] = ((unsigned short *) _q)[3];
3088#endif
3089 _q += SIZEOF_LONG;
3090 _p += SIZEOF_LONG / 2;
3091 }
3092 }
3093 else {
3094 /* Byteswapped ordering is similar, but we must decompose
3095 the copy bytewise, and take care of zero'ing out the
3096 upper bytes if the target object is in 32-bit units
3097 (that is, in UCS-4 builds). */
3098 while (_q < aligned_end) {
3099 unsigned long data = * (unsigned long *) _q;
3100 if (data & SWAPPED_FAST_CHAR_MASK)
3101 break;
3102 /* Zero upper bytes in UCS-4 builds */
3103#if (Py_UNICODE_SIZE > 2)
3104 _p[0] = 0;
3105 _p[1] = 0;
3106#if (SIZEOF_LONG == 8)
3107 _p[2] = 0;
3108 _p[3] = 0;
3109#endif
3110#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003111 /* Issue #4916; UCS-4 builds on big endian machines must
3112 fill the two last bytes of each 4-byte unit. */
3113#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3114# define OFF 2
3115#else
3116# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00003117#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003118 ((unsigned char *) _p)[OFF + 1] = _q[0];
3119 ((unsigned char *) _p)[OFF + 0] = _q[1];
3120 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3121 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3122#if (SIZEOF_LONG == 8)
3123 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3124 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3125 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3126 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3127#endif
3128#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003129 _q += SIZEOF_LONG;
3130 _p += SIZEOF_LONG / 2;
3131 }
3132 }
3133 p = _p;
3134 q = _q;
3135 if (q >= e)
3136 break;
3137 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003138 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003139
Benjamin Peterson14339b62009-01-31 16:36:08 +00003140 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003141
3142 if (ch < 0xD800 || ch > 0xDFFF) {
3143 *p++ = ch;
3144 continue;
3145 }
3146
3147 /* UTF-16 code pair: */
3148 if (q > e) {
3149 errmsg = "unexpected end of data";
3150 startinpos = (((const char *)q) - 2) - starts;
3151 endinpos = ((const char *)e) + 1 - starts;
3152 goto utf16Error;
3153 }
3154 if (0xD800 <= ch && ch <= 0xDBFF) {
3155 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3156 q += 2;
3157 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003158#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003159 *p++ = ch;
3160 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003161#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003162 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003163#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003164 continue;
3165 }
3166 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003167 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003168 startinpos = (((const char *)q)-4)-starts;
3169 endinpos = startinpos+2;
3170 goto utf16Error;
3171 }
3172
Benjamin Peterson14339b62009-01-31 16:36:08 +00003173 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003174 errmsg = "illegal encoding";
3175 startinpos = (((const char *)q)-2)-starts;
3176 endinpos = startinpos+2;
3177 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003178
Benjamin Peterson29060642009-01-31 22:14:21 +00003179 utf16Error:
3180 outpos = p - PyUnicode_AS_UNICODE(unicode);
3181 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003182 errors,
3183 &errorHandler,
3184 "utf16", errmsg,
3185 &starts,
3186 (const char **)&e,
3187 &startinpos,
3188 &endinpos,
3189 &exc,
3190 (const char **)&q,
3191 &unicode,
3192 &outpos,
3193 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003194 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003195 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003196 /* remaining byte at the end? (size should be even) */
3197 if (e == q) {
3198 if (!consumed) {
3199 errmsg = "truncated data";
3200 startinpos = ((const char *)q) - starts;
3201 endinpos = ((const char *)e) + 1 - starts;
3202 outpos = p - PyUnicode_AS_UNICODE(unicode);
3203 if (unicode_decode_call_errorhandler(
3204 errors,
3205 &errorHandler,
3206 "utf16", errmsg,
3207 &starts,
3208 (const char **)&e,
3209 &startinpos,
3210 &endinpos,
3211 &exc,
3212 (const char **)&q,
3213 &unicode,
3214 &outpos,
3215 &p))
3216 goto onError;
3217 /* The remaining input chars are ignored if the callback
3218 chooses to skip the input */
3219 }
3220 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003221
3222 if (byteorder)
3223 *byteorder = bo;
3224
Walter Dörwald69652032004-09-07 20:24:22 +00003225 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003226 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003227
Guido van Rossumd57fd912000-03-10 22:53:23 +00003228 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003229 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003230 goto onError;
3231
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003232 Py_XDECREF(errorHandler);
3233 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003234 return (PyObject *)unicode;
3235
Benjamin Peterson29060642009-01-31 22:14:21 +00003236 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003237 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003238 Py_XDECREF(errorHandler);
3239 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003240 return NULL;
3241}
3242
Antoine Pitrouab868312009-01-10 15:40:25 +00003243#undef FAST_CHAR_MASK
3244#undef SWAPPED_FAST_CHAR_MASK
3245
Tim Peters772747b2001-08-09 22:21:55 +00003246PyObject *
3247PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003248 Py_ssize_t size,
3249 const char *errors,
3250 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003251{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003252 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003253 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003254 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003255#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003256 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003257#else
3258 const int pairs = 0;
3259#endif
Tim Peters772747b2001-08-09 22:21:55 +00003260 /* Offsets from p for storing byte pairs in the right order. */
3261#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3262 int ihi = 1, ilo = 0;
3263#else
3264 int ihi = 0, ilo = 1;
3265#endif
3266
Benjamin Peterson29060642009-01-31 22:14:21 +00003267#define STORECHAR(CH) \
3268 do { \
3269 p[ihi] = ((CH) >> 8) & 0xff; \
3270 p[ilo] = (CH) & 0xff; \
3271 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003272 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003273
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003274#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003275 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003276 if (s[i] >= 0x10000)
3277 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003278#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003279 /* 2 * (size + pairs + (byteorder == 0)) */
3280 if (size > PY_SSIZE_T_MAX ||
3281 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003282 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003283 nsize = size + pairs + (byteorder == 0);
3284 bytesize = nsize * 2;
3285 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003286 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003287 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003288 if (v == NULL)
3289 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003290
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003291 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003292 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003293 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003294 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003295 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003296
3297 if (byteorder == -1) {
3298 /* force LE */
3299 ihi = 1;
3300 ilo = 0;
3301 }
3302 else if (byteorder == 1) {
3303 /* force BE */
3304 ihi = 0;
3305 ilo = 1;
3306 }
3307
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003308 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003309 Py_UNICODE ch = *s++;
3310 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003311#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003312 if (ch >= 0x10000) {
3313 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3314 ch = 0xD800 | ((ch-0x10000) >> 10);
3315 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003316#endif
Tim Peters772747b2001-08-09 22:21:55 +00003317 STORECHAR(ch);
3318 if (ch2)
3319 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003320 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003321
3322 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003323 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003324#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003325}
3326
3327PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3328{
3329 if (!PyUnicode_Check(unicode)) {
3330 PyErr_BadArgument();
3331 return NULL;
3332 }
3333 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003334 PyUnicode_GET_SIZE(unicode),
3335 NULL,
3336 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003337}
3338
3339/* --- Unicode Escape Codec ----------------------------------------------- */
3340
Fredrik Lundh06d12682001-01-24 07:59:11 +00003341static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003342
Guido van Rossumd57fd912000-03-10 22:53:23 +00003343PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003344 Py_ssize_t size,
3345 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003346{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003347 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003348 Py_ssize_t startinpos;
3349 Py_ssize_t endinpos;
3350 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003351 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003352 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003353 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003354 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003355 char* message;
3356 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003357 PyObject *errorHandler = NULL;
3358 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003359
Guido van Rossumd57fd912000-03-10 22:53:23 +00003360 /* Escaped strings will always be longer than the resulting
3361 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003362 length after conversion to the true value.
3363 (but if the error callback returns a long replacement string
3364 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003365 v = _PyUnicode_New(size);
3366 if (v == NULL)
3367 goto onError;
3368 if (size == 0)
3369 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003370
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003371 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003372 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003373
Guido van Rossumd57fd912000-03-10 22:53:23 +00003374 while (s < end) {
3375 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003376 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003377 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003378
3379 /* Non-escape characters are interpreted as Unicode ordinals */
3380 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003381 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003382 continue;
3383 }
3384
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003385 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003386 /* \ - Escapes */
3387 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003388 c = *s++;
3389 if (s > end)
3390 c = '\0'; /* Invalid after \ */
3391 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003392
Benjamin Peterson29060642009-01-31 22:14:21 +00003393 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003394 case '\n': break;
3395 case '\\': *p++ = '\\'; break;
3396 case '\'': *p++ = '\''; break;
3397 case '\"': *p++ = '\"'; break;
3398 case 'b': *p++ = '\b'; break;
3399 case 'f': *p++ = '\014'; break; /* FF */
3400 case 't': *p++ = '\t'; break;
3401 case 'n': *p++ = '\n'; break;
3402 case 'r': *p++ = '\r'; break;
3403 case 'v': *p++ = '\013'; break; /* VT */
3404 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3405
Benjamin Peterson29060642009-01-31 22:14:21 +00003406 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003407 case '0': case '1': case '2': case '3':
3408 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003409 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003410 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003411 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003412 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003413 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003414 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003415 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003416 break;
3417
Benjamin Peterson29060642009-01-31 22:14:21 +00003418 /* hex escapes */
3419 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003420 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003421 digits = 2;
3422 message = "truncated \\xXX escape";
3423 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003424
Benjamin Peterson29060642009-01-31 22:14:21 +00003425 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003426 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003427 digits = 4;
3428 message = "truncated \\uXXXX escape";
3429 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003430
Benjamin Peterson29060642009-01-31 22:14:21 +00003431 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003432 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003433 digits = 8;
3434 message = "truncated \\UXXXXXXXX escape";
3435 hexescape:
3436 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003437 outpos = p-PyUnicode_AS_UNICODE(v);
3438 if (s+digits>end) {
3439 endinpos = size;
3440 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003441 errors, &errorHandler,
3442 "unicodeescape", "end of string in escape sequence",
3443 &starts, &end, &startinpos, &endinpos, &exc, &s,
3444 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003445 goto onError;
3446 goto nextByte;
3447 }
3448 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003449 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003450 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003451 endinpos = (s+i+1)-starts;
3452 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003453 errors, &errorHandler,
3454 "unicodeescape", message,
3455 &starts, &end, &startinpos, &endinpos, &exc, &s,
3456 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003457 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003458 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003459 }
3460 chr = (chr<<4) & ~0xF;
3461 if (c >= '0' && c <= '9')
3462 chr += c - '0';
3463 else if (c >= 'a' && c <= 'f')
3464 chr += 10 + c - 'a';
3465 else
3466 chr += 10 + c - 'A';
3467 }
3468 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003469 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003470 /* _decoding_error will have already written into the
3471 target buffer. */
3472 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003473 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003474 /* when we get here, chr is a 32-bit unicode character */
3475 if (chr <= 0xffff)
3476 /* UCS-2 character */
3477 *p++ = (Py_UNICODE) chr;
3478 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003479 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003480 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003481#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003482 *p++ = chr;
3483#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003484 chr -= 0x10000L;
3485 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003486 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003487#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003488 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003489 endinpos = s-starts;
3490 outpos = p-PyUnicode_AS_UNICODE(v);
3491 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003492 errors, &errorHandler,
3493 "unicodeescape", "illegal Unicode character",
3494 &starts, &end, &startinpos, &endinpos, &exc, &s,
3495 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003496 goto onError;
3497 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003498 break;
3499
Benjamin Peterson29060642009-01-31 22:14:21 +00003500 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003501 case 'N':
3502 message = "malformed \\N character escape";
3503 if (ucnhash_CAPI == NULL) {
3504 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003505 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003506 if (ucnhash_CAPI == NULL)
3507 goto ucnhashError;
3508 }
3509 if (*s == '{') {
3510 const char *start = s+1;
3511 /* look for the closing brace */
3512 while (*s != '}' && s < end)
3513 s++;
3514 if (s > start && s < end && *s == '}') {
3515 /* found a name. look it up in the unicode database */
3516 message = "unknown Unicode character name";
3517 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003518 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003519 goto store;
3520 }
3521 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003522 endinpos = s-starts;
3523 outpos = p-PyUnicode_AS_UNICODE(v);
3524 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003525 errors, &errorHandler,
3526 "unicodeescape", message,
3527 &starts, &end, &startinpos, &endinpos, &exc, &s,
3528 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003529 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003530 break;
3531
3532 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003533 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003534 message = "\\ at end of string";
3535 s--;
3536 endinpos = s-starts;
3537 outpos = p-PyUnicode_AS_UNICODE(v);
3538 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003539 errors, &errorHandler,
3540 "unicodeescape", message,
3541 &starts, &end, &startinpos, &endinpos, &exc, &s,
3542 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003543 goto onError;
3544 }
3545 else {
3546 *p++ = '\\';
3547 *p++ = (unsigned char)s[-1];
3548 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003549 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003550 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003551 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003552 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003553 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003554 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003555 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003556 Py_XDECREF(errorHandler);
3557 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003558 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003559
Benjamin Peterson29060642009-01-31 22:14:21 +00003560 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003561 PyErr_SetString(
3562 PyExc_UnicodeError,
3563 "\\N escapes not supported (can't load unicodedata module)"
3564 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003565 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003566 Py_XDECREF(errorHandler);
3567 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003568 return NULL;
3569
Benjamin Peterson29060642009-01-31 22:14:21 +00003570 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003571 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003572 Py_XDECREF(errorHandler);
3573 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003574 return NULL;
3575}
3576
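/* Return a Unicode-Escape string version of the Unicode object.

   If quotes is true, the string is enclosed in u"" or u'' quotes as
   appropriate.

   NOTE: this comment describes PyUnicode_EncodeUnicodeEscape(), defined
   further below, not the findchar() helper that immediately follows.  That
   encoder takes no `quotes` argument in its current signature, so the
   sentence about quoting is historical.

*/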
3583
Thomas Wouters477c8d52006-05-27 19:21:47 +00003584Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003585 Py_ssize_t size,
3586 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00003587{
3588 /* like wcschr, but doesn't stop at NULL characters */
3589
3590 while (size-- > 0) {
3591 if (*s == ch)
3592 return s;
3593 s++;
3594 }
3595
3596 return NULL;
3597}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003598
Walter Dörwald79e913e2007-05-12 11:08:06 +00003599static const char *hexdigits = "0123456789abcdef";
3600
3601PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003602 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003603{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003604 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003605 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003606
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003607#ifdef Py_UNICODE_WIDE
3608 const Py_ssize_t expandsize = 10;
3609#else
3610 const Py_ssize_t expandsize = 6;
3611#endif
3612
Thomas Wouters89f507f2006-12-13 04:49:30 +00003613 /* XXX(nnorwitz): rather than over-allocating, it would be
3614 better to choose a different scheme. Perhaps scan the
3615 first N-chars of the string and allocate based on that size.
3616 */
3617 /* Initial allocation is based on the longest-possible unichr
3618 escape.
3619
3620 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3621 unichr, so in this case it's the longest unichr escape. In
3622 narrow (UTF-16) builds this is five chars per source unichr
3623 since there are two unichrs in the surrogate pair, so in narrow
3624 (UTF-16) builds it's not the longest unichr escape.
3625
3626 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3627 so in the narrow (UTF-16) build case it's the longest unichr
3628 escape.
3629 */
3630
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003631 if (size == 0)
3632 return PyBytes_FromStringAndSize(NULL, 0);
3633
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003634 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003635 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003636
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003637 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00003638 2
3639 + expandsize*size
3640 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003641 if (repr == NULL)
3642 return NULL;
3643
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003644 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003645
Guido van Rossumd57fd912000-03-10 22:53:23 +00003646 while (size-- > 0) {
3647 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003648
Walter Dörwald79e913e2007-05-12 11:08:06 +00003649 /* Escape backslashes */
3650 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003651 *p++ = '\\';
3652 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003653 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003654 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003655
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003656#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003657 /* Map 21-bit characters to '\U00xxxxxx' */
3658 else if (ch >= 0x10000) {
3659 *p++ = '\\';
3660 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003661 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3662 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3663 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3664 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3665 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3666 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3667 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3668 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00003669 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003670 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003671#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003672 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3673 else if (ch >= 0xD800 && ch < 0xDC00) {
3674 Py_UNICODE ch2;
3675 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003676
Benjamin Peterson29060642009-01-31 22:14:21 +00003677 ch2 = *s++;
3678 size--;
3679 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3680 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3681 *p++ = '\\';
3682 *p++ = 'U';
3683 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3684 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3685 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3686 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3687 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3688 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3689 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3690 *p++ = hexdigits[ucs & 0x0000000F];
3691 continue;
3692 }
3693 /* Fall through: isolated surrogates are copied as-is */
3694 s--;
3695 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003696 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003697#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003698
Guido van Rossumd57fd912000-03-10 22:53:23 +00003699 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003700 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003701 *p++ = '\\';
3702 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003703 *p++ = hexdigits[(ch >> 12) & 0x000F];
3704 *p++ = hexdigits[(ch >> 8) & 0x000F];
3705 *p++ = hexdigits[(ch >> 4) & 0x000F];
3706 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003707 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003708
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003709 /* Map special whitespace to '\t', \n', '\r' */
3710 else if (ch == '\t') {
3711 *p++ = '\\';
3712 *p++ = 't';
3713 }
3714 else if (ch == '\n') {
3715 *p++ = '\\';
3716 *p++ = 'n';
3717 }
3718 else if (ch == '\r') {
3719 *p++ = '\\';
3720 *p++ = 'r';
3721 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003722
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003723 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003724 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003725 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003726 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003727 *p++ = hexdigits[(ch >> 4) & 0x000F];
3728 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003729 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003730
Guido van Rossumd57fd912000-03-10 22:53:23 +00003731 /* Copy everything else as-is */
3732 else
3733 *p++ = (char) ch;
3734 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003735
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003736 assert(p - PyBytes_AS_STRING(repr) > 0);
3737 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
3738 return NULL;
3739 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003740}
3741
Alexandre Vassalotti2056bed2008-12-27 19:46:35 +00003742PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003743{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003744 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003745 if (!PyUnicode_Check(unicode)) {
3746 PyErr_BadArgument();
3747 return NULL;
3748 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003749 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3750 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003751 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003752}
3753
/* --- Raw Unicode Escape Codec ------------------------------------------- */
3755
3756PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003757 Py_ssize_t size,
3758 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003759{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003760 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003761 Py_ssize_t startinpos;
3762 Py_ssize_t endinpos;
3763 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003764 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003765 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003766 const char *end;
3767 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003768 PyObject *errorHandler = NULL;
3769 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003770
Guido van Rossumd57fd912000-03-10 22:53:23 +00003771 /* Escaped strings will always be longer than the resulting
3772 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003773 length after conversion to the true value. (But decoding error
3774 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003775 v = _PyUnicode_New(size);
3776 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003777 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003778 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003779 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003780 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003781 end = s + size;
3782 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003783 unsigned char c;
3784 Py_UCS4 x;
3785 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003786 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003787
Benjamin Peterson29060642009-01-31 22:14:21 +00003788 /* Non-escape characters are interpreted as Unicode ordinals */
3789 if (*s != '\\') {
3790 *p++ = (unsigned char)*s++;
3791 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003792 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003793 startinpos = s-starts;
3794
3795 /* \u-escapes are only interpreted iff the number of leading
3796 backslashes if odd */
3797 bs = s;
3798 for (;s < end;) {
3799 if (*s != '\\')
3800 break;
3801 *p++ = (unsigned char)*s++;
3802 }
3803 if (((s - bs) & 1) == 0 ||
3804 s >= end ||
3805 (*s != 'u' && *s != 'U')) {
3806 continue;
3807 }
3808 p--;
3809 count = *s=='u' ? 4 : 8;
3810 s++;
3811
3812 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3813 outpos = p-PyUnicode_AS_UNICODE(v);
3814 for (x = 0, i = 0; i < count; ++i, ++s) {
3815 c = (unsigned char)*s;
3816 if (!ISXDIGIT(c)) {
3817 endinpos = s-starts;
3818 if (unicode_decode_call_errorhandler(
3819 errors, &errorHandler,
3820 "rawunicodeescape", "truncated \\uXXXX",
3821 &starts, &end, &startinpos, &endinpos, &exc, &s,
3822 &v, &outpos, &p))
3823 goto onError;
3824 goto nextByte;
3825 }
3826 x = (x<<4) & ~0xF;
3827 if (c >= '0' && c <= '9')
3828 x += c - '0';
3829 else if (c >= 'a' && c <= 'f')
3830 x += 10 + c - 'a';
3831 else
3832 x += 10 + c - 'A';
3833 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00003834 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00003835 /* UCS-2 character */
3836 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003837 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003838 /* UCS-4 character. Either store directly, or as
3839 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00003840#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003841 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003842#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003843 x -= 0x10000L;
3844 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3845 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00003846#endif
3847 } else {
3848 endinpos = s-starts;
3849 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003850 if (unicode_decode_call_errorhandler(
3851 errors, &errorHandler,
3852 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00003853 &starts, &end, &startinpos, &endinpos, &exc, &s,
3854 &v, &outpos, &p))
3855 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003856 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003857 nextByte:
3858 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003859 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003860 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003861 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003862 Py_XDECREF(errorHandler);
3863 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003864 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003865
Benjamin Peterson29060642009-01-31 22:14:21 +00003866 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003867 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003868 Py_XDECREF(errorHandler);
3869 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003870 return NULL;
3871}
3872
3873PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003874 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003875{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003876 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003877 char *p;
3878 char *q;
3879
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003880#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003881 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003882#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003883 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003884#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003885
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003886 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003887 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00003888
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003889 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003890 if (repr == NULL)
3891 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003892 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003893 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003894
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003895 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003896 while (size-- > 0) {
3897 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003898#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003899 /* Map 32-bit characters to '\Uxxxxxxxx' */
3900 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003901 *p++ = '\\';
3902 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003903 *p++ = hexdigits[(ch >> 28) & 0xf];
3904 *p++ = hexdigits[(ch >> 24) & 0xf];
3905 *p++ = hexdigits[(ch >> 20) & 0xf];
3906 *p++ = hexdigits[(ch >> 16) & 0xf];
3907 *p++ = hexdigits[(ch >> 12) & 0xf];
3908 *p++ = hexdigits[(ch >> 8) & 0xf];
3909 *p++ = hexdigits[(ch >> 4) & 0xf];
3910 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003911 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003912 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00003913#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003914 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3915 if (ch >= 0xD800 && ch < 0xDC00) {
3916 Py_UNICODE ch2;
3917 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003918
Benjamin Peterson29060642009-01-31 22:14:21 +00003919 ch2 = *s++;
3920 size--;
3921 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3922 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3923 *p++ = '\\';
3924 *p++ = 'U';
3925 *p++ = hexdigits[(ucs >> 28) & 0xf];
3926 *p++ = hexdigits[(ucs >> 24) & 0xf];
3927 *p++ = hexdigits[(ucs >> 20) & 0xf];
3928 *p++ = hexdigits[(ucs >> 16) & 0xf];
3929 *p++ = hexdigits[(ucs >> 12) & 0xf];
3930 *p++ = hexdigits[(ucs >> 8) & 0xf];
3931 *p++ = hexdigits[(ucs >> 4) & 0xf];
3932 *p++ = hexdigits[ucs & 0xf];
3933 continue;
3934 }
3935 /* Fall through: isolated surrogates are copied as-is */
3936 s--;
3937 size++;
3938 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003939#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003940 /* Map 16-bit characters to '\uxxxx' */
3941 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003942 *p++ = '\\';
3943 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003944 *p++ = hexdigits[(ch >> 12) & 0xf];
3945 *p++ = hexdigits[(ch >> 8) & 0xf];
3946 *p++ = hexdigits[(ch >> 4) & 0xf];
3947 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003948 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003949 /* Copy everything else as-is */
3950 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003951 *p++ = (char) ch;
3952 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003953 size = p - q;
3954
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003955 assert(size > 0);
3956 if (_PyBytes_Resize(&repr, size) < 0)
3957 return NULL;
3958 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003959}
3960
3961PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3962{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003963 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003964 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003965 PyErr_BadArgument();
3966 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003967 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003968 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3969 PyUnicode_GET_SIZE(unicode));
3970
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003971 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003972}
3973
/* --- Unicode Internal Codec ------------------------------------------- */
3975
3976PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003977 Py_ssize_t size,
3978 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003979{
3980 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003981 Py_ssize_t startinpos;
3982 Py_ssize_t endinpos;
3983 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003984 PyUnicodeObject *v;
3985 Py_UNICODE *p;
3986 const char *end;
3987 const char *reason;
3988 PyObject *errorHandler = NULL;
3989 PyObject *exc = NULL;
3990
Neal Norwitzd43069c2006-01-08 01:12:10 +00003991#ifdef Py_UNICODE_WIDE
3992 Py_UNICODE unimax = PyUnicode_GetMax();
3993#endif
3994
Thomas Wouters89f507f2006-12-13 04:49:30 +00003995 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003996 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3997 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003998 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003999 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004000 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004001 p = PyUnicode_AS_UNICODE(v);
4002 end = s + size;
4003
4004 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004005 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004006 /* We have to sanity check the raw data, otherwise doom looms for
4007 some malformed UCS-4 data. */
4008 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00004009#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004010 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00004011#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004012 end-s < Py_UNICODE_SIZE
4013 )
Benjamin Peterson29060642009-01-31 22:14:21 +00004014 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004015 startinpos = s - starts;
4016 if (end-s < Py_UNICODE_SIZE) {
4017 endinpos = end-starts;
4018 reason = "truncated input";
4019 }
4020 else {
4021 endinpos = s - starts + Py_UNICODE_SIZE;
4022 reason = "illegal code point (> 0x10FFFF)";
4023 }
4024 outpos = p - PyUnicode_AS_UNICODE(v);
4025 if (unicode_decode_call_errorhandler(
4026 errors, &errorHandler,
4027 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00004028 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00004029 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004030 goto onError;
4031 }
4032 }
4033 else {
4034 p++;
4035 s += Py_UNICODE_SIZE;
4036 }
4037 }
4038
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004039 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004040 goto onError;
4041 Py_XDECREF(errorHandler);
4042 Py_XDECREF(exc);
4043 return (PyObject *)v;
4044
Benjamin Peterson29060642009-01-31 22:14:21 +00004045 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004046 Py_XDECREF(v);
4047 Py_XDECREF(errorHandler);
4048 Py_XDECREF(exc);
4049 return NULL;
4050}
4051
/* --- Latin-1 Codec ------------------------------------------------------ */
4053
4054PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004055 Py_ssize_t size,
4056 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004057{
4058 PyUnicodeObject *v;
4059 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004060 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00004061
Guido van Rossumd57fd912000-03-10 22:53:23 +00004062 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004063 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004064 Py_UNICODE r = *(unsigned char*)s;
4065 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004066 }
4067
Guido van Rossumd57fd912000-03-10 22:53:23 +00004068 v = _PyUnicode_New(size);
4069 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004070 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004071 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004072 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004073 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00004074 e = s + size;
4075 /* Unrolling the copy makes it much faster by reducing the looping
4076 overhead. This is similar to what many memcpy() implementations do. */
4077 unrolled_end = e - 4;
4078 while (s < unrolled_end) {
4079 p[0] = (unsigned char) s[0];
4080 p[1] = (unsigned char) s[1];
4081 p[2] = (unsigned char) s[2];
4082 p[3] = (unsigned char) s[3];
4083 s += 4;
4084 p += 4;
4085 }
4086 while (s < e)
4087 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004088 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004089
Benjamin Peterson29060642009-01-31 22:14:21 +00004090 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004091 Py_XDECREF(v);
4092 return NULL;
4093}
4094
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004095/* create or adjust a UnicodeEncodeError */
4096static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004097 const char *encoding,
4098 const Py_UNICODE *unicode, Py_ssize_t size,
4099 Py_ssize_t startpos, Py_ssize_t endpos,
4100 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004101{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004102 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004103 *exceptionObject = PyUnicodeEncodeError_Create(
4104 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004105 }
4106 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004107 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4108 goto onError;
4109 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4110 goto onError;
4111 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4112 goto onError;
4113 return;
4114 onError:
4115 Py_DECREF(*exceptionObject);
4116 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004117 }
4118}
4119
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004120/* raises a UnicodeEncodeError */
4121static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004122 const char *encoding,
4123 const Py_UNICODE *unicode, Py_ssize_t size,
4124 Py_ssize_t startpos, Py_ssize_t endpos,
4125 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004126{
4127 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004128 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004129 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004130 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004131}
4132
4133/* error handling callback helper:
4134 build arguments, call the callback and check the arguments,
4135 put the result into newpos and return the replacement string, which
4136 has to be freed by the caller */
4137static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00004138 PyObject **errorHandler,
4139 const char *encoding, const char *reason,
4140 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4141 Py_ssize_t startpos, Py_ssize_t endpos,
4142 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004143{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004144 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004145
4146 PyObject *restuple;
4147 PyObject *resunicode;
4148
4149 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004150 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004151 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004152 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004153 }
4154
4155 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004156 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004157 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004158 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004159
4160 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004161 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004162 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004163 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004164 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004165 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004166 Py_DECREF(restuple);
4167 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004168 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004169 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004170 &resunicode, newpos)) {
4171 Py_DECREF(restuple);
4172 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004173 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004174 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4175 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4176 Py_DECREF(restuple);
4177 return NULL;
4178 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004179 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004180 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004181 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004182 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4183 Py_DECREF(restuple);
4184 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004185 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004186 Py_INCREF(resunicode);
4187 Py_DECREF(restuple);
4188 return resunicode;
4189}
4190
4191static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004192 Py_ssize_t size,
4193 const char *errors,
4194 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004195{
4196 /* output object */
4197 PyObject *res;
4198 /* pointers to the beginning and end+1 of input */
4199 const Py_UNICODE *startp = p;
4200 const Py_UNICODE *endp = p + size;
4201 /* pointer to the beginning of the unencodable characters */
4202 /* const Py_UNICODE *badp = NULL; */
4203 /* pointer into the output */
4204 char *str;
4205 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004206 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004207 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4208 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004209 PyObject *errorHandler = NULL;
4210 PyObject *exc = NULL;
4211 /* the following variable is used for caching string comparisons
4212 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4213 int known_errorHandler = -1;
4214
4215 /* allocate enough for a simple encoding without
4216 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004217 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004218 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004219 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004220 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004221 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004222 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004223 ressize = size;
4224
4225 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004226 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004227
Benjamin Peterson29060642009-01-31 22:14:21 +00004228 /* can we encode this? */
4229 if (c<limit) {
4230 /* no overflow check, because we know that the space is enough */
4231 *str++ = (char)c;
4232 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004233 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004234 else {
4235 Py_ssize_t unicodepos = p-startp;
4236 Py_ssize_t requiredsize;
4237 PyObject *repunicode;
4238 Py_ssize_t repsize;
4239 Py_ssize_t newpos;
4240 Py_ssize_t respos;
4241 Py_UNICODE *uni2;
4242 /* startpos for collecting unencodable chars */
4243 const Py_UNICODE *collstart = p;
4244 const Py_UNICODE *collend = p;
4245 /* find all unecodable characters */
4246 while ((collend < endp) && ((*collend)>=limit))
4247 ++collend;
4248 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4249 if (known_errorHandler==-1) {
4250 if ((errors==NULL) || (!strcmp(errors, "strict")))
4251 known_errorHandler = 1;
4252 else if (!strcmp(errors, "replace"))
4253 known_errorHandler = 2;
4254 else if (!strcmp(errors, "ignore"))
4255 known_errorHandler = 3;
4256 else if (!strcmp(errors, "xmlcharrefreplace"))
4257 known_errorHandler = 4;
4258 else
4259 known_errorHandler = 0;
4260 }
4261 switch (known_errorHandler) {
4262 case 1: /* strict */
4263 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4264 goto onError;
4265 case 2: /* replace */
4266 while (collstart++<collend)
4267 *str++ = '?'; /* fall through */
4268 case 3: /* ignore */
4269 p = collend;
4270 break;
4271 case 4: /* xmlcharrefreplace */
4272 respos = str - PyBytes_AS_STRING(res);
4273 /* determine replacement size (temporarily (mis)uses p) */
4274 for (p = collstart, repsize = 0; p < collend; ++p) {
4275 if (*p<10)
4276 repsize += 2+1+1;
4277 else if (*p<100)
4278 repsize += 2+2+1;
4279 else if (*p<1000)
4280 repsize += 2+3+1;
4281 else if (*p<10000)
4282 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004283#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004284 else
4285 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004286#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004287 else if (*p<100000)
4288 repsize += 2+5+1;
4289 else if (*p<1000000)
4290 repsize += 2+6+1;
4291 else
4292 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004293#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004294 }
4295 requiredsize = respos+repsize+(endp-collend);
4296 if (requiredsize > ressize) {
4297 if (requiredsize<2*ressize)
4298 requiredsize = 2*ressize;
4299 if (_PyBytes_Resize(&res, requiredsize))
4300 goto onError;
4301 str = PyBytes_AS_STRING(res) + respos;
4302 ressize = requiredsize;
4303 }
4304 /* generate replacement (temporarily (mis)uses p) */
4305 for (p = collstart; p < collend; ++p) {
4306 str += sprintf(str, "&#%d;", (int)*p);
4307 }
4308 p = collend;
4309 break;
4310 default:
4311 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4312 encoding, reason, startp, size, &exc,
4313 collstart-startp, collend-startp, &newpos);
4314 if (repunicode == NULL)
4315 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004316 if (PyBytes_Check(repunicode)) {
4317 /* Directly copy bytes result to output. */
4318 repsize = PyBytes_Size(repunicode);
4319 if (repsize > 1) {
4320 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004321 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004322 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4323 Py_DECREF(repunicode);
4324 goto onError;
4325 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004326 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004327 ressize += repsize-1;
4328 }
4329 memcpy(str, PyBytes_AsString(repunicode), repsize);
4330 str += repsize;
4331 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004332 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004333 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004334 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004335 /* need more space? (at least enough for what we
4336 have+the replacement+the rest of the string, so
4337 we won't have to check space for encodable characters) */
4338 respos = str - PyBytes_AS_STRING(res);
4339 repsize = PyUnicode_GET_SIZE(repunicode);
4340 requiredsize = respos+repsize+(endp-collend);
4341 if (requiredsize > ressize) {
4342 if (requiredsize<2*ressize)
4343 requiredsize = 2*ressize;
4344 if (_PyBytes_Resize(&res, requiredsize)) {
4345 Py_DECREF(repunicode);
4346 goto onError;
4347 }
4348 str = PyBytes_AS_STRING(res) + respos;
4349 ressize = requiredsize;
4350 }
4351 /* check if there is anything unencodable in the replacement
4352 and copy it to the output */
4353 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4354 c = *uni2;
4355 if (c >= limit) {
4356 raise_encode_exception(&exc, encoding, startp, size,
4357 unicodepos, unicodepos+1, reason);
4358 Py_DECREF(repunicode);
4359 goto onError;
4360 }
4361 *str = (char)c;
4362 }
4363 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004364 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004365 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004366 }
4367 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004368 /* Resize if we allocated to much */
4369 size = str - PyBytes_AS_STRING(res);
4370 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004371 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004372 if (_PyBytes_Resize(&res, size) < 0)
4373 goto onError;
4374 }
4375
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004376 Py_XDECREF(errorHandler);
4377 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004378 return res;
4379
4380 onError:
4381 Py_XDECREF(res);
4382 Py_XDECREF(errorHandler);
4383 Py_XDECREF(exc);
4384 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004385}
4386
Guido van Rossumd57fd912000-03-10 22:53:23 +00004387PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004388 Py_ssize_t size,
4389 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004390{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004391 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004392}
4393
4394PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4395{
4396 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004397 PyErr_BadArgument();
4398 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004399 }
4400 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004401 PyUnicode_GET_SIZE(unicode),
4402 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004403}
4404
/* --- 7-bit ASCII Codec -------------------------------------------------- */
4406
Guido van Rossumd57fd912000-03-10 22:53:23 +00004407PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004408 Py_ssize_t size,
4409 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004410{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004411 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004412 PyUnicodeObject *v;
4413 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004414 Py_ssize_t startinpos;
4415 Py_ssize_t endinpos;
4416 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004417 const char *e;
4418 PyObject *errorHandler = NULL;
4419 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004420
Guido van Rossumd57fd912000-03-10 22:53:23 +00004421 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004422 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004423 Py_UNICODE r = *(unsigned char*)s;
4424 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004425 }
Tim Petersced69f82003-09-16 20:30:58 +00004426
Guido van Rossumd57fd912000-03-10 22:53:23 +00004427 v = _PyUnicode_New(size);
4428 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004429 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004430 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004431 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004432 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004433 e = s + size;
4434 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004435 register unsigned char c = (unsigned char)*s;
4436 if (c < 128) {
4437 *p++ = c;
4438 ++s;
4439 }
4440 else {
4441 startinpos = s-starts;
4442 endinpos = startinpos + 1;
4443 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4444 if (unicode_decode_call_errorhandler(
4445 errors, &errorHandler,
4446 "ascii", "ordinal not in range(128)",
4447 &starts, &e, &startinpos, &endinpos, &exc, &s,
4448 &v, &outpos, &p))
4449 goto onError;
4450 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004451 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004452 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004453 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4454 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004455 Py_XDECREF(errorHandler);
4456 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004457 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004458
Benjamin Peterson29060642009-01-31 22:14:21 +00004459 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004460 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004461 Py_XDECREF(errorHandler);
4462 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004463 return NULL;
4464}
4465
Guido van Rossumd57fd912000-03-10 22:53:23 +00004466PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004467 Py_ssize_t size,
4468 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004469{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004470 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004471}
4472
4473PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4474{
4475 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004476 PyErr_BadArgument();
4477 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004478 }
4479 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004480 PyUnicode_GET_SIZE(unicode),
4481 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004482}
4483
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004484#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004485
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004486/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004487
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004488#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004489#define NEED_RETRY
4490#endif
4491
4492/* XXX This code is limited to "true" double-byte encodings, as
4493 a) it assumes an incomplete character consists of a single byte, and
4494 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004495 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004496
/* Return non-zero if the byte at s[offset] should be treated as a DBCS
   lead byte, 0 otherwise.

   IsDBCSLeadByte() only looks at the byte value, so the character that
   CharPrev() reports as preceding s[offset] is examined as well before
   the byte is accepted as a lead byte. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;       /* CharPrev() did not move back at all */
    if (!IsDBCSLeadByte(*prev))
        return 1;       /* previous character does not start with a lead byte */
    /* The previous character starts with a lead byte: accept only if it
       is a full two-byte character ending right before curr. */
    return curr - prev == 2;
}
4507
4508/*
4509 * Decode MBCS string into unicode object. If 'final' is set, converts
4510 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4511 */
4512static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson29060642009-01-31 22:14:21 +00004513 const char *s, /* MBCS string */
4514 int size, /* sizeof MBCS string */
4515 int final)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004516{
4517 Py_UNICODE *p;
4518 Py_ssize_t n = 0;
4519 int usize = 0;
4520
4521 assert(size >= 0);
4522
4523 /* Skip trailing lead-byte unless 'final' is set */
4524 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004525 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004526
4527 /* First get the size of the result */
4528 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004529 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
4530 if (usize == 0) {
4531 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4532 return -1;
4533 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004534 }
4535
4536 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004537 /* Create unicode object */
4538 *v = _PyUnicode_New(usize);
4539 if (*v == NULL)
4540 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004541 }
4542 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004543 /* Extend unicode object */
4544 n = PyUnicode_GET_SIZE(*v);
4545 if (_PyUnicode_Resize(v, n + usize) < 0)
4546 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004547 }
4548
4549 /* Do the conversion */
4550 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004551 p = PyUnicode_AS_UNICODE(*v) + n;
4552 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
4553 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4554 return -1;
4555 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004556 }
4557
4558 return size;
4559}
4560
4561PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004562 Py_ssize_t size,
4563 const char *errors,
4564 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004565{
4566 PyUnicodeObject *v = NULL;
4567 int done;
4568
4569 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004570 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004571
4572#ifdef NEED_RETRY
4573 retry:
4574 if (size > INT_MAX)
Benjamin Peterson29060642009-01-31 22:14:21 +00004575 done = decode_mbcs(&v, s, INT_MAX, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004576 else
4577#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004578 done = decode_mbcs(&v, s, (int)size, !consumed);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004579
4580 if (done < 0) {
4581 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00004582 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004583 }
4584
4585 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004586 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004587
4588#ifdef NEED_RETRY
4589 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004590 s += done;
4591 size -= done;
4592 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004593 }
4594#endif
4595
4596 return (PyObject *)v;
4597}
4598
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004599PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004600 Py_ssize_t size,
4601 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004602{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004603 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4604}
4605
4606/*
4607 * Convert unicode into string object (MBCS).
4608 * Returns 0 if succeed, -1 otherwise.
4609 */
4610static int encode_mbcs(PyObject **repr,
Benjamin Peterson29060642009-01-31 22:14:21 +00004611 const Py_UNICODE *p, /* unicode */
4612 int size) /* size of unicode */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004613{
4614 int mbcssize = 0;
4615 Py_ssize_t n = 0;
4616
4617 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004618
4619 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004620 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004621 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4622 if (mbcssize == 0) {
4623 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4624 return -1;
4625 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004626 }
4627
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004628 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004629 /* Create string object */
4630 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
4631 if (*repr == NULL)
4632 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004633 }
4634 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004635 /* Extend string object */
4636 n = PyBytes_Size(*repr);
4637 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
4638 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004639 }
4640
4641 /* Do the conversion */
4642 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004643 char *s = PyBytes_AS_STRING(*repr) + n;
4644 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4645 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4646 return -1;
4647 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004648 }
4649
4650 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004651}
4652
4653PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004654 Py_ssize_t size,
4655 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004656{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004657 PyObject *repr = NULL;
4658 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004659
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004660#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00004661 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004662 if (size > INT_MAX)
Benjamin Peterson29060642009-01-31 22:14:21 +00004663 ret = encode_mbcs(&repr, p, INT_MAX);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004664 else
4665#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004666 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004667
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004668 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004669 Py_XDECREF(repr);
4670 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004671 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004672
4673#ifdef NEED_RETRY
4674 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004675 p += INT_MAX;
4676 size -= INT_MAX;
4677 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004678 }
4679#endif
4680
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004681 return repr;
4682}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004683
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004684PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4685{
4686 if (!PyUnicode_Check(unicode)) {
4687 PyErr_BadArgument();
4688 return NULL;
4689 }
4690 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004691 PyUnicode_GET_SIZE(unicode),
4692 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004693}
4694
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004695#undef NEED_RETRY
4696
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004697#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004698
Guido van Rossumd57fd912000-03-10 22:53:23 +00004699/* --- Character Mapping Codec -------------------------------------------- */
4700
/* Decode `size` bytes at `s` into a new Unicode object by looking each
   byte up in `mapping`.

   mapping == NULL       -> decode as Latin-1.
   exact unicode string  -> byte b maps to mapping[b]; a byte past the end
                            of the string, or one that maps to U+FFFE, is
                            treated as undefined.
   any other object      -> mapping[b] is fetched with PyObject_GetItem();
                            the result may be an integer in range(65536),
                            None (undefined), or a str of any length.  A
                            LookupError from the lookup is treated like
                            None.

   Undefined bytes are reported to unicode_decode_call_errorhandler() with
   encoding "charmap".  Returns a new reference, or NULL with an exception
   set. */
PyObject *PyUnicode_DecodeCharmap(const char *s,
                                  Py_ssize_t size,
                                  PyObject *mapping,
                                  const char *errors)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyUnicodeObject *v;
    Py_UNICODE *p;
    /* Spare output slots obtained by the 1-n resize below that have not
       been used up yet. */
    Py_ssize_t extrachars = 0;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    Py_UNICODE *mapstring = NULL;
    Py_ssize_t maplen = 0;

    /* Default to Latin-1 */
    if (mapping == NULL)
        return PyUnicode_DecodeLatin1(s, size, errors);

    /* Start with one output character per input byte. */
    v = _PyUnicode_New(size);
    if (v == NULL)
        goto onError;
    if (size == 0)
        return (PyObject *)v;
    p = PyUnicode_AS_UNICODE(v);
    e = s + size;
    if (PyUnicode_CheckExact(mapping)) {
        /* Fast path: index directly into the mapping string's buffer. */
        mapstring = PyUnicode_AS_UNICODE(mapping);
        maplen = PyUnicode_GET_SIZE(mapping);
        while (s < e) {
            unsigned char ch = *s;
            Py_UNICODE x = 0xfffe; /* illegal value */

            if (ch < maplen)
                x = mapstring[ch];

            if (x == 0xfffe) {
                /* undefined mapping */
                outpos = p-PyUnicode_AS_UNICODE(v);
                startinpos = s-starts;
                endinpos = startinpos+1;
                /* The handler is given the addresses of s, v and p --
                   presumably so it can advance the input and reallocate
                   the output; confirm against
                   unicode_decode_call_errorhandler(). */
                if (unicode_decode_call_errorhandler(
                        errors, &errorHandler,
                        "charmap", "character maps to <undefined>",
                        &starts, &e, &startinpos, &endinpos, &exc, &s,
                        &v, &outpos, &p)) {
                    goto onError;
                }
                continue;
            }
            *p++ = x;
            ++s;
        }
    }
    else {
        /* Generic path: one PyObject_GetItem() call per input byte. */
        while (s < e) {
            unsigned char ch = *s;
            PyObject *w, *x;

            /* Get mapping (char ordinal -> integer, Unicode char or None) */
            w = PyLong_FromLong((long)ch);
            if (w == NULL)
                goto onError;
            x = PyObject_GetItem(mapping, w);
            Py_DECREF(w);
            if (x == NULL) {
                if (PyErr_ExceptionMatches(PyExc_LookupError)) {
                    /* No mapping found means: mapping is undefined. */
                    PyErr_Clear();
                    x = Py_None;
                    Py_INCREF(x);
                } else
                    goto onError;
            }

            /* Apply mapping.  From here on x is an owned reference that
               every exit path must release. */
            if (PyLong_Check(x)) {
                long value = PyLong_AS_LONG(x);
                if (value < 0 || value > 65535) {
                    PyErr_SetString(PyExc_TypeError,
                                    "character mapping must be in range(65536)");
                    Py_DECREF(x);
                    goto onError;
                }
                *p++ = (Py_UNICODE)value;
            }
            else if (x == Py_None) {
                /* undefined mapping */
                outpos = p-PyUnicode_AS_UNICODE(v);
                startinpos = s-starts;
                endinpos = startinpos+1;
                if (unicode_decode_call_errorhandler(
                        errors, &errorHandler,
                        "charmap", "character maps to <undefined>",
                        &starts, &e, &startinpos, &endinpos, &exc, &s,
                        &v, &outpos, &p)) {
                    Py_DECREF(x);
                    goto onError;
                }
                Py_DECREF(x);
                /* s is not incremented here; see the note on the handler
                   call in the string-mapping branch above. */
                continue;
            }
            else if (PyUnicode_Check(x)) {
                Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);

                if (targetsize == 1)
                    /* 1-1 mapping */
                    *p++ = *PyUnicode_AS_UNICODE(x);

                else if (targetsize > 1) {
                    /* 1-n mapping */
                    if (targetsize > extrachars) {
                        /* resize first */
                        /* Remember the write position as an offset: the
                           resize may move the buffer. */
                        Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
                        /* Over-allocate (shortfall plus four times the
                           replacement length) so that later 1-n mappings
                           do not each force a resize. */
                        Py_ssize_t needed = (targetsize - extrachars) + \
                            (targetsize << 2);
                        extrachars += needed;
                        /* XXX overflow detection missing */
                        if (_PyUnicode_Resize(&v,
                                              PyUnicode_GET_SIZE(v) + needed) < 0) {
                            Py_DECREF(x);
                            goto onError;
                        }
                        p = PyUnicode_AS_UNICODE(v) + oldpos;
                    }
                    Py_UNICODE_COPY(p,
                                    PyUnicode_AS_UNICODE(x),
                                    targetsize);
                    p += targetsize;
                    extrachars -= targetsize;
                }
                /* 1-0 mapping: skip the character */
            }
            else {
                /* wrong return value */
                PyErr_SetString(PyExc_TypeError,
                                "character mapping must return integer, None or str");
                Py_DECREF(x);
                goto onError;
            }
            Py_DECREF(x);
            ++s;
        }
    }
    /* Trim the result if fewer characters were written than allocated. */
    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
        if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
            goto onError;
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)v;

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_XDECREF(v);
    return NULL;
}
4861
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004862/* Charmap encoding: the lookup table */
4863
/* Three-level trie mapping a BMP code point to a single byte, as built by
   PyUnicode_BuildEncodingMap() and read by encoding_map_lookup().  The
   object is allocated with extra room after the struct for level23. */
struct encoding_map{
    PyObject_HEAD
    /* Indexed by (c >> 11): number of the 16-byte level-2 block for that
       range, or 0xFF if no character in the range is mapped. */
    unsigned char level1[32];
    /* Number of level-2 blocks (16 bytes each) and of level-3 blocks
       (128 bytes each) stored in level23. */
    int count2, count3;
    /* Variable-length tail: 16*count2 bytes of level-2 entries followed by
       128*count3 bytes of level-3 entries.  Declared with one element; the
       real length is fixed when the object is allocated. */
    unsigned char level23[1];
};
4870
4871static PyObject*
4872encoding_map_size(PyObject *obj, PyObject* args)
4873{
4874 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004875 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00004876 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004877}
4878
/* Method table for EncodingMap objects: only size(), which takes no
   arguments.  The zero-filled entry terminates the table. */
static PyMethodDef encoding_map_methods[] = {
    {"size", encoding_map_size, METH_NOARGS,
     PyDoc_STR("Return the size (in bytes) of this object") },
    { 0 }
};
4884
4885static void
4886encoding_map_dealloc(PyObject* o)
4887{
Benjamin Peterson14339b62009-01-31 16:36:08 +00004888 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004889}
4890
/* Type object for struct encoding_map instances.  Only tp_name,
   tp_basicsize, tp_dealloc, tp_flags and tp_methods are filled in; every
   other slot is 0.  Instances are created directly by
   PyUnicode_BuildEncodingMap() (tp_new and tp_alloc are 0). */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,   /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_reserved*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};
4934
/* Build an encoding table from a 256-character decoding string, where
   string[b] is the character that byte b decodes to and U+FFFE marks an
   unmapped byte.

   The result is either an EncodingMap object (a three-level trie for
   encoding_map_lookup()) or, when the trie cannot represent the mapping,
   a plain dict mapping each character ordinal to its byte value.

   Returns a new reference.  Returns NULL after PyErr_BadArgument() if
   `string` is not a Unicode object of length 256, and NULL on allocation
   failure. */
PyObject*
PyUnicode_BuildEncodingMap(PyObject* string)
{
    Py_UNICODE *decode;
    PyObject *result;
    struct encoding_map *mresult;
    int i;
    int need_dict = 0;
    /* Scratch tables for the sizing pass: 0xFF means "not seen yet",
       anything else is the block number assigned to that index. */
    unsigned char level1[32];
    unsigned char level2[512];
    unsigned char *mlevel1, *mlevel2, *mlevel3;
    int count2 = 0, count3 = 0;

    if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
        PyErr_BadArgument();
        return NULL;
    }
    decode = PyUnicode_AS_UNICODE(string);
    memset(level1, 0xFF, sizeof level1);
    memset(level2, 0xFF, sizeof level2);

    /* If there isn't a one-to-one mapping of NULL to \0,
       or if there are non-BMP characters, we need to use
       a mapping dictionary. */
    if (decode[0] != 0)
        need_dict = 1;
    /* Pass 1: count how many level-2 and level-3 blocks the trie needs. */
    for (i = 1; i < 256; i++) {
        int l1, l2;
        if (decode[i] == 0
#ifdef Py_UNICODE_WIDE
            || decode[i] > 0xFFFF
#endif
            ) {
            need_dict = 1;
            break;
        }
        if (decode[i] == 0xFFFE)
            /* unmapped character */
            continue;
        l1 = decode[i] >> 11;   /* top 5 bits: index into level1 */
        l2 = decode[i] >> 7;    /* top 9 bits: identifies a 128-char block */
        if (level1[l1] == 0xFF)
            level1[l1] = count2++;
        if (level2[l2] == 0xFF)
            level2[l2] = count3++;
    }

    /* 0xFF is the "empty" marker in the byte-sized tables, so it cannot
       also be used as a block number. */
    if (count2 >= 0xFF || count3 >= 0xFF)
        need_dict = 1;

    if (need_dict) {
        /* Fallback: dict of {character ordinal: byte value} covering all
           256 entries of the decoding string.  This local `result`
           shadows the outer one. */
        PyObject *result = PyDict_New();
        PyObject *key, *value;
        if (!result)
            return NULL;
        for (i = 0; i < 256; i++) {
            key = value = NULL;
            key = PyLong_FromLong(decode[i]);
            value = PyLong_FromLong(i);
            if (!key || !value)
                goto failed1;
            if (PyDict_SetItem(result, key, value) == -1)
                goto failed1;
            Py_DECREF(key);
            Py_DECREF(value);
        }
        return result;
      failed1:
        Py_XDECREF(key);
        Py_XDECREF(value);
        Py_DECREF(result);
        return NULL;
    }

    /* Create a three-level trie */
    /* The -1 accounts for the one-byte level23 placeholder already
       included in sizeof(struct encoding_map). */
    result = PyObject_MALLOC(sizeof(struct encoding_map) +
                             16*count2 + 128*count3 - 1);
    if (!result)
        return PyErr_NoMemory();
    PyObject_Init(result, &EncodingMapType);
    mresult = (struct encoding_map*)result;
    mresult->count2 = count2;
    mresult->count3 = count3;
    mlevel1 = mresult->level1;
    mlevel2 = mresult->level23;
    mlevel3 = mresult->level23 + 16*count2;
    memcpy(mlevel1, level1, 32);
    memset(mlevel2, 0xFF, 16*count2);   /* 0xFF: no level-3 block assigned */
    memset(mlevel3, 0, 128*count3);     /* 0: character not encodable */
    /* Pass 2: fill the trie.  Level-3 block numbers are handed out again
       from zero as level-2 slots are first touched. */
    count3 = 0;
    for (i = 1; i < 256; i++) {
        int o1, o2, o3, i2, i3;
        if (decode[i] == 0xFFFE)
            /* unmapped character */
            continue;
        o1 = decode[i]>>11;
        o2 = (decode[i]>>7) & 0xF;
        i2 = 16*mlevel1[o1] + o2;
        if (mlevel2[i2] == 0xFF)
            mlevel2[i2] = count3++;
        o3 = decode[i] & 0x7F;
        i3 = 128*mlevel2[i2] + o3;
        mlevel3[i3] = i;
    }
    return result;
}
5041
5042static int
5043encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
5044{
5045 struct encoding_map *map = (struct encoding_map*)mapping;
5046 int l1 = c>>11;
5047 int l2 = (c>>7) & 0xF;
5048 int l3 = c & 0x7F;
5049 int i;
5050
5051#ifdef Py_UNICODE_WIDE
5052 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005053 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005054 }
5055#endif
5056 if (c == 0)
5057 return 0;
5058 /* level 1*/
5059 i = map->level1[l1];
5060 if (i == 0xFF) {
5061 return -1;
5062 }
5063 /* level 2*/
5064 i = map->level23[16*i+l2];
5065 if (i == 0xFF) {
5066 return -1;
5067 }
5068 /* level 3 */
5069 i = map->level23[16*map->count2 + 128*i + l3];
5070 if (i == 0) {
5071 return -1;
5072 }
5073 return i;
5074}
5075
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005076/* Lookup the character ch in the mapping. If the character
5077 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00005078 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005079static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005080{
Christian Heimes217cfd12007-12-02 14:31:20 +00005081 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005082 PyObject *x;
5083
5084 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005085 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005086 x = PyObject_GetItem(mapping, w);
5087 Py_DECREF(w);
5088 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005089 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5090 /* No mapping found means: mapping is undefined. */
5091 PyErr_Clear();
5092 x = Py_None;
5093 Py_INCREF(x);
5094 return x;
5095 } else
5096 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005097 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00005098 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005099 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00005100 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005101 long value = PyLong_AS_LONG(x);
5102 if (value < 0 || value > 255) {
5103 PyErr_SetString(PyExc_TypeError,
5104 "character mapping must be in range(256)");
5105 Py_DECREF(x);
5106 return NULL;
5107 }
5108 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005109 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005110 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00005111 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005112 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005113 /* wrong return value */
5114 PyErr_Format(PyExc_TypeError,
5115 "character mapping must return integer, bytes or None, not %.400s",
5116 x->ob_type->tp_name);
5117 Py_DECREF(x);
5118 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005119 }
5120}
5121
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005122static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005123charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005124{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005125 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5126 /* exponentially overallocate to minimize reallocations */
5127 if (requiredsize < 2*outsize)
5128 requiredsize = 2*outsize;
5129 if (_PyBytes_Resize(outobj, requiredsize))
5130 return -1;
5131 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005132}
5133
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was written to the output */
    enc_FAILED,         /* the mapping has no entry for the character */
    enc_EXCEPTION       /* lookup or resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005137/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005138 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005139 space is available. Return a new reference to the object that
5140 was put in the output buffer, or Py_None, if the mapping was undefined
5141 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005142 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005143static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005144charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00005145 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005146{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005147 PyObject *rep;
5148 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005149 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005150
Christian Heimes90aa7642007-12-19 02:45:37 +00005151 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005152 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005153 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005154 if (res == -1)
5155 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005156 if (outsize<requiredsize)
5157 if (charmapencode_resize(outobj, outpos, requiredsize))
5158 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005159 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005160 outstart[(*outpos)++] = (char)res;
5161 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005162 }
5163
5164 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005165 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005166 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005167 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005168 Py_DECREF(rep);
5169 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005170 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005171 if (PyLong_Check(rep)) {
5172 Py_ssize_t requiredsize = *outpos+1;
5173 if (outsize<requiredsize)
5174 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5175 Py_DECREF(rep);
5176 return enc_EXCEPTION;
5177 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005178 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005179 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005180 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005181 else {
5182 const char *repchars = PyBytes_AS_STRING(rep);
5183 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5184 Py_ssize_t requiredsize = *outpos+repsize;
5185 if (outsize<requiredsize)
5186 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5187 Py_DECREF(rep);
5188 return enc_EXCEPTION;
5189 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005190 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005191 memcpy(outstart + *outpos, repchars, repsize);
5192 *outpos += repsize;
5193 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005194 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005195 Py_DECREF(rep);
5196 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005197}
5198
5199/* handle an error in PyUnicode_EncodeCharmap
5200 Return 0 on success, -1 on error */
5201static
5202int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005203 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005204 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005205 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005206 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005207{
5208 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005209 Py_ssize_t repsize;
5210 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005211 Py_UNICODE *uni2;
5212 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005213 Py_ssize_t collstartpos = *inpos;
5214 Py_ssize_t collendpos = *inpos+1;
5215 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005216 char *encoding = "charmap";
5217 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005218 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005219
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005220 /* find all unencodable characters */
5221 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005222 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005223 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005224 int res = encoding_map_lookup(p[collendpos], mapping);
5225 if (res != -1)
5226 break;
5227 ++collendpos;
5228 continue;
5229 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005230
Benjamin Peterson29060642009-01-31 22:14:21 +00005231 rep = charmapencode_lookup(p[collendpos], mapping);
5232 if (rep==NULL)
5233 return -1;
5234 else if (rep!=Py_None) {
5235 Py_DECREF(rep);
5236 break;
5237 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005238 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005239 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005240 }
5241 /* cache callback name lookup
5242 * (if not done yet, i.e. it's the first error) */
5243 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005244 if ((errors==NULL) || (!strcmp(errors, "strict")))
5245 *known_errorHandler = 1;
5246 else if (!strcmp(errors, "replace"))
5247 *known_errorHandler = 2;
5248 else if (!strcmp(errors, "ignore"))
5249 *known_errorHandler = 3;
5250 else if (!strcmp(errors, "xmlcharrefreplace"))
5251 *known_errorHandler = 4;
5252 else
5253 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005254 }
5255 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005256 case 1: /* strict */
5257 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5258 return -1;
5259 case 2: /* replace */
5260 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005261 x = charmapencode_output('?', mapping, res, respos);
5262 if (x==enc_EXCEPTION) {
5263 return -1;
5264 }
5265 else if (x==enc_FAILED) {
5266 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5267 return -1;
5268 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005269 }
5270 /* fall through */
5271 case 3: /* ignore */
5272 *inpos = collendpos;
5273 break;
5274 case 4: /* xmlcharrefreplace */
5275 /* generate replacement (temporarily (mis)uses p) */
5276 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005277 char buffer[2+29+1+1];
5278 char *cp;
5279 sprintf(buffer, "&#%d;", (int)p[collpos]);
5280 for (cp = buffer; *cp; ++cp) {
5281 x = charmapencode_output(*cp, mapping, res, respos);
5282 if (x==enc_EXCEPTION)
5283 return -1;
5284 else if (x==enc_FAILED) {
5285 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5286 return -1;
5287 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005288 }
5289 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005290 *inpos = collendpos;
5291 break;
5292 default:
5293 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005294 encoding, reason, p, size, exceptionObject,
5295 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005296 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005297 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005298 if (PyBytes_Check(repunicode)) {
5299 /* Directly copy bytes result to output. */
5300 Py_ssize_t outsize = PyBytes_Size(*res);
5301 Py_ssize_t requiredsize;
5302 repsize = PyBytes_Size(repunicode);
5303 requiredsize = *respos + repsize;
5304 if (requiredsize > outsize)
5305 /* Make room for all additional bytes. */
5306 if (charmapencode_resize(res, respos, requiredsize)) {
5307 Py_DECREF(repunicode);
5308 return -1;
5309 }
5310 memcpy(PyBytes_AsString(*res) + *respos,
5311 PyBytes_AsString(repunicode), repsize);
5312 *respos += repsize;
5313 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005314 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005315 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005316 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005317 /* generate replacement */
5318 repsize = PyUnicode_GET_SIZE(repunicode);
5319 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005320 x = charmapencode_output(*uni2, mapping, res, respos);
5321 if (x==enc_EXCEPTION) {
5322 return -1;
5323 }
5324 else if (x==enc_FAILED) {
5325 Py_DECREF(repunicode);
5326 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5327 return -1;
5328 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005329 }
5330 *inpos = newpos;
5331 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005332 }
5333 return 0;
5334}
5335
Guido van Rossumd57fd912000-03-10 22:53:23 +00005336PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005337 Py_ssize_t size,
5338 PyObject *mapping,
5339 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005340{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005341 /* output object */
5342 PyObject *res = NULL;
5343 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005344 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005345 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005346 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005347 PyObject *errorHandler = NULL;
5348 PyObject *exc = NULL;
5349 /* the following variable is used for caching string comparisons
5350 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5351 * 3=ignore, 4=xmlcharrefreplace */
5352 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005353
5354 /* Default to Latin-1 */
5355 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005356 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005357
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005358 /* allocate enough for a simple encoding without
5359 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005360 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005361 if (res == NULL)
5362 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005363 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005364 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005365
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005366 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005367 /* try to encode it */
5368 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5369 if (x==enc_EXCEPTION) /* error */
5370 goto onError;
5371 if (x==enc_FAILED) { /* unencodable character */
5372 if (charmap_encoding_error(p, size, &inpos, mapping,
5373 &exc,
5374 &known_errorHandler, &errorHandler, errors,
5375 &res, &respos)) {
5376 goto onError;
5377 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005378 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005379 else
5380 /* done with this character => adjust input position */
5381 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005382 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005383
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005384 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005385 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005386 if (_PyBytes_Resize(&res, respos) < 0)
5387 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005388
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005389 Py_XDECREF(exc);
5390 Py_XDECREF(errorHandler);
5391 return res;
5392
Benjamin Peterson29060642009-01-31 22:14:21 +00005393 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005394 Py_XDECREF(res);
5395 Py_XDECREF(exc);
5396 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005397 return NULL;
5398}
5399
5400PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00005401 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005402{
5403 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005404 PyErr_BadArgument();
5405 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005406 }
5407 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005408 PyUnicode_GET_SIZE(unicode),
5409 mapping,
5410 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005411}
5412
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005413/* create or adjust a UnicodeTranslateError */
5414static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005415 const Py_UNICODE *unicode, Py_ssize_t size,
5416 Py_ssize_t startpos, Py_ssize_t endpos,
5417 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005418{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005419 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005420 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005421 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005422 }
5423 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005424 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5425 goto onError;
5426 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5427 goto onError;
5428 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5429 goto onError;
5430 return;
5431 onError:
5432 Py_DECREF(*exceptionObject);
5433 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005434 }
5435}
5436
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005437/* raises a UnicodeTranslateError */
5438static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005439 const Py_UNICODE *unicode, Py_ssize_t size,
5440 Py_ssize_t startpos, Py_ssize_t endpos,
5441 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005442{
5443 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005444 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005445 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005446 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005447}
5448
5449/* error handling callback helper:
5450 build arguments, call the callback and check the arguments,
5451 put the result into newpos and return the replacement string, which
5452 has to be freed by the caller */
5453static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00005454 PyObject **errorHandler,
5455 const char *reason,
5456 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5457 Py_ssize_t startpos, Py_ssize_t endpos,
5458 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005459{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005460 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005461
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005462 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005463 PyObject *restuple;
5464 PyObject *resunicode;
5465
5466 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005467 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005468 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005469 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005470 }
5471
5472 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005473 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005474 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005475 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005476
5477 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005478 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005479 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005480 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005481 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005482 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005483 Py_DECREF(restuple);
5484 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005485 }
5486 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00005487 &resunicode, &i_newpos)) {
5488 Py_DECREF(restuple);
5489 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005490 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005491 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005492 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005493 else
5494 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005495 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005496 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5497 Py_DECREF(restuple);
5498 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005499 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005500 Py_INCREF(resunicode);
5501 Py_DECREF(restuple);
5502 return resunicode;
5503}
5504
5505/* Lookup the character ch in the mapping and put the result in result,
5506 which must be decrefed by the caller.
5507 Return 0 on success, -1 on error */
5508static
5509int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5510{
Christian Heimes217cfd12007-12-02 14:31:20 +00005511 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005512 PyObject *x;
5513
5514 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005515 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005516 x = PyObject_GetItem(mapping, w);
5517 Py_DECREF(w);
5518 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005519 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5520 /* No mapping found means: use 1:1 mapping. */
5521 PyErr_Clear();
5522 *result = NULL;
5523 return 0;
5524 } else
5525 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005526 }
5527 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005528 *result = x;
5529 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005530 }
Christian Heimes217cfd12007-12-02 14:31:20 +00005531 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005532 long value = PyLong_AS_LONG(x);
5533 long max = PyUnicode_GetMax();
5534 if (value < 0 || value > max) {
5535 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00005536 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00005537 Py_DECREF(x);
5538 return -1;
5539 }
5540 *result = x;
5541 return 0;
5542 }
5543 else if (PyUnicode_Check(x)) {
5544 *result = x;
5545 return 0;
5546 }
5547 else {
5548 /* wrong return value */
5549 PyErr_SetString(PyExc_TypeError,
5550 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005551 Py_DECREF(x);
5552 return -1;
5553 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005554}
5555/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00005556 if not reallocate and adjust various state variables.
5557 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005558static
Walter Dörwald4894c302003-10-24 14:25:28 +00005559int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005560 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005561{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005562 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00005563 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005564 /* remember old output position */
5565 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
5566 /* exponentially overallocate to minimize reallocations */
5567 if (requiredsize < 2 * oldsize)
5568 requiredsize = 2 * oldsize;
5569 if (PyUnicode_Resize(outobj, requiredsize) < 0)
5570 return -1;
5571 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005572 }
5573 return 0;
5574}
5575/* lookup the character, put the result in the output string and adjust
5576 various state variables. Return a new reference to the object that
5577 was put in the output buffer in *result, or Py_None, if the mapping was
5578 undefined (in which case no character was written).
5579 The called must decref result.
5580 Return 0 on success, -1 on error. */
5581static
Walter Dörwald4894c302003-10-24 14:25:28 +00005582int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005583 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5584 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005585{
Walter Dörwald4894c302003-10-24 14:25:28 +00005586 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00005587 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005588 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005589 /* not found => default to 1:1 mapping */
5590 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005591 }
5592 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005593 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00005594 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005595 /* no overflow check, because we know that the space is enough */
5596 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005597 }
5598 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005599 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5600 if (repsize==1) {
5601 /* no overflow check, because we know that the space is enough */
5602 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5603 }
5604 else if (repsize!=0) {
5605 /* more than one character */
5606 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5607 (insize - (curinp-startinp)) +
5608 repsize - 1;
5609 if (charmaptranslate_makespace(outobj, outp, requiredsize))
5610 return -1;
5611 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5612 *outp += repsize;
5613 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005614 }
5615 else
Benjamin Peterson29060642009-01-31 22:14:21 +00005616 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005617 return 0;
5618}
5619
5620PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005621 Py_ssize_t size,
5622 PyObject *mapping,
5623 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005624{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005625 /* output object */
5626 PyObject *res = NULL;
5627 /* pointers to the beginning and end+1 of input */
5628 const Py_UNICODE *startp = p;
5629 const Py_UNICODE *endp = p + size;
5630 /* pointer into the output */
5631 Py_UNICODE *str;
5632 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005633 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005634 char *reason = "character maps to <undefined>";
5635 PyObject *errorHandler = NULL;
5636 PyObject *exc = NULL;
5637 /* the following variable is used for caching string comparisons
5638 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5639 * 3=ignore, 4=xmlcharrefreplace */
5640 int known_errorHandler = -1;
5641
Guido van Rossumd57fd912000-03-10 22:53:23 +00005642 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005643 PyErr_BadArgument();
5644 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005645 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005646
5647 /* allocate enough for a simple 1:1 translation without
5648 replacements, if we need more, we'll resize */
5649 res = PyUnicode_FromUnicode(NULL, size);
5650 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005651 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005652 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005653 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005654 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005655
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005656 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005657 /* try to encode it */
5658 PyObject *x = NULL;
5659 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5660 Py_XDECREF(x);
5661 goto onError;
5662 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005663 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00005664 if (x!=Py_None) /* it worked => adjust input pointer */
5665 ++p;
5666 else { /* untranslatable character */
5667 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5668 Py_ssize_t repsize;
5669 Py_ssize_t newpos;
5670 Py_UNICODE *uni2;
5671 /* startpos for collecting untranslatable chars */
5672 const Py_UNICODE *collstart = p;
5673 const Py_UNICODE *collend = p+1;
5674 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005675
Benjamin Peterson29060642009-01-31 22:14:21 +00005676 /* find all untranslatable characters */
5677 while (collend < endp) {
5678 if (charmaptranslate_lookup(*collend, mapping, &x))
5679 goto onError;
5680 Py_XDECREF(x);
5681 if (x!=Py_None)
5682 break;
5683 ++collend;
5684 }
5685 /* cache callback name lookup
5686 * (if not done yet, i.e. it's the first error) */
5687 if (known_errorHandler==-1) {
5688 if ((errors==NULL) || (!strcmp(errors, "strict")))
5689 known_errorHandler = 1;
5690 else if (!strcmp(errors, "replace"))
5691 known_errorHandler = 2;
5692 else if (!strcmp(errors, "ignore"))
5693 known_errorHandler = 3;
5694 else if (!strcmp(errors, "xmlcharrefreplace"))
5695 known_errorHandler = 4;
5696 else
5697 known_errorHandler = 0;
5698 }
5699 switch (known_errorHandler) {
5700 case 1: /* strict */
5701 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005702 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005703 case 2: /* replace */
5704 /* No need to check for space, this is a 1:1 replacement */
5705 for (coll = collstart; coll<collend; ++coll)
5706 *str++ = '?';
5707 /* fall through */
5708 case 3: /* ignore */
5709 p = collend;
5710 break;
5711 case 4: /* xmlcharrefreplace */
5712 /* generate replacement (temporarily (mis)uses p) */
5713 for (p = collstart; p < collend; ++p) {
5714 char buffer[2+29+1+1];
5715 char *cp;
5716 sprintf(buffer, "&#%d;", (int)*p);
5717 if (charmaptranslate_makespace(&res, &str,
5718 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5719 goto onError;
5720 for (cp = buffer; *cp; ++cp)
5721 *str++ = *cp;
5722 }
5723 p = collend;
5724 break;
5725 default:
5726 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5727 reason, startp, size, &exc,
5728 collstart-startp, collend-startp, &newpos);
5729 if (repunicode == NULL)
5730 goto onError;
5731 /* generate replacement */
5732 repsize = PyUnicode_GET_SIZE(repunicode);
5733 if (charmaptranslate_makespace(&res, &str,
5734 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5735 Py_DECREF(repunicode);
5736 goto onError;
5737 }
5738 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5739 *str++ = *uni2;
5740 p = startp + newpos;
5741 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005742 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005743 }
5744 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005745 /* Resize if we allocated to much */
5746 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005747 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005748 if (PyUnicode_Resize(&res, respos) < 0)
5749 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005750 }
5751 Py_XDECREF(exc);
5752 Py_XDECREF(errorHandler);
5753 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005754
Benjamin Peterson29060642009-01-31 22:14:21 +00005755 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005756 Py_XDECREF(res);
5757 Py_XDECREF(exc);
5758 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005759 return NULL;
5760}
5761
5762PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005763 PyObject *mapping,
5764 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005765{
5766 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005767
Guido van Rossumd57fd912000-03-10 22:53:23 +00005768 str = PyUnicode_FromObject(str);
5769 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005770 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005771 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00005772 PyUnicode_GET_SIZE(str),
5773 mapping,
5774 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005775 Py_DECREF(str);
5776 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005777
Benjamin Peterson29060642009-01-31 22:14:21 +00005778 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005779 Py_XDECREF(str);
5780 return NULL;
5781}
Tim Petersced69f82003-09-16 20:30:58 +00005782
Guido van Rossum9e896b32000-04-05 20:11:21 +00005783/* --- Decimal Encoder ---------------------------------------------------- */
5784
5785int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005786 Py_ssize_t length,
5787 char *output,
5788 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005789{
5790 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005791 PyObject *errorHandler = NULL;
5792 PyObject *exc = NULL;
5793 const char *encoding = "decimal";
5794 const char *reason = "invalid decimal Unicode string";
5795 /* the following variable is used for caching string comparisons
5796 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5797 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005798
5799 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005800 PyErr_BadArgument();
5801 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005802 }
5803
5804 p = s;
5805 end = s + length;
5806 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005807 register Py_UNICODE ch = *p;
5808 int decimal;
5809 PyObject *repunicode;
5810 Py_ssize_t repsize;
5811 Py_ssize_t newpos;
5812 Py_UNICODE *uni2;
5813 Py_UNICODE *collstart;
5814 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005815
Benjamin Peterson29060642009-01-31 22:14:21 +00005816 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005817 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00005818 ++p;
5819 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005820 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005821 decimal = Py_UNICODE_TODECIMAL(ch);
5822 if (decimal >= 0) {
5823 *output++ = '0' + decimal;
5824 ++p;
5825 continue;
5826 }
5827 if (0 < ch && ch < 256) {
5828 *output++ = (char)ch;
5829 ++p;
5830 continue;
5831 }
5832 /* All other characters are considered unencodable */
5833 collstart = p;
5834 collend = p+1;
5835 while (collend < end) {
5836 if ((0 < *collend && *collend < 256) ||
5837 !Py_UNICODE_ISSPACE(*collend) ||
5838 Py_UNICODE_TODECIMAL(*collend))
5839 break;
5840 }
5841 /* cache callback name lookup
5842 * (if not done yet, i.e. it's the first error) */
5843 if (known_errorHandler==-1) {
5844 if ((errors==NULL) || (!strcmp(errors, "strict")))
5845 known_errorHandler = 1;
5846 else if (!strcmp(errors, "replace"))
5847 known_errorHandler = 2;
5848 else if (!strcmp(errors, "ignore"))
5849 known_errorHandler = 3;
5850 else if (!strcmp(errors, "xmlcharrefreplace"))
5851 known_errorHandler = 4;
5852 else
5853 known_errorHandler = 0;
5854 }
5855 switch (known_errorHandler) {
5856 case 1: /* strict */
5857 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5858 goto onError;
5859 case 2: /* replace */
5860 for (p = collstart; p < collend; ++p)
5861 *output++ = '?';
5862 /* fall through */
5863 case 3: /* ignore */
5864 p = collend;
5865 break;
5866 case 4: /* xmlcharrefreplace */
5867 /* generate replacement (temporarily (mis)uses p) */
5868 for (p = collstart; p < collend; ++p)
5869 output += sprintf(output, "&#%d;", (int)*p);
5870 p = collend;
5871 break;
5872 default:
5873 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5874 encoding, reason, s, length, &exc,
5875 collstart-s, collend-s, &newpos);
5876 if (repunicode == NULL)
5877 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005878 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00005879 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005880 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
5881 Py_DECREF(repunicode);
5882 goto onError;
5883 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005884 /* generate replacement */
5885 repsize = PyUnicode_GET_SIZE(repunicode);
5886 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5887 Py_UNICODE ch = *uni2;
5888 if (Py_UNICODE_ISSPACE(ch))
5889 *output++ = ' ';
5890 else {
5891 decimal = Py_UNICODE_TODECIMAL(ch);
5892 if (decimal >= 0)
5893 *output++ = '0' + decimal;
5894 else if (0 < ch && ch < 256)
5895 *output++ = (char)ch;
5896 else {
5897 Py_DECREF(repunicode);
5898 raise_encode_exception(&exc, encoding,
5899 s, length, collstart-s, collend-s, reason);
5900 goto onError;
5901 }
5902 }
5903 }
5904 p = s + newpos;
5905 Py_DECREF(repunicode);
5906 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005907 }
5908 /* 0-terminate the output string */
5909 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005910 Py_XDECREF(exc);
5911 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005912 return 0;
5913
Benjamin Peterson29060642009-01-31 22:14:21 +00005914 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005915 Py_XDECREF(exc);
5916 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005917 return -1;
5918}
5919
Guido van Rossumd57fd912000-03-10 22:53:23 +00005920/* --- Helpers ------------------------------------------------------------ */
5921
Eric Smith8c663262007-08-25 02:26:07 +00005922#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005923#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00005924
Thomas Wouters477c8d52006-05-27 19:21:47 +00005925#include "stringlib/count.h"
5926#include "stringlib/find.h"
5927#include "stringlib/partition.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00005928#include "stringlib/split.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005929
Eric Smith5807c412008-05-11 21:00:57 +00005930#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00005931#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00005932#include "stringlib/localeutil.h"
5933
/* Helper macro to fix up start/end slice values in place.

   A negative `start` or `end` is taken relative to `len` and then
   clamped to 0; `end` is additionally capped at `len`.  `start` is NOT
   capped at `len`, so start > end is possible afterwards and callers
   must cope with an empty (or negative-length) slice.

   `start` and `end` must be modifiable lvalues.  Every argument may be
   evaluated more than once, so none of them may have side effects.
   The body is wrapped in do { } while (0) so that the macro expands to
   exactly one statement and can be used as an unbraced if/else body. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00005948
Martin v. Löwis18e16552006-02-15 17:27:45 +00005949Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005950 PyObject *substr,
5951 Py_ssize_t start,
5952 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005953{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005954 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005955 PyUnicodeObject* str_obj;
5956 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005957
Thomas Wouters477c8d52006-05-27 19:21:47 +00005958 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5959 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00005960 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005961 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5962 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005963 Py_DECREF(str_obj);
5964 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005965 }
Tim Petersced69f82003-09-16 20:30:58 +00005966
Antoine Pitrouf2c54842010-01-13 08:07:53 +00005967 ADJUST_INDICES(start, end, str_obj->length);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005968 result = stringlib_count(
Antoine Pitrouf2c54842010-01-13 08:07:53 +00005969 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
5970 PY_SSIZE_T_MAX
Thomas Wouters477c8d52006-05-27 19:21:47 +00005971 );
5972
5973 Py_DECREF(sub_obj);
5974 Py_DECREF(str_obj);
5975
Guido van Rossumd57fd912000-03-10 22:53:23 +00005976 return result;
5977}
5978
Martin v. Löwis18e16552006-02-15 17:27:45 +00005979Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005980 PyObject *sub,
5981 Py_ssize_t start,
5982 Py_ssize_t end,
5983 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005984{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005985 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005986
Guido van Rossumd57fd912000-03-10 22:53:23 +00005987 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005988 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00005989 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005990 sub = PyUnicode_FromObject(sub);
5991 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005992 Py_DECREF(str);
5993 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005994 }
Tim Petersced69f82003-09-16 20:30:58 +00005995
Thomas Wouters477c8d52006-05-27 19:21:47 +00005996 if (direction > 0)
5997 result = stringlib_find_slice(
5998 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5999 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6000 start, end
6001 );
6002 else
6003 result = stringlib_rfind_slice(
6004 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6005 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6006 start, end
6007 );
6008
Guido van Rossumd57fd912000-03-10 22:53:23 +00006009 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006010 Py_DECREF(sub);
6011
Guido van Rossumd57fd912000-03-10 22:53:23 +00006012 return result;
6013}
6014
Tim Petersced69f82003-09-16 20:30:58 +00006015static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006016int tailmatch(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006017 PyUnicodeObject *substring,
6018 Py_ssize_t start,
6019 Py_ssize_t end,
6020 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006021{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006022 if (substring->length == 0)
6023 return 1;
6024
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006025 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006026 end -= substring->length;
6027 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00006028 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006029
6030 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006031 if (Py_UNICODE_MATCH(self, end, substring))
6032 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006033 } else {
6034 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00006035 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006036 }
6037
6038 return 0;
6039}
6040
Martin v. Löwis18e16552006-02-15 17:27:45 +00006041Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006042 PyObject *substr,
6043 Py_ssize_t start,
6044 Py_ssize_t end,
6045 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006046{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006047 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006048
Guido van Rossumd57fd912000-03-10 22:53:23 +00006049 str = PyUnicode_FromObject(str);
6050 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006051 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006052 substr = PyUnicode_FromObject(substr);
6053 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006054 Py_DECREF(str);
6055 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006056 }
Tim Petersced69f82003-09-16 20:30:58 +00006057
Guido van Rossumd57fd912000-03-10 22:53:23 +00006058 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006059 (PyUnicodeObject *)substr,
6060 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006061 Py_DECREF(str);
6062 Py_DECREF(substr);
6063 return result;
6064}
6065
Guido van Rossumd57fd912000-03-10 22:53:23 +00006066/* Apply fixfct filter to the Unicode object self and return a
6067 reference to the modified object */
6068
Tim Petersced69f82003-09-16 20:30:58 +00006069static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006070PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006071 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006072{
6073
6074 PyUnicodeObject *u;
6075
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006076 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006077 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006078 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006079
6080 Py_UNICODE_COPY(u->str, self->str, self->length);
6081
Tim Peters7a29bd52001-09-12 03:03:31 +00006082 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006083 /* fixfct should return TRUE if it modified the buffer. If
6084 FALSE, return a reference to the original buffer instead
6085 (to save space, not time) */
6086 Py_INCREF(self);
6087 Py_DECREF(u);
6088 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006089 }
6090 return (PyObject*) u;
6091}
6092
Tim Petersced69f82003-09-16 20:30:58 +00006093static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006094int fixupper(PyUnicodeObject *self)
6095{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006096 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006097 Py_UNICODE *s = self->str;
6098 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006099
Guido van Rossumd57fd912000-03-10 22:53:23 +00006100 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006101 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006102
Benjamin Peterson29060642009-01-31 22:14:21 +00006103 ch = Py_UNICODE_TOUPPER(*s);
6104 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006105 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006106 *s = ch;
6107 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006108 s++;
6109 }
6110
6111 return status;
6112}
6113
Tim Petersced69f82003-09-16 20:30:58 +00006114static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006115int fixlower(PyUnicodeObject *self)
6116{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006117 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006118 Py_UNICODE *s = self->str;
6119 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006120
Guido van Rossumd57fd912000-03-10 22:53:23 +00006121 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006122 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006123
Benjamin Peterson29060642009-01-31 22:14:21 +00006124 ch = Py_UNICODE_TOLOWER(*s);
6125 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006126 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006127 *s = ch;
6128 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006129 s++;
6130 }
6131
6132 return status;
6133}
6134
Tim Petersced69f82003-09-16 20:30:58 +00006135static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006136int fixswapcase(PyUnicodeObject *self)
6137{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006138 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006139 Py_UNICODE *s = self->str;
6140 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006141
Guido van Rossumd57fd912000-03-10 22:53:23 +00006142 while (len-- > 0) {
6143 if (Py_UNICODE_ISUPPER(*s)) {
6144 *s = Py_UNICODE_TOLOWER(*s);
6145 status = 1;
6146 } else if (Py_UNICODE_ISLOWER(*s)) {
6147 *s = Py_UNICODE_TOUPPER(*s);
6148 status = 1;
6149 }
6150 s++;
6151 }
6152
6153 return status;
6154}
6155
Tim Petersced69f82003-09-16 20:30:58 +00006156static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006157int fixcapitalize(PyUnicodeObject *self)
6158{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006159 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006160 Py_UNICODE *s = self->str;
6161 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006162
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006163 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006164 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006165 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006166 *s = Py_UNICODE_TOUPPER(*s);
6167 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006168 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006169 s++;
6170 while (--len > 0) {
6171 if (Py_UNICODE_ISUPPER(*s)) {
6172 *s = Py_UNICODE_TOLOWER(*s);
6173 status = 1;
6174 }
6175 s++;
6176 }
6177 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006178}
6179
6180static
6181int fixtitle(PyUnicodeObject *self)
6182{
6183 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6184 register Py_UNICODE *e;
6185 int previous_is_cased;
6186
6187 /* Shortcut for single character strings */
6188 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006189 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6190 if (*p != ch) {
6191 *p = ch;
6192 return 1;
6193 }
6194 else
6195 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006196 }
Tim Petersced69f82003-09-16 20:30:58 +00006197
Guido van Rossumd57fd912000-03-10 22:53:23 +00006198 e = p + PyUnicode_GET_SIZE(self);
6199 previous_is_cased = 0;
6200 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006201 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006202
Benjamin Peterson29060642009-01-31 22:14:21 +00006203 if (previous_is_cased)
6204 *p = Py_UNICODE_TOLOWER(ch);
6205 else
6206 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006207
Benjamin Peterson29060642009-01-31 22:14:21 +00006208 if (Py_UNICODE_ISLOWER(ch) ||
6209 Py_UNICODE_ISUPPER(ch) ||
6210 Py_UNICODE_ISTITLE(ch))
6211 previous_is_cased = 1;
6212 else
6213 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006214 }
6215 return 1;
6216}
6217
Tim Peters8ce9f162004-08-27 01:49:32 +00006218PyObject *
6219PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006220{
Skip Montanaro6543b452004-09-16 03:28:13 +00006221 const Py_UNICODE blank = ' ';
6222 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006223 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006224 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006225 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6226 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006227 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6228 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006229 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006230 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006231
Tim Peters05eba1f2004-08-27 21:32:02 +00006232 fseq = PySequence_Fast(seq, "");
6233 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006234 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006235 }
6236
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006237 /* NOTE: the following code can't call back into Python code,
6238 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006239 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006240
Tim Peters05eba1f2004-08-27 21:32:02 +00006241 seqlen = PySequence_Fast_GET_SIZE(fseq);
6242 /* If empty sequence, return u"". */
6243 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006244 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6245 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006246 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006247 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006248 /* If singleton sequence with an exact Unicode, return that. */
6249 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006250 item = items[0];
6251 if (PyUnicode_CheckExact(item)) {
6252 Py_INCREF(item);
6253 res = (PyUnicodeObject *)item;
6254 goto Done;
6255 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006256 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006257 else {
6258 /* Set up sep and seplen */
6259 if (separator == NULL) {
6260 sep = &blank;
6261 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006262 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006263 else {
6264 if (!PyUnicode_Check(separator)) {
6265 PyErr_Format(PyExc_TypeError,
6266 "separator: expected str instance,"
6267 " %.80s found",
6268 Py_TYPE(separator)->tp_name);
6269 goto onError;
6270 }
6271 sep = PyUnicode_AS_UNICODE(separator);
6272 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006273 }
6274 }
6275
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006276 /* There are at least two things to join, or else we have a subclass
6277 * of str in the sequence.
6278 * Do a pre-pass to figure out the total amount of space we'll
6279 * need (sz), and see whether all argument are strings.
6280 */
6281 sz = 0;
6282 for (i = 0; i < seqlen; i++) {
6283 const Py_ssize_t old_sz = sz;
6284 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006285 if (!PyUnicode_Check(item)) {
6286 PyErr_Format(PyExc_TypeError,
6287 "sequence item %zd: expected str instance,"
6288 " %.80s found",
6289 i, Py_TYPE(item)->tp_name);
6290 goto onError;
6291 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006292 sz += PyUnicode_GET_SIZE(item);
6293 if (i != 0)
6294 sz += seplen;
6295 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6296 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006297 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006298 goto onError;
6299 }
6300 }
Tim Petersced69f82003-09-16 20:30:58 +00006301
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006302 res = _PyUnicode_New(sz);
6303 if (res == NULL)
6304 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006305
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006306 /* Catenate everything. */
6307 res_p = PyUnicode_AS_UNICODE(res);
6308 for (i = 0; i < seqlen; ++i) {
6309 Py_ssize_t itemlen;
6310 item = items[i];
6311 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006312 /* Copy item, and maybe the separator. */
6313 if (i) {
6314 Py_UNICODE_COPY(res_p, sep, seplen);
6315 res_p += seplen;
6316 }
6317 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6318 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006319 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006320
Benjamin Peterson29060642009-01-31 22:14:21 +00006321 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006322 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006323 return (PyObject *)res;
6324
Benjamin Peterson29060642009-01-31 22:14:21 +00006325 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006326 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006327 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006328 return NULL;
6329}
6330
Tim Petersced69f82003-09-16 20:30:58 +00006331static
6332PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006333 Py_ssize_t left,
6334 Py_ssize_t right,
6335 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006336{
6337 PyUnicodeObject *u;
6338
6339 if (left < 0)
6340 left = 0;
6341 if (right < 0)
6342 right = 0;
6343
Tim Peters7a29bd52001-09-12 03:03:31 +00006344 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006345 Py_INCREF(self);
6346 return self;
6347 }
6348
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006349 if (left > PY_SSIZE_T_MAX - self->length ||
6350 right > PY_SSIZE_T_MAX - (left + self->length)) {
6351 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6352 return NULL;
6353 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006354 u = _PyUnicode_New(left + self->length + right);
6355 if (u) {
6356 if (left)
6357 Py_UNICODE_FILL(u->str, fill, left);
6358 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6359 if (right)
6360 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6361 }
6362
6363 return u;
6364}
6365
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006366PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006367{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006368 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006369
6370 string = PyUnicode_FromObject(string);
6371 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006372 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006373
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006374 list = stringlib_splitlines(
6375 (PyObject*) string, PyUnicode_AS_UNICODE(string),
6376 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006377
6378 Py_DECREF(string);
6379 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006380}
6381
Tim Petersced69f82003-09-16 20:30:58 +00006382static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006383PyObject *split(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006384 PyUnicodeObject *substring,
6385 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006386{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006387 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006388 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006389
Guido van Rossumd57fd912000-03-10 22:53:23 +00006390 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006391 return stringlib_split_whitespace(
6392 (PyObject*) self, self->str, self->length, maxcount
6393 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006394
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006395 return stringlib_split(
6396 (PyObject*) self, self->str, self->length,
6397 substring->str, substring->length,
6398 maxcount
6399 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006400}
6401
Tim Petersced69f82003-09-16 20:30:58 +00006402static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006403PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006404 PyUnicodeObject *substring,
6405 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006406{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006407 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006408 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006409
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006410 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006411 return stringlib_rsplit_whitespace(
6412 (PyObject*) self, self->str, self->length, maxcount
6413 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006414
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006415 return stringlib_rsplit(
6416 (PyObject*) self, self->str, self->length,
6417 substring->str, substring->length,
6418 maxcount
6419 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006420}
6421
6422static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006423PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006424 PyUnicodeObject *str1,
6425 PyUnicodeObject *str2,
6426 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006427{
6428 PyUnicodeObject *u;
6429
6430 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006431 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006432 else if (maxcount == 0 || self->length == 0)
6433 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006434
Thomas Wouters477c8d52006-05-27 19:21:47 +00006435 if (str1->length == str2->length) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00006436 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006437 /* same length */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006438 if (str1->length == 0)
6439 goto nothing;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006440 if (str1->length == 1) {
6441 /* replace characters */
6442 Py_UNICODE u1, u2;
6443 if (!findchar(self->str, self->length, str1->str[0]))
6444 goto nothing;
6445 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6446 if (!u)
6447 return NULL;
6448 Py_UNICODE_COPY(u->str, self->str, self->length);
6449 u1 = str1->str[0];
6450 u2 = str2->str[0];
6451 for (i = 0; i < u->length; i++)
6452 if (u->str[i] == u1) {
6453 if (--maxcount < 0)
6454 break;
6455 u->str[i] = u2;
6456 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006457 } else {
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006458 i = stringlib_find(
6459 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00006460 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006461 if (i < 0)
6462 goto nothing;
6463 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6464 if (!u)
6465 return NULL;
6466 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006467
6468 /* change everything in-place, starting with this one */
6469 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6470 i += str1->length;
6471
6472 while ( --maxcount > 0) {
6473 i = stringlib_find(self->str+i, self->length-i,
6474 str1->str, str1->length,
6475 i);
6476 if (i == -1)
6477 break;
6478 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6479 i += str1->length;
6480 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006481 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006482 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006483
6484 Py_ssize_t n, i, j, e;
6485 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006486 Py_UNICODE *p;
6487
6488 /* replace strings */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006489 n = stringlib_count(self->str, self->length, str1->str, str1->length,
6490 maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006491 if (n == 0)
6492 goto nothing;
6493 /* new_size = self->length + n * (str2->length - str1->length)); */
6494 delta = (str2->length - str1->length);
6495 if (delta == 0) {
6496 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006497 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006498 product = n * (str2->length - str1->length);
6499 if ((product / (str2->length - str1->length)) != n) {
6500 PyErr_SetString(PyExc_OverflowError,
6501 "replace string is too long");
6502 return NULL;
6503 }
6504 new_size = self->length + product;
6505 if (new_size < 0) {
6506 PyErr_SetString(PyExc_OverflowError,
6507 "replace string is too long");
6508 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006509 }
6510 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006511 u = _PyUnicode_New(new_size);
6512 if (!u)
6513 return NULL;
6514 i = 0;
6515 p = u->str;
6516 e = self->length - str1->length;
6517 if (str1->length > 0) {
6518 while (n-- > 0) {
6519 /* look for next match */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006520 j = stringlib_find(self->str+i, self->length-i,
6521 str1->str, str1->length,
6522 i);
6523 if (j == -1)
6524 break;
6525 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006526 /* copy unchanged part [i:j] */
6527 Py_UNICODE_COPY(p, self->str+i, j-i);
6528 p += j - i;
6529 }
6530 /* copy substitution string */
6531 if (str2->length > 0) {
6532 Py_UNICODE_COPY(p, str2->str, str2->length);
6533 p += str2->length;
6534 }
6535 i = j + str1->length;
6536 }
6537 if (i < self->length)
6538 /* copy tail [i:] */
6539 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6540 } else {
6541 /* interleave */
6542 while (n > 0) {
6543 Py_UNICODE_COPY(p, str2->str, str2->length);
6544 p += str2->length;
6545 if (--n <= 0)
6546 break;
6547 *p++ = self->str[i++];
6548 }
6549 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6550 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006551 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006552 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006553
Benjamin Peterson29060642009-01-31 22:14:21 +00006554 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00006555 /* nothing to replace; return original string (when possible) */
6556 if (PyUnicode_CheckExact(self)) {
6557 Py_INCREF(self);
6558 return (PyObject *) self;
6559 }
6560 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006561}
6562
6563/* --- Unicode Object Methods --------------------------------------------- */
6564
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006565PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006566 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006567\n\
6568Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006569characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006570
6571static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006572unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006573{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006574 return fixup(self, fixtitle);
6575}
6576
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006577PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006578 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006579\n\
6580Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006581have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006582
6583static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006584unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006585{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006586 return fixup(self, fixcapitalize);
6587}
6588
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the surrounding #if 0).
   Splits self on whitespace, replaces each word in the list by its
   fixcapitalize'd version and joins the words with the default
   separator of PyUnicode_Join().  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capped;

        capped = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                       fixcapitalize);
        if (capped == NULL)
            goto done;
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capped);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

  done:
    Py_DECREF(words);
    return result;
}
#endif
6626
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006627/* Argument converter. Coerces to a single unicode character */
6628
6629static int
6630convert_uc(PyObject *obj, void *addr)
6631{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006632 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6633 PyObject *uniobj;
6634 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006635
Benjamin Peterson14339b62009-01-31 16:36:08 +00006636 uniobj = PyUnicode_FromObject(obj);
6637 if (uniobj == NULL) {
6638 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006639 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006640 return 0;
6641 }
6642 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6643 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006644 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006645 Py_DECREF(uniobj);
6646 return 0;
6647 }
6648 unistr = PyUnicode_AS_UNICODE(uniobj);
6649 *fillcharloc = unistr[0];
6650 Py_DECREF(uniobj);
6651 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006652}
6653
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006654PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006655 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006656\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006657Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006658done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006659
6660static PyObject *
6661unicode_center(PyUnicodeObject *self, PyObject *args)
6662{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006663 Py_ssize_t marg, left;
6664 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006665 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006666
Thomas Woutersde017742006-02-16 19:34:37 +00006667 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006668 return NULL;
6669
Tim Peters7a29bd52001-09-12 03:03:31 +00006670 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006671 Py_INCREF(self);
6672 return (PyObject*) self;
6673 }
6674
6675 marg = width - self->length;
6676 left = marg / 2 + (marg & width & 1);
6677
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006678 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006679}
6680
Marc-André Lemburge5034372000-08-08 08:04:29 +00006681#if 0
6682
6683/* This code should go into some future Unicode collation support
6684 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00006685 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006686
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006687/* speedy UTF-16 code point order comparison */
6688/* gleaned from: */
6689/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6690
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006691static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006692{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006693 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006694 0, 0, 0, 0, 0, 0, 0, 0,
6695 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006696 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006697};
6698
Guido van Rossumd57fd912000-03-10 22:53:23 +00006699static int
6700unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6701{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006702 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006703
Guido van Rossumd57fd912000-03-10 22:53:23 +00006704 Py_UNICODE *s1 = str1->str;
6705 Py_UNICODE *s2 = str2->str;
6706
6707 len1 = str1->length;
6708 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006709
Guido van Rossumd57fd912000-03-10 22:53:23 +00006710 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006711 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006712
6713 c1 = *s1++;
6714 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006715
Benjamin Peterson29060642009-01-31 22:14:21 +00006716 if (c1 > (1<<11) * 26)
6717 c1 += utf16Fixup[c1>>11];
6718 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006719 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006720 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006721
6722 if (c1 != c2)
6723 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006724
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006725 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006726 }
6727
6728 return (len1 < len2) ? -1 : (len1 != len2);
6729}
6730
Marc-André Lemburge5034372000-08-08 08:04:29 +00006731#else
6732
6733static int
6734unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6735{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006736 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006737
6738 Py_UNICODE *s1 = str1->str;
6739 Py_UNICODE *s2 = str2->str;
6740
6741 len1 = str1->length;
6742 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006743
Marc-André Lemburge5034372000-08-08 08:04:29 +00006744 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006745 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006746
Fredrik Lundh45714e92001-06-26 16:39:36 +00006747 c1 = *s1++;
6748 c2 = *s2++;
6749
6750 if (c1 != c2)
6751 return (c1 < c2) ? -1 : 1;
6752
Marc-André Lemburge5034372000-08-08 08:04:29 +00006753 len1--; len2--;
6754 }
6755
6756 return (len1 < len2) ? -1 : (len1 != len2);
6757}
6758
6759#endif
6760
Guido van Rossumd57fd912000-03-10 22:53:23 +00006761int PyUnicode_Compare(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00006762 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006763{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006764 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6765 return unicode_compare((PyUnicodeObject *)left,
6766 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006767 PyErr_Format(PyExc_TypeError,
6768 "Can't compare %.100s and %.100s",
6769 left->ob_type->tp_name,
6770 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006771 return -1;
6772}
6773
Martin v. Löwis5b222132007-06-10 09:51:05 +00006774int
6775PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6776{
6777 int i;
6778 Py_UNICODE *id;
6779 assert(PyUnicode_Check(uni));
6780 id = PyUnicode_AS_UNICODE(uni);
6781 /* Compare Unicode string and source character set string */
6782 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00006783 if (id[i] != str[i])
6784 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00006785 /* This check keeps Python strings that end in '\0' from comparing equal
6786 to C strings identical up to that point. */
6787 if (PyUnicode_GET_SIZE(uni) != i)
6788 /* We'll say the Python string is longer. */
6789 return 1;
Martin v. Löwis5b222132007-06-10 09:51:05 +00006790 if (id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006791 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006792 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006793 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006794 return 0;
6795}
6796
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006797
Benjamin Peterson29060642009-01-31 22:14:21 +00006798#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00006799 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006800
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006801PyObject *PyUnicode_RichCompare(PyObject *left,
6802 PyObject *right,
6803 int op)
6804{
6805 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006806
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006807 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
6808 PyObject *v;
6809 if (((PyUnicodeObject *) left)->length !=
6810 ((PyUnicodeObject *) right)->length) {
6811 if (op == Py_EQ) {
6812 Py_INCREF(Py_False);
6813 return Py_False;
6814 }
6815 if (op == Py_NE) {
6816 Py_INCREF(Py_True);
6817 return Py_True;
6818 }
6819 }
6820 if (left == right)
6821 result = 0;
6822 else
6823 result = unicode_compare((PyUnicodeObject *)left,
6824 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006825
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006826 /* Convert the return value to a Boolean */
6827 switch (op) {
6828 case Py_EQ:
6829 v = TEST_COND(result == 0);
6830 break;
6831 case Py_NE:
6832 v = TEST_COND(result != 0);
6833 break;
6834 case Py_LE:
6835 v = TEST_COND(result <= 0);
6836 break;
6837 case Py_GE:
6838 v = TEST_COND(result >= 0);
6839 break;
6840 case Py_LT:
6841 v = TEST_COND(result == -1);
6842 break;
6843 case Py_GT:
6844 v = TEST_COND(result == 1);
6845 break;
6846 default:
6847 PyErr_BadArgument();
6848 return NULL;
6849 }
6850 Py_INCREF(v);
6851 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006852 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006853
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006854 Py_INCREF(Py_NotImplemented);
6855 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006856}
6857
Guido van Rossum403d68b2000-03-13 15:55:09 +00006858int PyUnicode_Contains(PyObject *container,
Benjamin Peterson29060642009-01-31 22:14:21 +00006859 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00006860{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006861 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006862 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006863
6864 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006865 sub = PyUnicode_FromObject(element);
6866 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006867 PyErr_Format(PyExc_TypeError,
6868 "'in <string>' requires string as left operand, not %s",
6869 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006870 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006871 }
6872
Thomas Wouters477c8d52006-05-27 19:21:47 +00006873 str = PyUnicode_FromObject(container);
6874 if (!str) {
6875 Py_DECREF(sub);
6876 return -1;
6877 }
6878
6879 result = stringlib_contains_obj(str, sub);
6880
6881 Py_DECREF(str);
6882 Py_DECREF(sub);
6883
Guido van Rossum403d68b2000-03-13 15:55:09 +00006884 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006885}
6886
Guido van Rossumd57fd912000-03-10 22:53:23 +00006887/* Concat to string or Unicode object giving a new Unicode object. */
6888
6889PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00006890 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006891{
6892 PyUnicodeObject *u = NULL, *v = NULL, *w;
6893
6894 /* Coerce the two arguments */
6895 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6896 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006897 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006898 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6899 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006900 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006901
6902 /* Shortcuts */
6903 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006904 Py_DECREF(v);
6905 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006906 }
6907 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006908 Py_DECREF(u);
6909 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006910 }
6911
6912 /* Concat the two Unicode strings */
6913 w = _PyUnicode_New(u->length + v->length);
6914 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006915 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006916 Py_UNICODE_COPY(w->str, u->str, u->length);
6917 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6918
6919 Py_DECREF(u);
6920 Py_DECREF(v);
6921 return (PyObject *)w;
6922
Benjamin Peterson29060642009-01-31 22:14:21 +00006923 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006924 Py_XDECREF(u);
6925 Py_XDECREF(v);
6926 return NULL;
6927}
6928
Walter Dörwald1ab83302007-05-18 17:15:44 +00006929void
6930PyUnicode_Append(PyObject **pleft, PyObject *right)
6931{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006932 PyObject *new;
6933 if (*pleft == NULL)
6934 return;
6935 if (right == NULL || !PyUnicode_Check(*pleft)) {
6936 Py_DECREF(*pleft);
6937 *pleft = NULL;
6938 return;
6939 }
6940 new = PyUnicode_Concat(*pleft, right);
6941 Py_DECREF(*pleft);
6942 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00006943}
6944
6945void
6946PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6947{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006948 PyUnicode_Append(pleft, right);
6949 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00006950}
6951
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006952PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006953 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006954\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006955Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006956string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006957interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006958
6959static PyObject *
6960unicode_count(PyUnicodeObject *self, PyObject *args)
6961{
6962 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006963 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006964 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006965 PyObject *result;
6966
Guido van Rossumb8872e62000-05-09 14:14:27 +00006967 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson29060642009-01-31 22:14:21 +00006968 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006969 return NULL;
6970
6971 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006972 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006973 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006974 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006975
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006976 ADJUST_INDICES(start, end, self->length);
Christian Heimes217cfd12007-12-02 14:31:20 +00006977 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006978 stringlib_count(self->str + start, end - start,
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006979 substring->str, substring->length,
6980 PY_SSIZE_T_MAX)
Thomas Wouters477c8d52006-05-27 19:21:47 +00006981 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006982
6983 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006984
Guido van Rossumd57fd912000-03-10 22:53:23 +00006985 return result;
6986}
6987
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006988PyDoc_STRVAR(encode__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006989 "S.encode([encoding[, errors]]) -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006990\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00006991Encode S using the codec registered for encoding. encoding defaults\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006992to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006993handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006994a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6995'xmlcharrefreplace' as well as any other name registered with\n\
6996codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006997
6998static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00006999unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007000{
Benjamin Peterson308d6372009-09-18 21:42:35 +00007001 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00007002 char *encoding = NULL;
7003 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007004 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00007005
Benjamin Peterson308d6372009-09-18 21:42:35 +00007006 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
7007 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007008 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00007009 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007010 if (v == NULL)
7011 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00007012 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007013 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007014 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007015 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00007016 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007017 Py_DECREF(v);
7018 return NULL;
7019 }
7020 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007021
Benjamin Peterson29060642009-01-31 22:14:21 +00007022 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007023 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007024}
7025
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007026PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007027 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007028\n\
7029Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007030If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007031
7032static PyObject*
7033unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7034{
7035 Py_UNICODE *e;
7036 Py_UNICODE *p;
7037 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007038 Py_UNICODE *qe;
7039 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007040 PyUnicodeObject *u;
7041 int tabsize = 8;
7042
7043 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007044 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007045
Thomas Wouters7e474022000-07-16 12:04:32 +00007046 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007047 i = 0; /* chars up to and including most recent \n or \r */
7048 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7049 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007050 for (p = self->str; p < e; p++)
7051 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007052 if (tabsize > 0) {
7053 incr = tabsize - (j % tabsize); /* cannot overflow */
7054 if (j > PY_SSIZE_T_MAX - incr)
7055 goto overflow1;
7056 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007057 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007058 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007059 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007060 if (j > PY_SSIZE_T_MAX - 1)
7061 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007062 j++;
7063 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007064 if (i > PY_SSIZE_T_MAX - j)
7065 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007066 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007067 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007068 }
7069 }
7070
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007071 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007072 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007073
Guido van Rossumd57fd912000-03-10 22:53:23 +00007074 /* Second pass: create output string and fill it */
7075 u = _PyUnicode_New(i + j);
7076 if (!u)
7077 return NULL;
7078
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007079 j = 0; /* same as in first pass */
7080 q = u->str; /* next output char */
7081 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007082
7083 for (p = self->str; p < e; p++)
7084 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007085 if (tabsize > 0) {
7086 i = tabsize - (j % tabsize);
7087 j += i;
7088 while (i--) {
7089 if (q >= qe)
7090 goto overflow2;
7091 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007092 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007093 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007094 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007095 else {
7096 if (q >= qe)
7097 goto overflow2;
7098 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007099 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007100 if (*p == '\n' || *p == '\r')
7101 j = 0;
7102 }
7103
7104 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007105
7106 overflow2:
7107 Py_DECREF(u);
7108 overflow1:
7109 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7110 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007111}
7112
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007113PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007114 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007115\n\
7116Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007117such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007118arguments start and end are interpreted as in slice notation.\n\
7119\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007120Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007121
7122static PyObject *
7123unicode_find(PyUnicodeObject *self, PyObject *args)
7124{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007125 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007126 Py_ssize_t start;
7127 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007128 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007129
Christian Heimes9cd17752007-11-18 19:35:23 +00007130 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007131 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007132
Thomas Wouters477c8d52006-05-27 19:21:47 +00007133 result = stringlib_find_slice(
7134 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7135 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7136 start, end
7137 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007138
7139 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007140
Christian Heimes217cfd12007-12-02 14:31:20 +00007141 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007142}
7143
7144static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007145unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007146{
7147 if (index < 0 || index >= self->length) {
7148 PyErr_SetString(PyExc_IndexError, "string index out of range");
7149 return NULL;
7150 }
7151
7152 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7153}
7154
Guido van Rossumc2504932007-09-18 19:42:40 +00007155/* Believe it or not, this produces the same value for ASCII strings
7156 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007157static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007158unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007159{
Guido van Rossumc2504932007-09-18 19:42:40 +00007160 Py_ssize_t len;
7161 Py_UNICODE *p;
7162 long x;
7163
7164 if (self->hash != -1)
7165 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007166 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007167 p = self->str;
7168 x = *p << 7;
7169 while (--len >= 0)
7170 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007171 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007172 if (x == -1)
7173 x = -2;
7174 self->hash = x;
7175 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007176}
7177
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007178PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007179 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007180\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007181Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007182
7183static PyObject *
7184unicode_index(PyUnicodeObject *self, PyObject *args)
7185{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007186 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007187 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007188 Py_ssize_t start;
7189 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007190
Christian Heimes9cd17752007-11-18 19:35:23 +00007191 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007192 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007193
Thomas Wouters477c8d52006-05-27 19:21:47 +00007194 result = stringlib_find_slice(
7195 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7196 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7197 start, end
7198 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007199
7200 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007201
Guido van Rossumd57fd912000-03-10 22:53:23 +00007202 if (result < 0) {
7203 PyErr_SetString(PyExc_ValueError, "substring not found");
7204 return NULL;
7205 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007206
Christian Heimes217cfd12007-12-02 14:31:20 +00007207 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007208}
7209
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007210PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007211 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007212\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007213Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007214at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007215
7216static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007217unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007218{
7219 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7220 register const Py_UNICODE *e;
7221 int cased;
7222
Guido van Rossumd57fd912000-03-10 22:53:23 +00007223 /* Shortcut for single character strings */
7224 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007225 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007226
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007227 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007228 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007229 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007230
Guido van Rossumd57fd912000-03-10 22:53:23 +00007231 e = p + PyUnicode_GET_SIZE(self);
7232 cased = 0;
7233 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007234 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007235
Benjamin Peterson29060642009-01-31 22:14:21 +00007236 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7237 return PyBool_FromLong(0);
7238 else if (!cased && Py_UNICODE_ISLOWER(ch))
7239 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007240 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007241 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007242}
7243
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007244PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007245 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007246\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007247Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007248at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007249
7250static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007251unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007252{
7253 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7254 register const Py_UNICODE *e;
7255 int cased;
7256
Guido van Rossumd57fd912000-03-10 22:53:23 +00007257 /* Shortcut for single character strings */
7258 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007259 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007260
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007261 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007262 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007263 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007264
Guido van Rossumd57fd912000-03-10 22:53:23 +00007265 e = p + PyUnicode_GET_SIZE(self);
7266 cased = 0;
7267 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007268 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007269
Benjamin Peterson29060642009-01-31 22:14:21 +00007270 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7271 return PyBool_FromLong(0);
7272 else if (!cased && Py_UNICODE_ISUPPER(ch))
7273 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007274 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007275 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007276}
7277
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007278PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007279 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007280\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007281Return True if S is a titlecased string and there is at least one\n\
7282character in S, i.e. upper- and titlecase characters may only\n\
7283follow uncased characters and lowercase characters only cased ones.\n\
7284Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007285
7286static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007287unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007288{
7289 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7290 register const Py_UNICODE *e;
7291 int cased, previous_is_cased;
7292
Guido van Rossumd57fd912000-03-10 22:53:23 +00007293 /* Shortcut for single character strings */
7294 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007295 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7296 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007297
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007298 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007299 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007300 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007301
Guido van Rossumd57fd912000-03-10 22:53:23 +00007302 e = p + PyUnicode_GET_SIZE(self);
7303 cased = 0;
7304 previous_is_cased = 0;
7305 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007306 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007307
Benjamin Peterson29060642009-01-31 22:14:21 +00007308 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7309 if (previous_is_cased)
7310 return PyBool_FromLong(0);
7311 previous_is_cased = 1;
7312 cased = 1;
7313 }
7314 else if (Py_UNICODE_ISLOWER(ch)) {
7315 if (!previous_is_cased)
7316 return PyBool_FromLong(0);
7317 previous_is_cased = 1;
7318 cased = 1;
7319 }
7320 else
7321 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007322 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007323 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007324}
7325
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007326PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007327 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007328\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007329Return True if all characters in S are whitespace\n\
7330and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007331
7332static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007333unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007334{
7335 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7336 register const Py_UNICODE *e;
7337
Guido van Rossumd57fd912000-03-10 22:53:23 +00007338 /* Shortcut for single character strings */
7339 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007340 Py_UNICODE_ISSPACE(*p))
7341 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007342
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007343 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007344 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007345 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007346
Guido van Rossumd57fd912000-03-10 22:53:23 +00007347 e = p + PyUnicode_GET_SIZE(self);
7348 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007349 if (!Py_UNICODE_ISSPACE(*p))
7350 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007351 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007352 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007353}
7354
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007355PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007356 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007357\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007358Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007359and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007360
7361static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007362unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007363{
7364 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7365 register const Py_UNICODE *e;
7366
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007367 /* Shortcut for single character strings */
7368 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007369 Py_UNICODE_ISALPHA(*p))
7370 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007371
7372 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007373 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007374 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007375
7376 e = p + PyUnicode_GET_SIZE(self);
7377 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007378 if (!Py_UNICODE_ISALPHA(*p))
7379 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007380 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007381 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007382}
7383
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007384PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007385 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007386\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007387Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007388and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007389
7390static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007391unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007392{
7393 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7394 register const Py_UNICODE *e;
7395
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007396 /* Shortcut for single character strings */
7397 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007398 Py_UNICODE_ISALNUM(*p))
7399 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007400
7401 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007402 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007403 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007404
7405 e = p + PyUnicode_GET_SIZE(self);
7406 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007407 if (!Py_UNICODE_ISALNUM(*p))
7408 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007409 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007410 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007411}
7412
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007413PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007414 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007415\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007416Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007417False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007418
7419static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007420unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007421{
7422 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7423 register const Py_UNICODE *e;
7424
Guido van Rossumd57fd912000-03-10 22:53:23 +00007425 /* Shortcut for single character strings */
7426 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007427 Py_UNICODE_ISDECIMAL(*p))
7428 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007429
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007430 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007431 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007432 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007433
Guido van Rossumd57fd912000-03-10 22:53:23 +00007434 e = p + PyUnicode_GET_SIZE(self);
7435 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007436 if (!Py_UNICODE_ISDECIMAL(*p))
7437 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007438 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007439 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007440}
7441
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007442PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007443 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007444\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007445Return True if all characters in S are digits\n\
7446and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007447
7448static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007449unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007450{
7451 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7452 register const Py_UNICODE *e;
7453
Guido van Rossumd57fd912000-03-10 22:53:23 +00007454 /* Shortcut for single character strings */
7455 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007456 Py_UNICODE_ISDIGIT(*p))
7457 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007458
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007459 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007460 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007461 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007462
Guido van Rossumd57fd912000-03-10 22:53:23 +00007463 e = p + PyUnicode_GET_SIZE(self);
7464 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007465 if (!Py_UNICODE_ISDIGIT(*p))
7466 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007467 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007468 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007469}
7470
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007471PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007472 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007473\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007474Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007475False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007476
7477static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007478unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007479{
7480 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7481 register const Py_UNICODE *e;
7482
Guido van Rossumd57fd912000-03-10 22:53:23 +00007483 /* Shortcut for single character strings */
7484 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007485 Py_UNICODE_ISNUMERIC(*p))
7486 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007487
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007488 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007489 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007490 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007491
Guido van Rossumd57fd912000-03-10 22:53:23 +00007492 e = p + PyUnicode_GET_SIZE(self);
7493 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007494 if (!Py_UNICODE_ISNUMERIC(*p))
7495 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007496 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007497 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007498}
7499
Martin v. Löwis47383402007-08-15 07:32:56 +00007500int
7501PyUnicode_IsIdentifier(PyObject *self)
7502{
7503 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7504 register const Py_UNICODE *e;
7505
7506 /* Special case for empty strings */
7507 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007508 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007509
7510 /* PEP 3131 says that the first character must be in
7511 XID_Start and subsequent characters in XID_Continue,
7512 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00007513 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00007514 letters, digits, underscore). However, given the current
7515 definition of XID_Start and XID_Continue, it is sufficient
7516 to check just for these, except that _ must be allowed
7517 as starting an identifier. */
7518 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7519 return 0;
7520
7521 e = p + PyUnicode_GET_SIZE(self);
7522 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007523 if (!_PyUnicode_IsXidContinue(*p))
7524 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007525 }
7526 return 1;
7527}
7528
7529PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007530 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00007531\n\
7532Return True if S is a valid identifier according\n\
7533to the language definition.");
7534
7535static PyObject*
7536unicode_isidentifier(PyObject *self)
7537{
7538 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7539}
7540
Georg Brandl559e5d72008-06-11 18:37:52 +00007541PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007542 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00007543\n\
7544Return True if all characters in S are considered\n\
7545printable in repr() or S is empty, False otherwise.");
7546
7547static PyObject*
7548unicode_isprintable(PyObject *self)
7549{
7550 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7551 register const Py_UNICODE *e;
7552
7553 /* Shortcut for single character strings */
7554 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7555 Py_RETURN_TRUE;
7556 }
7557
7558 e = p + PyUnicode_GET_SIZE(self);
7559 for (; p < e; p++) {
7560 if (!Py_UNICODE_ISPRINTABLE(*p)) {
7561 Py_RETURN_FALSE;
7562 }
7563 }
7564 Py_RETURN_TRUE;
7565}
7566
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007567PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +00007568 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007569\n\
7570Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +00007571iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007572
7573static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007574unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007575{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007576 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007577}
7578
Martin v. Löwis18e16552006-02-15 17:27:45 +00007579static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007580unicode_length(PyUnicodeObject *self)
7581{
7582 return self->length;
7583}
7584
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007585PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007586 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007587\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007588Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007589done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007590
7591static PyObject *
7592unicode_ljust(PyUnicodeObject *self, PyObject *args)
7593{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007594 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007595 Py_UNICODE fillchar = ' ';
7596
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007597 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007598 return NULL;
7599
Tim Peters7a29bd52001-09-12 03:03:31 +00007600 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007601 Py_INCREF(self);
7602 return (PyObject*) self;
7603 }
7604
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007605 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007606}
7607
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007608PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007609 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007610\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007611Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007612
7613static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007614unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007615{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007616 return fixup(self, fixlower);
7617}
7618
/* Strip modes; the values double as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by strip mode. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string minus its 3-character
   "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
7627
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007628/* externally visible for str.strip(unicode) */
7629PyObject *
7630_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7631{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007632 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7633 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7634 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7635 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7636 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007637
Benjamin Peterson29060642009-01-31 22:14:21 +00007638 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007639
Benjamin Peterson14339b62009-01-31 16:36:08 +00007640 i = 0;
7641 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007642 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7643 i++;
7644 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007645 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007646
Benjamin Peterson14339b62009-01-31 16:36:08 +00007647 j = len;
7648 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007649 do {
7650 j--;
7651 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7652 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007653 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007654
Benjamin Peterson14339b62009-01-31 16:36:08 +00007655 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007656 Py_INCREF(self);
7657 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007658 }
7659 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007660 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007661}
7662
Guido van Rossumd57fd912000-03-10 22:53:23 +00007663
7664static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007665do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007666{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007667 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7668 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007669
Benjamin Peterson14339b62009-01-31 16:36:08 +00007670 i = 0;
7671 if (striptype != RIGHTSTRIP) {
7672 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7673 i++;
7674 }
7675 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007676
Benjamin Peterson14339b62009-01-31 16:36:08 +00007677 j = len;
7678 if (striptype != LEFTSTRIP) {
7679 do {
7680 j--;
7681 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7682 j++;
7683 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007684
Benjamin Peterson14339b62009-01-31 16:36:08 +00007685 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7686 Py_INCREF(self);
7687 return (PyObject*)self;
7688 }
7689 else
7690 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007691}
7692
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007693
7694static PyObject *
7695do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7696{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007697 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007698
Benjamin Peterson14339b62009-01-31 16:36:08 +00007699 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7700 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007701
Benjamin Peterson14339b62009-01-31 16:36:08 +00007702 if (sep != NULL && sep != Py_None) {
7703 if (PyUnicode_Check(sep))
7704 return _PyUnicode_XStrip(self, striptype, sep);
7705 else {
7706 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007707 "%s arg must be None or str",
7708 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00007709 return NULL;
7710 }
7711 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007712
Benjamin Peterson14339b62009-01-31 16:36:08 +00007713 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007714}
7715
7716
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007717PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007718 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007719\n\
7720Return a copy of the string S with leading and trailing\n\
7721whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007722If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007723
7724static PyObject *
7725unicode_strip(PyUnicodeObject *self, PyObject *args)
7726{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007727 if (PyTuple_GET_SIZE(args) == 0)
7728 return do_strip(self, BOTHSTRIP); /* Common case */
7729 else
7730 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007731}
7732
7733
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007734PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007735 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007736\n\
7737Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007738If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007739
7740static PyObject *
7741unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7742{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007743 if (PyTuple_GET_SIZE(args) == 0)
7744 return do_strip(self, LEFTSTRIP); /* Common case */
7745 else
7746 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007747}
7748
7749
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007750PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007751 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007752\n\
7753Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007754If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007755
7756static PyObject *
7757unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7758{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007759 if (PyTuple_GET_SIZE(args) == 0)
7760 return do_strip(self, RIGHTSTRIP); /* Common case */
7761 else
7762 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007763}
7764
7765
Guido van Rossumd57fd912000-03-10 22:53:23 +00007766static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007767unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007768{
7769 PyUnicodeObject *u;
7770 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007771 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007772 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007773
Georg Brandl222de0f2009-04-12 12:01:50 +00007774 if (len < 1) {
7775 Py_INCREF(unicode_empty);
7776 return (PyObject *)unicode_empty;
7777 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007778
Tim Peters7a29bd52001-09-12 03:03:31 +00007779 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007780 /* no repeat, return original string */
7781 Py_INCREF(str);
7782 return (PyObject*) str;
7783 }
Tim Peters8f422462000-09-09 06:13:41 +00007784
7785 /* ensure # of chars needed doesn't overflow int and # of bytes
7786 * needed doesn't overflow size_t
7787 */
7788 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00007789 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00007790 PyErr_SetString(PyExc_OverflowError,
7791 "repeated string is too long");
7792 return NULL;
7793 }
7794 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7795 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7796 PyErr_SetString(PyExc_OverflowError,
7797 "repeated string is too long");
7798 return NULL;
7799 }
7800 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007801 if (!u)
7802 return NULL;
7803
7804 p = u->str;
7805
Georg Brandl222de0f2009-04-12 12:01:50 +00007806 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007807 Py_UNICODE_FILL(p, str->str[0], len);
7808 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00007809 Py_ssize_t done = str->length; /* number of characters copied this far */
7810 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00007811 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00007812 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007813 Py_UNICODE_COPY(p+done, p, n);
7814 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00007815 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007816 }
7817
7818 return (PyObject*) u;
7819}
7820
7821PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007822 PyObject *subobj,
7823 PyObject *replobj,
7824 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007825{
7826 PyObject *self;
7827 PyObject *str1;
7828 PyObject *str2;
7829 PyObject *result;
7830
7831 self = PyUnicode_FromObject(obj);
7832 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007833 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007834 str1 = PyUnicode_FromObject(subobj);
7835 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007836 Py_DECREF(self);
7837 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007838 }
7839 str2 = PyUnicode_FromObject(replobj);
7840 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007841 Py_DECREF(self);
7842 Py_DECREF(str1);
7843 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007844 }
Tim Petersced69f82003-09-16 20:30:58 +00007845 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00007846 (PyUnicodeObject *)str1,
7847 (PyUnicodeObject *)str2,
7848 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007849 Py_DECREF(self);
7850 Py_DECREF(str1);
7851 Py_DECREF(str2);
7852 return result;
7853}
7854
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007855PyDoc_STRVAR(replace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007856 "S.replace (old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007857\n\
7858Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00007859old replaced by new. If the optional argument count is\n\
7860given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007861
7862static PyObject*
7863unicode_replace(PyUnicodeObject *self, PyObject *args)
7864{
7865 PyUnicodeObject *str1;
7866 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007867 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007868 PyObject *result;
7869
Martin v. Löwis18e16552006-02-15 17:27:45 +00007870 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007871 return NULL;
7872 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7873 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007874 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007875 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007876 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007877 Py_DECREF(str1);
7878 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007879 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007880
7881 result = replace(self, str1, str2, maxcount);
7882
7883 Py_DECREF(str1);
7884 Py_DECREF(str2);
7885 return result;
7886}
7887
7888static
7889PyObject *unicode_repr(PyObject *unicode)
7890{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007891 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007892 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007893 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7894 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7895
7896 /* XXX(nnorwitz): rather than over-allocating, it would be
7897 better to choose a different scheme. Perhaps scan the
7898 first N-chars of the string and allocate based on that size.
7899 */
7900 /* Initial allocation is based on the longest-possible unichr
7901 escape.
7902
7903 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7904 unichr, so in this case it's the longest unichr escape. In
7905 narrow (UTF-16) builds this is five chars per source unichr
7906 since there are two unichrs in the surrogate pair, so in narrow
7907 (UTF-16) builds it's not the longest unichr escape.
7908
7909 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7910 so in the narrow (UTF-16) build case it's the longest unichr
7911 escape.
7912 */
7913
Walter Dörwald1ab83302007-05-18 17:15:44 +00007914 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00007915 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00007916#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00007917 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00007918#else
Benjamin Peterson29060642009-01-31 22:14:21 +00007919 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00007920#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00007921 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007922 if (repr == NULL)
7923 return NULL;
7924
Walter Dörwald1ab83302007-05-18 17:15:44 +00007925 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007926
7927 /* Add quote */
7928 *p++ = (findchar(s, size, '\'') &&
7929 !findchar(s, size, '"')) ? '"' : '\'';
7930 while (size-- > 0) {
7931 Py_UNICODE ch = *s++;
7932
7933 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007934 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007935 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007936 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007937 continue;
7938 }
7939
Benjamin Peterson29060642009-01-31 22:14:21 +00007940 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00007941 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007942 *p++ = '\\';
7943 *p++ = 't';
7944 }
7945 else if (ch == '\n') {
7946 *p++ = '\\';
7947 *p++ = 'n';
7948 }
7949 else if (ch == '\r') {
7950 *p++ = '\\';
7951 *p++ = 'r';
7952 }
7953
7954 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00007955 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007956 *p++ = '\\';
7957 *p++ = 'x';
7958 *p++ = hexdigits[(ch >> 4) & 0x000F];
7959 *p++ = hexdigits[ch & 0x000F];
7960 }
7961
Georg Brandl559e5d72008-06-11 18:37:52 +00007962 /* Copy ASCII characters as-is */
7963 else if (ch < 0x7F) {
7964 *p++ = ch;
7965 }
7966
Benjamin Peterson29060642009-01-31 22:14:21 +00007967 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00007968 else {
7969 Py_UCS4 ucs = ch;
7970
7971#ifndef Py_UNICODE_WIDE
7972 Py_UNICODE ch2 = 0;
7973 /* Get code point from surrogate pair */
7974 if (size > 0) {
7975 ch2 = *s;
7976 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00007977 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007978 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00007979 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007980 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00007981 size--;
7982 }
7983 }
7984#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00007985 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00007986 (categories Z* and C* except ASCII space)
7987 */
7988 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
7989 /* Map 8-bit characters to '\xhh' */
7990 if (ucs <= 0xff) {
7991 *p++ = '\\';
7992 *p++ = 'x';
7993 *p++ = hexdigits[(ch >> 4) & 0x000F];
7994 *p++ = hexdigits[ch & 0x000F];
7995 }
7996 /* Map 21-bit characters to '\U00xxxxxx' */
7997 else if (ucs >= 0x10000) {
7998 *p++ = '\\';
7999 *p++ = 'U';
8000 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8001 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8002 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8003 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8004 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8005 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8006 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8007 *p++ = hexdigits[ucs & 0x0000000F];
8008 }
8009 /* Map 16-bit characters to '\uxxxx' */
8010 else {
8011 *p++ = '\\';
8012 *p++ = 'u';
8013 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8014 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8015 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8016 *p++ = hexdigits[ucs & 0x000F];
8017 }
8018 }
8019 /* Copy characters as-is */
8020 else {
8021 *p++ = ch;
8022#ifndef Py_UNICODE_WIDE
8023 if (ucs >= 0x10000)
8024 *p++ = ch2;
8025#endif
8026 }
8027 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008028 }
8029 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008030 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008031
8032 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008033 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008034 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008035}
8036
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008037PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008038 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008039\n\
8040Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00008041such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008042arguments start and end are interpreted as in slice notation.\n\
8043\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008044Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008045
8046static PyObject *
8047unicode_rfind(PyUnicodeObject *self, PyObject *args)
8048{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008049 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008050 Py_ssize_t start;
8051 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008052 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008053
Christian Heimes9cd17752007-11-18 19:35:23 +00008054 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008055 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008056
Thomas Wouters477c8d52006-05-27 19:21:47 +00008057 result = stringlib_rfind_slice(
8058 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8059 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8060 start, end
8061 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008062
8063 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008064
Christian Heimes217cfd12007-12-02 14:31:20 +00008065 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008066}
8067
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008068PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008069 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008070\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008071Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008072
8073static PyObject *
8074unicode_rindex(PyUnicodeObject *self, PyObject *args)
8075{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008076 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008077 Py_ssize_t start;
8078 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008079 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008080
Christian Heimes9cd17752007-11-18 19:35:23 +00008081 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008082 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008083
Thomas Wouters477c8d52006-05-27 19:21:47 +00008084 result = stringlib_rfind_slice(
8085 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8086 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8087 start, end
8088 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008089
8090 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008091
Guido van Rossumd57fd912000-03-10 22:53:23 +00008092 if (result < 0) {
8093 PyErr_SetString(PyExc_ValueError, "substring not found");
8094 return NULL;
8095 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008096 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008097}
8098
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008099PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008100 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008101\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008102Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008103done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008104
8105static PyObject *
8106unicode_rjust(PyUnicodeObject *self, PyObject *args)
8107{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008108 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008109 Py_UNICODE fillchar = ' ';
8110
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008111 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008112 return NULL;
8113
Tim Peters7a29bd52001-09-12 03:03:31 +00008114 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008115 Py_INCREF(self);
8116 return (PyObject*) self;
8117 }
8118
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008119 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008120}
8121
Guido van Rossumd57fd912000-03-10 22:53:23 +00008122PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008123 PyObject *sep,
8124 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008125{
8126 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008127
Guido van Rossumd57fd912000-03-10 22:53:23 +00008128 s = PyUnicode_FromObject(s);
8129 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008130 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008131 if (sep != NULL) {
8132 sep = PyUnicode_FromObject(sep);
8133 if (sep == NULL) {
8134 Py_DECREF(s);
8135 return NULL;
8136 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008137 }
8138
8139 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8140
8141 Py_DECREF(s);
8142 Py_XDECREF(sep);
8143 return result;
8144}
8145
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008146PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008147 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008148\n\
8149Return a list of the words in S, using sep as the\n\
8150delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008151splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008152whitespace string is a separator and empty strings are\n\
8153removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008154
8155static PyObject*
8156unicode_split(PyUnicodeObject *self, PyObject *args)
8157{
8158 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008159 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008160
Martin v. Löwis18e16552006-02-15 17:27:45 +00008161 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008162 return NULL;
8163
8164 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008165 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008166 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008167 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008168 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008169 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008170}
8171
Thomas Wouters477c8d52006-05-27 19:21:47 +00008172PyObject *
8173PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8174{
8175 PyObject* str_obj;
8176 PyObject* sep_obj;
8177 PyObject* out;
8178
8179 str_obj = PyUnicode_FromObject(str_in);
8180 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008181 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008182 sep_obj = PyUnicode_FromObject(sep_in);
8183 if (!sep_obj) {
8184 Py_DECREF(str_obj);
8185 return NULL;
8186 }
8187
8188 out = stringlib_partition(
8189 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8190 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8191 );
8192
8193 Py_DECREF(sep_obj);
8194 Py_DECREF(str_obj);
8195
8196 return out;
8197}
8198
8199
8200PyObject *
8201PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8202{
8203 PyObject* str_obj;
8204 PyObject* sep_obj;
8205 PyObject* out;
8206
8207 str_obj = PyUnicode_FromObject(str_in);
8208 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008209 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008210 sep_obj = PyUnicode_FromObject(sep_in);
8211 if (!sep_obj) {
8212 Py_DECREF(str_obj);
8213 return NULL;
8214 }
8215
8216 out = stringlib_rpartition(
8217 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8218 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8219 );
8220
8221 Py_DECREF(sep_obj);
8222 Py_DECREF(str_obj);
8223
8224 return out;
8225}
8226
8227PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008228 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008229\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008230Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008231the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008232found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008233
8234static PyObject*
8235unicode_partition(PyUnicodeObject *self, PyObject *separator)
8236{
8237 return PyUnicode_Partition((PyObject *)self, separator);
8238}
8239
8240PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +00008241 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008242\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008243Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008244the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008245separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008246
8247static PyObject*
8248unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8249{
8250 return PyUnicode_RPartition((PyObject *)self, separator);
8251}
8252
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008253PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008254 PyObject *sep,
8255 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008256{
8257 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008258
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008259 s = PyUnicode_FromObject(s);
8260 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008261 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008262 if (sep != NULL) {
8263 sep = PyUnicode_FromObject(sep);
8264 if (sep == NULL) {
8265 Py_DECREF(s);
8266 return NULL;
8267 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008268 }
8269
8270 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8271
8272 Py_DECREF(s);
8273 Py_XDECREF(sep);
8274 return result;
8275}
8276
8277PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008278 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008279\n\
8280Return a list of the words in S, using sep as the\n\
8281delimiter string, starting at the end of the string and\n\
8282working to the front. If maxsplit is given, at most maxsplit\n\
8283splits are done. If sep is not specified, any whitespace string\n\
8284is a separator.");
8285
8286static PyObject*
8287unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8288{
8289 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008290 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008291
Martin v. Löwis18e16552006-02-15 17:27:45 +00008292 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008293 return NULL;
8294
8295 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008296 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008297 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008298 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008299 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008300 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008301}
8302
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008303PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008304 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008305\n\
8306Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008307Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008308is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008309
8310static PyObject*
8311unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8312{
Guido van Rossum86662912000-04-11 15:38:46 +00008313 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008314
Guido van Rossum86662912000-04-11 15:38:46 +00008315 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008316 return NULL;
8317
Guido van Rossum86662912000-04-11 15:38:46 +00008318 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008319}
8320
8321static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008322PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008323{
Walter Dörwald346737f2007-05-31 10:44:43 +00008324 if (PyUnicode_CheckExact(self)) {
8325 Py_INCREF(self);
8326 return self;
8327 } else
8328 /* Subtype -- return genuine unicode string with the same value. */
8329 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8330 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008331}
8332
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008333PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008334 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008335\n\
8336Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008337and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008338
8339static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008340unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008341{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008342 return fixup(self, fixswapcase);
8343}
8344
Georg Brandlceee0772007-11-27 23:48:05 +00008345PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008346 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008347\n\
8348Return a translation table usable for str.translate().\n\
8349If there is only one argument, it must be a dictionary mapping Unicode\n\
8350ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008351Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008352If there are two arguments, they must be strings of equal length, and\n\
8353in the resulting dictionary, each character in x will be mapped to the\n\
8354character at the same position in y. If there is a third argument, it\n\
8355must be a string, whose characters will be mapped to None in the result.");
8356
8357static PyObject*
8358unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8359{
8360 PyObject *x, *y = NULL, *z = NULL;
8361 PyObject *new = NULL, *key, *value;
8362 Py_ssize_t i = 0;
8363 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008364
Georg Brandlceee0772007-11-27 23:48:05 +00008365 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8366 return NULL;
8367 new = PyDict_New();
8368 if (!new)
8369 return NULL;
8370 if (y != NULL) {
8371 /* x must be a string too, of equal length */
8372 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8373 if (!PyUnicode_Check(x)) {
8374 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8375 "be a string if there is a second argument");
8376 goto err;
8377 }
8378 if (PyUnicode_GET_SIZE(x) != ylen) {
8379 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8380 "arguments must have equal length");
8381 goto err;
8382 }
8383 /* create entries for translating chars in x to those in y */
8384 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008385 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8386 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008387 if (!key || !value)
8388 goto err;
8389 res = PyDict_SetItem(new, key, value);
8390 Py_DECREF(key);
8391 Py_DECREF(value);
8392 if (res < 0)
8393 goto err;
8394 }
8395 /* create entries for deleting chars in z */
8396 if (z != NULL) {
8397 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008398 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008399 if (!key)
8400 goto err;
8401 res = PyDict_SetItem(new, key, Py_None);
8402 Py_DECREF(key);
8403 if (res < 0)
8404 goto err;
8405 }
8406 }
8407 } else {
8408 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008409 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008410 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8411 "to maketrans it must be a dict");
8412 goto err;
8413 }
8414 /* copy entries into the new dict, converting string keys to int keys */
8415 while (PyDict_Next(x, &i, &key, &value)) {
8416 if (PyUnicode_Check(key)) {
8417 /* convert string keys to integer keys */
8418 PyObject *newkey;
8419 if (PyUnicode_GET_SIZE(key) != 1) {
8420 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8421 "table must be of length 1");
8422 goto err;
8423 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008424 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008425 if (!newkey)
8426 goto err;
8427 res = PyDict_SetItem(new, newkey, value);
8428 Py_DECREF(newkey);
8429 if (res < 0)
8430 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008431 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008432 /* just keep integer keys */
8433 if (PyDict_SetItem(new, key, value) < 0)
8434 goto err;
8435 } else {
8436 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8437 "be strings or integers");
8438 goto err;
8439 }
8440 }
8441 }
8442 return new;
8443 err:
8444 Py_DECREF(new);
8445 return NULL;
8446}
8447
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008448PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008449 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008450\n\
8451Return a copy of the string S, where all characters have been mapped\n\
8452through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008453Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008454Unmapped characters are left untouched. Characters mapped to None\n\
8455are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008456
8457static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008458unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008459{
Georg Brandlceee0772007-11-27 23:48:05 +00008460 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008461}
8462
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008463PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008464 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008465\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008466Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008467
8468static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008469unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008470{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008471 return fixup(self, fixupper);
8472}
8473
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008474PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008475 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008476\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008477Pad a numeric string S with zeros on the left, to fill a field\n\
8478of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008479
8480static PyObject *
8481unicode_zfill(PyUnicodeObject *self, PyObject *args)
8482{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008483 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008484 PyUnicodeObject *u;
8485
Martin v. Löwis18e16552006-02-15 17:27:45 +00008486 Py_ssize_t width;
8487 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008488 return NULL;
8489
8490 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008491 if (PyUnicode_CheckExact(self)) {
8492 Py_INCREF(self);
8493 return (PyObject*) self;
8494 }
8495 else
8496 return PyUnicode_FromUnicode(
8497 PyUnicode_AS_UNICODE(self),
8498 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00008499 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008500 }
8501
8502 fill = width - self->length;
8503
8504 u = pad(self, fill, 0, '0');
8505
Walter Dörwald068325e2002-04-15 13:36:47 +00008506 if (u == NULL)
8507 return NULL;
8508
Guido van Rossumd57fd912000-03-10 22:53:23 +00008509 if (u->str[fill] == '+' || u->str[fill] == '-') {
8510 /* move sign to beginning of string */
8511 u->str[0] = u->str[fill];
8512 u->str[fill] = '0';
8513 }
8514
8515 return (PyObject*) u;
8516}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008517
#if 0
/* Compiled out: returns the file-level numfree counter as an int object. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyLong_FromLong(count);
}
#endif
8525
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008526PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008527 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008528\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008529Return True if S starts with the specified prefix, False otherwise.\n\
8530With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008531With optional end, stop comparing S at that position.\n\
8532prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008533
8534static PyObject *
8535unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008536 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008537{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008538 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008539 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008540 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008541 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008542 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008543
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008544 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008545 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8546 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008547 if (PyTuple_Check(subobj)) {
8548 Py_ssize_t i;
8549 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8550 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008551 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008552 if (substring == NULL)
8553 return NULL;
8554 result = tailmatch(self, substring, start, end, -1);
8555 Py_DECREF(substring);
8556 if (result) {
8557 Py_RETURN_TRUE;
8558 }
8559 }
8560 /* nothing matched */
8561 Py_RETURN_FALSE;
8562 }
8563 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008564 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008565 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008566 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008567 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008568 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008569}
8570
8571
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008572PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008573 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008574\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008575Return True if S ends with the specified suffix, False otherwise.\n\
8576With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008577With optional end, stop comparing S at that position.\n\
8578suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008579
8580static PyObject *
8581unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008582 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008583{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008584 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008585 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008586 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008587 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008588 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008589
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008590 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008591 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8592 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008593 if (PyTuple_Check(subobj)) {
8594 Py_ssize_t i;
8595 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8596 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008597 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008598 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008599 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008600 result = tailmatch(self, substring, start, end, +1);
8601 Py_DECREF(substring);
8602 if (result) {
8603 Py_RETURN_TRUE;
8604 }
8605 }
8606 Py_RETURN_FALSE;
8607 }
8608 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008609 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008610 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008611
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008612 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008613 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008614 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008615}
8616
Eric Smith8c663262007-08-25 02:26:07 +00008617#include "stringlib/string_format.h"
8618
8619PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008620 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008621\n\
8622");
8623
Eric Smith4a7d76d2008-05-30 18:10:19 +00008624static PyObject *
8625unicode__format__(PyObject* self, PyObject* args)
8626{
8627 PyObject *format_spec;
8628
8629 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8630 return NULL;
8631
8632 return _PyUnicode_FormatAdvanced(self,
8633 PyUnicode_AS_UNICODE(format_spec),
8634 PyUnicode_GET_SIZE(format_spec));
8635}
8636
Eric Smith8c663262007-08-25 02:26:07 +00008637PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008638 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008639\n\
8640");
8641
8642static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008643unicode__sizeof__(PyUnicodeObject *v)
8644{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00008645 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8646 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008647}
8648
8649PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008650 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008651
8652static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008653unicode_getnewargs(PyUnicodeObject *v)
8654{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008655 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008656}
8657
8658
Guido van Rossumd57fd912000-03-10 22:53:23 +00008659static PyMethodDef unicode_methods[] = {
8660
8661 /* Order is according to common usage: often used methods should
8662 appear first, since lookup is done sequentially. */
8663
Benjamin Peterson308d6372009-09-18 21:42:35 +00008664 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008665 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8666 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008667 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008668 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8669 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8670 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8671 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8672 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8673 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8674 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008675 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008676 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8677 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8678 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008679 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008680 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8681 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8682 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008683 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008684 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008685 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008686 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008687 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8688 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8689 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8690 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8691 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8692 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8693 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8694 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8695 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8696 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8697 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8698 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8699 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8700 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008701 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00008702 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008703 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008704 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00008705 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008706 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8707 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Georg Brandlceee0772007-11-27 23:48:05 +00008708 {"maketrans", (PyCFunction) unicode_maketrans,
8709 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008710 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008711#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008712 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008713#endif
8714
8715#if 0
8716 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008717 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008718#endif
8719
Benjamin Peterson14339b62009-01-31 16:36:08 +00008720 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008721 {NULL, NULL}
8722};
8723
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008724static PyObject *
8725unicode_mod(PyObject *v, PyObject *w)
8726{
Benjamin Peterson29060642009-01-31 22:14:21 +00008727 if (!PyUnicode_Check(v)) {
8728 Py_INCREF(Py_NotImplemented);
8729 return Py_NotImplemented;
8730 }
8731 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008732}
8733
8734static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008735 0, /*nb_add*/
8736 0, /*nb_subtract*/
8737 0, /*nb_multiply*/
8738 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008739};
8740
Guido van Rossumd57fd912000-03-10 22:53:23 +00008741static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008742 (lenfunc) unicode_length, /* sq_length */
8743 PyUnicode_Concat, /* sq_concat */
8744 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8745 (ssizeargfunc) unicode_getitem, /* sq_item */
8746 0, /* sq_slice */
8747 0, /* sq_ass_item */
8748 0, /* sq_ass_slice */
8749 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008750};
8751
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008752static PyObject*
8753unicode_subscript(PyUnicodeObject* self, PyObject* item)
8754{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008755 if (PyIndex_Check(item)) {
8756 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008757 if (i == -1 && PyErr_Occurred())
8758 return NULL;
8759 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008760 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008761 return unicode_getitem(self, i);
8762 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008763 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008764 Py_UNICODE* source_buf;
8765 Py_UNICODE* result_buf;
8766 PyObject* result;
8767
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008768 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00008769 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008770 return NULL;
8771 }
8772
8773 if (slicelength <= 0) {
8774 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008775 } else if (start == 0 && step == 1 && slicelength == self->length &&
8776 PyUnicode_CheckExact(self)) {
8777 Py_INCREF(self);
8778 return (PyObject *)self;
8779 } else if (step == 1) {
8780 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008781 } else {
8782 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00008783 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8784 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008785
Benjamin Peterson29060642009-01-31 22:14:21 +00008786 if (result_buf == NULL)
8787 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008788
8789 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8790 result_buf[i] = source_buf[cur];
8791 }
Tim Petersced69f82003-09-16 20:30:58 +00008792
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008793 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00008794 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008795 return result;
8796 }
8797 } else {
8798 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8799 return NULL;
8800 }
8801}
8802
8803static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008804 (lenfunc)unicode_length, /* mp_length */
8805 (binaryfunc)unicode_subscript, /* mp_subscript */
8806 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008807};
8808
Guido van Rossumd57fd912000-03-10 22:53:23 +00008809
Guido van Rossumd57fd912000-03-10 22:53:23 +00008810/* Helpers for PyUnicode_Format() */
8811
8812static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008813getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008814{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008815 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008816 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008817 (*p_argidx)++;
8818 if (arglen < 0)
8819 return args;
8820 else
8821 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008822 }
8823 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008824 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008825 return NULL;
8826}
8827
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008828/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008829
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008830static PyObject *
8831formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008832{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008833 char *p;
8834 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008835 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008836
Guido van Rossumd57fd912000-03-10 22:53:23 +00008837 x = PyFloat_AsDouble(v);
8838 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008839 return NULL;
8840
Guido van Rossumd57fd912000-03-10 22:53:23 +00008841 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008842 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00008843
Eric Smith0923d1d2009-04-16 20:16:10 +00008844 p = PyOS_double_to_string(x, type, prec,
8845 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008846 if (p == NULL)
8847 return NULL;
8848 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00008849 PyMem_Free(p);
8850 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008851}
8852
Tim Peters38fd5b62000-09-21 05:43:11 +00008853static PyObject*
8854formatlong(PyObject *val, int flags, int prec, int type)
8855{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008856 char *buf;
8857 int len;
8858 PyObject *str; /* temporary string object. */
8859 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008860
Benjamin Peterson14339b62009-01-31 16:36:08 +00008861 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
8862 if (!str)
8863 return NULL;
8864 result = PyUnicode_FromStringAndSize(buf, len);
8865 Py_DECREF(str);
8866 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008867}
8868
Guido van Rossumd57fd912000-03-10 22:53:23 +00008869static int
8870formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008871 size_t buflen,
8872 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008873{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008874 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008875 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008876 if (PyUnicode_GET_SIZE(v) == 1) {
8877 buf[0] = PyUnicode_AS_UNICODE(v)[0];
8878 buf[1] = '\0';
8879 return 1;
8880 }
8881#ifndef Py_UNICODE_WIDE
8882 if (PyUnicode_GET_SIZE(v) == 2) {
8883 /* Decode a valid surrogate pair */
8884 int c0 = PyUnicode_AS_UNICODE(v)[0];
8885 int c1 = PyUnicode_AS_UNICODE(v)[1];
8886 if (0xD800 <= c0 && c0 <= 0xDBFF &&
8887 0xDC00 <= c1 && c1 <= 0xDFFF) {
8888 buf[0] = c0;
8889 buf[1] = c1;
8890 buf[2] = '\0';
8891 return 2;
8892 }
8893 }
8894#endif
8895 goto onError;
8896 }
8897 else {
8898 /* Integer input truncated to a character */
8899 long x;
8900 x = PyLong_AsLong(v);
8901 if (x == -1 && PyErr_Occurred())
8902 goto onError;
8903
8904 if (x < 0 || x > 0x10ffff) {
8905 PyErr_SetString(PyExc_OverflowError,
8906 "%c arg not in range(0x110000)");
8907 return -1;
8908 }
8909
8910#ifndef Py_UNICODE_WIDE
8911 if (x > 0xffff) {
8912 x -= 0x10000;
8913 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
8914 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
8915 return 2;
8916 }
8917#endif
8918 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008919 buf[1] = '\0';
8920 return 1;
8921 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008922
Benjamin Peterson29060642009-01-31 22:14:21 +00008923 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008924 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008925 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008926 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008927}
8928
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008929/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008930 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008931*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008932#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008933
Guido van Rossumd57fd912000-03-10 22:53:23 +00008934PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson29060642009-01-31 22:14:21 +00008935 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008936{
8937 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008938 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008939 int args_owned = 0;
8940 PyUnicodeObject *result = NULL;
8941 PyObject *dict = NULL;
8942 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008943
Guido van Rossumd57fd912000-03-10 22:53:23 +00008944 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008945 PyErr_BadInternalCall();
8946 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008947 }
8948 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008949 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008950 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008951 fmt = PyUnicode_AS_UNICODE(uformat);
8952 fmtcnt = PyUnicode_GET_SIZE(uformat);
8953
8954 reslen = rescnt = fmtcnt + 100;
8955 result = _PyUnicode_New(reslen);
8956 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008957 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008958 res = PyUnicode_AS_UNICODE(result);
8959
8960 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008961 arglen = PyTuple_Size(args);
8962 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008963 }
8964 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008965 arglen = -1;
8966 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008967 }
Christian Heimes90aa7642007-12-19 02:45:37 +00008968 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00008969 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00008970 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008971
8972 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008973 if (*fmt != '%') {
8974 if (--rescnt < 0) {
8975 rescnt = fmtcnt + 100;
8976 reslen += rescnt;
8977 if (_PyUnicode_Resize(&result, reslen) < 0)
8978 goto onError;
8979 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8980 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008981 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008982 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008983 }
8984 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008985 /* Got a format specifier */
8986 int flags = 0;
8987 Py_ssize_t width = -1;
8988 int prec = -1;
8989 Py_UNICODE c = '\0';
8990 Py_UNICODE fill;
8991 int isnumok;
8992 PyObject *v = NULL;
8993 PyObject *temp = NULL;
8994 Py_UNICODE *pbuf;
8995 Py_UNICODE sign;
8996 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008997 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008998
Benjamin Peterson29060642009-01-31 22:14:21 +00008999 fmt++;
9000 if (*fmt == '(') {
9001 Py_UNICODE *keystart;
9002 Py_ssize_t keylen;
9003 PyObject *key;
9004 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009005
Benjamin Peterson29060642009-01-31 22:14:21 +00009006 if (dict == NULL) {
9007 PyErr_SetString(PyExc_TypeError,
9008 "format requires a mapping");
9009 goto onError;
9010 }
9011 ++fmt;
9012 --fmtcnt;
9013 keystart = fmt;
9014 /* Skip over balanced parentheses */
9015 while (pcount > 0 && --fmtcnt >= 0) {
9016 if (*fmt == ')')
9017 --pcount;
9018 else if (*fmt == '(')
9019 ++pcount;
9020 fmt++;
9021 }
9022 keylen = fmt - keystart - 1;
9023 if (fmtcnt < 0 || pcount > 0) {
9024 PyErr_SetString(PyExc_ValueError,
9025 "incomplete format key");
9026 goto onError;
9027 }
9028#if 0
9029 /* keys are converted to strings using UTF-8 and
9030 then looked up since Python uses strings to hold
9031 variables names etc. in its namespaces and we
9032 wouldn't want to break common idioms. */
9033 key = PyUnicode_EncodeUTF8(keystart,
9034 keylen,
9035 NULL);
9036#else
9037 key = PyUnicode_FromUnicode(keystart, keylen);
9038#endif
9039 if (key == NULL)
9040 goto onError;
9041 if (args_owned) {
9042 Py_DECREF(args);
9043 args_owned = 0;
9044 }
9045 args = PyObject_GetItem(dict, key);
9046 Py_DECREF(key);
9047 if (args == NULL) {
9048 goto onError;
9049 }
9050 args_owned = 1;
9051 arglen = -1;
9052 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009053 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009054 while (--fmtcnt >= 0) {
9055 switch (c = *fmt++) {
9056 case '-': flags |= F_LJUST; continue;
9057 case '+': flags |= F_SIGN; continue;
9058 case ' ': flags |= F_BLANK; continue;
9059 case '#': flags |= F_ALT; continue;
9060 case '0': flags |= F_ZERO; continue;
9061 }
9062 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009063 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009064 if (c == '*') {
9065 v = getnextarg(args, arglen, &argidx);
9066 if (v == NULL)
9067 goto onError;
9068 if (!PyLong_Check(v)) {
9069 PyErr_SetString(PyExc_TypeError,
9070 "* wants int");
9071 goto onError;
9072 }
9073 width = PyLong_AsLong(v);
9074 if (width == -1 && PyErr_Occurred())
9075 goto onError;
9076 if (width < 0) {
9077 flags |= F_LJUST;
9078 width = -width;
9079 }
9080 if (--fmtcnt >= 0)
9081 c = *fmt++;
9082 }
9083 else if (c >= '0' && c <= '9') {
9084 width = c - '0';
9085 while (--fmtcnt >= 0) {
9086 c = *fmt++;
9087 if (c < '0' || c > '9')
9088 break;
9089 if ((width*10) / 10 != width) {
9090 PyErr_SetString(PyExc_ValueError,
9091 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009092 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009093 }
9094 width = width*10 + (c - '0');
9095 }
9096 }
9097 if (c == '.') {
9098 prec = 0;
9099 if (--fmtcnt >= 0)
9100 c = *fmt++;
9101 if (c == '*') {
9102 v = getnextarg(args, arglen, &argidx);
9103 if (v == NULL)
9104 goto onError;
9105 if (!PyLong_Check(v)) {
9106 PyErr_SetString(PyExc_TypeError,
9107 "* wants int");
9108 goto onError;
9109 }
9110 prec = PyLong_AsLong(v);
9111 if (prec == -1 && PyErr_Occurred())
9112 goto onError;
9113 if (prec < 0)
9114 prec = 0;
9115 if (--fmtcnt >= 0)
9116 c = *fmt++;
9117 }
9118 else if (c >= '0' && c <= '9') {
9119 prec = c - '0';
9120 while (--fmtcnt >= 0) {
9121 c = Py_CHARMASK(*fmt++);
9122 if (c < '0' || c > '9')
9123 break;
9124 if ((prec*10) / 10 != prec) {
9125 PyErr_SetString(PyExc_ValueError,
9126 "prec too big");
9127 goto onError;
9128 }
9129 prec = prec*10 + (c - '0');
9130 }
9131 }
9132 } /* prec */
9133 if (fmtcnt >= 0) {
9134 if (c == 'h' || c == 'l' || c == 'L') {
9135 if (--fmtcnt >= 0)
9136 c = *fmt++;
9137 }
9138 }
9139 if (fmtcnt < 0) {
9140 PyErr_SetString(PyExc_ValueError,
9141 "incomplete format");
9142 goto onError;
9143 }
9144 if (c != '%') {
9145 v = getnextarg(args, arglen, &argidx);
9146 if (v == NULL)
9147 goto onError;
9148 }
9149 sign = 0;
9150 fill = ' ';
9151 switch (c) {
9152
9153 case '%':
9154 pbuf = formatbuf;
9155 /* presume that buffer length is at least 1 */
9156 pbuf[0] = '%';
9157 len = 1;
9158 break;
9159
9160 case 's':
9161 case 'r':
9162 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +00009163 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009164 temp = v;
9165 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009166 }
9167 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009168 if (c == 's')
9169 temp = PyObject_Str(v);
9170 else if (c == 'r')
9171 temp = PyObject_Repr(v);
9172 else
9173 temp = PyObject_ASCII(v);
9174 if (temp == NULL)
9175 goto onError;
9176 if (PyUnicode_Check(temp))
9177 /* nothing to do */;
9178 else {
9179 Py_DECREF(temp);
9180 PyErr_SetString(PyExc_TypeError,
9181 "%s argument has non-string str()");
9182 goto onError;
9183 }
9184 }
9185 pbuf = PyUnicode_AS_UNICODE(temp);
9186 len = PyUnicode_GET_SIZE(temp);
9187 if (prec >= 0 && len > prec)
9188 len = prec;
9189 break;
9190
9191 case 'i':
9192 case 'd':
9193 case 'u':
9194 case 'o':
9195 case 'x':
9196 case 'X':
9197 if (c == 'i')
9198 c = 'd';
9199 isnumok = 0;
9200 if (PyNumber_Check(v)) {
9201 PyObject *iobj=NULL;
9202
9203 if (PyLong_Check(v)) {
9204 iobj = v;
9205 Py_INCREF(iobj);
9206 }
9207 else {
9208 iobj = PyNumber_Long(v);
9209 }
9210 if (iobj!=NULL) {
9211 if (PyLong_Check(iobj)) {
9212 isnumok = 1;
9213 temp = formatlong(iobj, flags, prec, c);
9214 Py_DECREF(iobj);
9215 if (!temp)
9216 goto onError;
9217 pbuf = PyUnicode_AS_UNICODE(temp);
9218 len = PyUnicode_GET_SIZE(temp);
9219 sign = 1;
9220 }
9221 else {
9222 Py_DECREF(iobj);
9223 }
9224 }
9225 }
9226 if (!isnumok) {
9227 PyErr_Format(PyExc_TypeError,
9228 "%%%c format: a number is required, "
9229 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9230 goto onError;
9231 }
9232 if (flags & F_ZERO)
9233 fill = '0';
9234 break;
9235
9236 case 'e':
9237 case 'E':
9238 case 'f':
9239 case 'F':
9240 case 'g':
9241 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009242 temp = formatfloat(v, flags, prec, c);
9243 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009244 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009245 pbuf = PyUnicode_AS_UNICODE(temp);
9246 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009247 sign = 1;
9248 if (flags & F_ZERO)
9249 fill = '0';
9250 break;
9251
9252 case 'c':
9253 pbuf = formatbuf;
9254 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9255 if (len < 0)
9256 goto onError;
9257 break;
9258
9259 default:
9260 PyErr_Format(PyExc_ValueError,
9261 "unsupported format character '%c' (0x%x) "
9262 "at index %zd",
9263 (31<=c && c<=126) ? (char)c : '?',
9264 (int)c,
9265 (Py_ssize_t)(fmt - 1 -
9266 PyUnicode_AS_UNICODE(uformat)));
9267 goto onError;
9268 }
9269 if (sign) {
9270 if (*pbuf == '-' || *pbuf == '+') {
9271 sign = *pbuf++;
9272 len--;
9273 }
9274 else if (flags & F_SIGN)
9275 sign = '+';
9276 else if (flags & F_BLANK)
9277 sign = ' ';
9278 else
9279 sign = 0;
9280 }
9281 if (width < len)
9282 width = len;
9283 if (rescnt - (sign != 0) < width) {
9284 reslen -= rescnt;
9285 rescnt = width + fmtcnt + 100;
9286 reslen += rescnt;
9287 if (reslen < 0) {
9288 Py_XDECREF(temp);
9289 PyErr_NoMemory();
9290 goto onError;
9291 }
9292 if (_PyUnicode_Resize(&result, reslen) < 0) {
9293 Py_XDECREF(temp);
9294 goto onError;
9295 }
9296 res = PyUnicode_AS_UNICODE(result)
9297 + reslen - rescnt;
9298 }
9299 if (sign) {
9300 if (fill != ' ')
9301 *res++ = sign;
9302 rescnt--;
9303 if (width > len)
9304 width--;
9305 }
9306 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9307 assert(pbuf[0] == '0');
9308 assert(pbuf[1] == c);
9309 if (fill != ' ') {
9310 *res++ = *pbuf++;
9311 *res++ = *pbuf++;
9312 }
9313 rescnt -= 2;
9314 width -= 2;
9315 if (width < 0)
9316 width = 0;
9317 len -= 2;
9318 }
9319 if (width > len && !(flags & F_LJUST)) {
9320 do {
9321 --rescnt;
9322 *res++ = fill;
9323 } while (--width > len);
9324 }
9325 if (fill == ' ') {
9326 if (sign)
9327 *res++ = sign;
9328 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9329 assert(pbuf[0] == '0');
9330 assert(pbuf[1] == c);
9331 *res++ = *pbuf++;
9332 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009333 }
9334 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009335 Py_UNICODE_COPY(res, pbuf, len);
9336 res += len;
9337 rescnt -= len;
9338 while (--width >= len) {
9339 --rescnt;
9340 *res++ = ' ';
9341 }
9342 if (dict && (argidx < arglen) && c != '%') {
9343 PyErr_SetString(PyExc_TypeError,
9344 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009345 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009346 goto onError;
9347 }
9348 Py_XDECREF(temp);
9349 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009350 } /* until end */
9351 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009352 PyErr_SetString(PyExc_TypeError,
9353 "not all arguments converted during string formatting");
9354 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009355 }
9356
Thomas Woutersa96affe2006-03-12 00:29:36 +00009357 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009358 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009359 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009360 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009361 }
9362 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009363 return (PyObject *)result;
9364
Benjamin Peterson29060642009-01-31 22:14:21 +00009365 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009366 Py_XDECREF(result);
9367 Py_DECREF(uformat);
9368 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009369 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009370 }
9371 return NULL;
9372}
9373
Jeremy Hylton938ace62002-07-17 16:30:39 +00009374static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009375unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9376
Tim Peters6d6c1a32001-08-02 04:15:00 +00009377static PyObject *
9378unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9379{
Benjamin Peterson29060642009-01-31 22:14:21 +00009380 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009381 static char *kwlist[] = {"object", "encoding", "errors", 0};
9382 char *encoding = NULL;
9383 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009384
Benjamin Peterson14339b62009-01-31 16:36:08 +00009385 if (type != &PyUnicode_Type)
9386 return unicode_subtype_new(type, args, kwds);
9387 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009388 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009389 return NULL;
9390 if (x == NULL)
9391 return (PyObject *)_PyUnicode_New(0);
9392 if (encoding == NULL && errors == NULL)
9393 return PyObject_Str(x);
9394 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009395 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009396}
9397
Guido van Rossume023fe02001-08-30 03:12:59 +00009398static PyObject *
9399unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9400{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009401 PyUnicodeObject *tmp, *pnew;
9402 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009403
Benjamin Peterson14339b62009-01-31 16:36:08 +00009404 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9405 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9406 if (tmp == NULL)
9407 return NULL;
9408 assert(PyUnicode_Check(tmp));
9409 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9410 if (pnew == NULL) {
9411 Py_DECREF(tmp);
9412 return NULL;
9413 }
9414 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9415 if (pnew->str == NULL) {
9416 _Py_ForgetReference((PyObject *)pnew);
9417 PyObject_Del(pnew);
9418 Py_DECREF(tmp);
9419 return PyErr_NoMemory();
9420 }
9421 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9422 pnew->length = n;
9423 pnew->hash = tmp->hash;
9424 Py_DECREF(tmp);
9425 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009426}
9427
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009428PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +00009429 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009430\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009431Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009432encoding defaults to the current default string encoding.\n\
9433errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009434
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009435static PyObject *unicode_iter(PyObject *seq);
9436
Guido van Rossumd57fd912000-03-10 22:53:23 +00009437PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009438 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +00009439 "str", /* tp_name */
9440 sizeof(PyUnicodeObject), /* tp_size */
9441 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009442 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009443 (destructor)unicode_dealloc, /* tp_dealloc */
9444 0, /* tp_print */
9445 0, /* tp_getattr */
9446 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009447 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009448 unicode_repr, /* tp_repr */
9449 &unicode_as_number, /* tp_as_number */
9450 &unicode_as_sequence, /* tp_as_sequence */
9451 &unicode_as_mapping, /* tp_as_mapping */
9452 (hashfunc) unicode_hash, /* tp_hash*/
9453 0, /* tp_call*/
9454 (reprfunc) unicode_str, /* tp_str */
9455 PyObject_GenericGetAttr, /* tp_getattro */
9456 0, /* tp_setattro */
9457 0, /* tp_as_buffer */
9458 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +00009459 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009460 unicode_doc, /* tp_doc */
9461 0, /* tp_traverse */
9462 0, /* tp_clear */
9463 PyUnicode_RichCompare, /* tp_richcompare */
9464 0, /* tp_weaklistoffset */
9465 unicode_iter, /* tp_iter */
9466 0, /* tp_iternext */
9467 unicode_methods, /* tp_methods */
9468 0, /* tp_members */
9469 0, /* tp_getset */
9470 &PyBaseObject_Type, /* tp_base */
9471 0, /* tp_dict */
9472 0, /* tp_descr_get */
9473 0, /* tp_descr_set */
9474 0, /* tp_dictoffset */
9475 0, /* tp_init */
9476 0, /* tp_alloc */
9477 unicode_new, /* tp_new */
9478 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009479};
9480
9481/* Initialize the Unicode implementation */
9482
Thomas Wouters78890102000-07-22 19:25:51 +00009483void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009484{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009485 int i;
9486
Thomas Wouters477c8d52006-05-27 19:21:47 +00009487 /* XXX - move this array to unicodectype.c ? */
9488 Py_UNICODE linebreak[] = {
9489 0x000A, /* LINE FEED */
9490 0x000D, /* CARRIAGE RETURN */
9491 0x001C, /* FILE SEPARATOR */
9492 0x001D, /* GROUP SEPARATOR */
9493 0x001E, /* RECORD SEPARATOR */
9494 0x0085, /* NEXT LINE */
9495 0x2028, /* LINE SEPARATOR */
9496 0x2029, /* PARAGRAPH SEPARATOR */
9497 };
9498
Fred Drakee4315f52000-05-09 19:53:39 +00009499 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009500 free_list = NULL;
9501 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009502 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009503 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00009504 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009505
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009506 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00009507 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009508 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009509 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009510
9511 /* initialize the linebreak bloom filter */
9512 bloom_linebreak = make_bloom_mask(
9513 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9514 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009515
9516 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009517}
9518
9519/* Finalize the Unicode implementation */
9520
Christian Heimesa156e092008-02-16 07:38:31 +00009521int
9522PyUnicode_ClearFreeList(void)
9523{
9524 int freelist_size = numfree;
9525 PyUnicodeObject *u;
9526
9527 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009528 PyUnicodeObject *v = u;
9529 u = *(PyUnicodeObject **)u;
9530 if (v->str)
9531 PyObject_DEL(v->str);
9532 Py_XDECREF(v->defenc);
9533 PyObject_Del(v);
9534 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +00009535 }
9536 free_list = NULL;
9537 assert(numfree == 0);
9538 return freelist_size;
9539}
9540
Guido van Rossumd57fd912000-03-10 22:53:23 +00009541void
Thomas Wouters78890102000-07-22 19:25:51 +00009542_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009543{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009544 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009545
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009546 Py_XDECREF(unicode_empty);
9547 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009548
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009549 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009550 if (unicode_latin1[i]) {
9551 Py_DECREF(unicode_latin1[i]);
9552 unicode_latin1[i] = NULL;
9553 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009554 }
Christian Heimesa156e092008-02-16 07:38:31 +00009555 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009556}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009557
Walter Dörwald16807132007-05-25 13:52:07 +00009558void
9559PyUnicode_InternInPlace(PyObject **p)
9560{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009561 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9562 PyObject *t;
9563 if (s == NULL || !PyUnicode_Check(s))
9564 Py_FatalError(
9565 "PyUnicode_InternInPlace: unicode strings only please!");
9566 /* If it's a subclass, we don't really know what putting
9567 it in the interned dict might do. */
9568 if (!PyUnicode_CheckExact(s))
9569 return;
9570 if (PyUnicode_CHECK_INTERNED(s))
9571 return;
9572 if (interned == NULL) {
9573 interned = PyDict_New();
9574 if (interned == NULL) {
9575 PyErr_Clear(); /* Don't leave an exception */
9576 return;
9577 }
9578 }
9579 /* It might be that the GetItem call fails even
9580 though the key is present in the dictionary,
9581 namely when this happens during a stack overflow. */
9582 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +00009583 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009584 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +00009585
Benjamin Peterson29060642009-01-31 22:14:21 +00009586 if (t) {
9587 Py_INCREF(t);
9588 Py_DECREF(*p);
9589 *p = t;
9590 return;
9591 }
Walter Dörwald16807132007-05-25 13:52:07 +00009592
Benjamin Peterson14339b62009-01-31 16:36:08 +00009593 PyThreadState_GET()->recursion_critical = 1;
9594 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9595 PyErr_Clear();
9596 PyThreadState_GET()->recursion_critical = 0;
9597 return;
9598 }
9599 PyThreadState_GET()->recursion_critical = 0;
9600 /* The two references in interned are not counted by refcnt.
9601 The deallocator will take care of this */
9602 Py_REFCNT(s) -= 2;
9603 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +00009604}
9605
9606void
9607PyUnicode_InternImmortal(PyObject **p)
9608{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009609 PyUnicode_InternInPlace(p);
9610 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9611 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9612 Py_INCREF(*p);
9613 }
Walter Dörwald16807132007-05-25 13:52:07 +00009614}
9615
9616PyObject *
9617PyUnicode_InternFromString(const char *cp)
9618{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009619 PyObject *s = PyUnicode_FromString(cp);
9620 if (s == NULL)
9621 return NULL;
9622 PyUnicode_InternInPlace(&s);
9623 return s;
Walter Dörwald16807132007-05-25 13:52:07 +00009624}
9625
9626void _Py_ReleaseInternedUnicodeStrings(void)
9627{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009628 PyObject *keys;
9629 PyUnicodeObject *s;
9630 Py_ssize_t i, n;
9631 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009632
Benjamin Peterson14339b62009-01-31 16:36:08 +00009633 if (interned == NULL || !PyDict_Check(interned))
9634 return;
9635 keys = PyDict_Keys(interned);
9636 if (keys == NULL || !PyList_Check(keys)) {
9637 PyErr_Clear();
9638 return;
9639 }
Walter Dörwald16807132007-05-25 13:52:07 +00009640
Benjamin Peterson14339b62009-01-31 16:36:08 +00009641 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9642 detector, interned unicode strings are not forcibly deallocated;
9643 rather, we give them their stolen references back, and then clear
9644 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +00009645
Benjamin Peterson14339b62009-01-31 16:36:08 +00009646 n = PyList_GET_SIZE(keys);
9647 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +00009648 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009649 for (i = 0; i < n; i++) {
9650 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9651 switch (s->state) {
9652 case SSTATE_NOT_INTERNED:
9653 /* XXX Shouldn't happen */
9654 break;
9655 case SSTATE_INTERNED_IMMORTAL:
9656 Py_REFCNT(s) += 1;
9657 immortal_size += s->length;
9658 break;
9659 case SSTATE_INTERNED_MORTAL:
9660 Py_REFCNT(s) += 2;
9661 mortal_size += s->length;
9662 break;
9663 default:
9664 Py_FatalError("Inconsistent interned string state.");
9665 }
9666 s->state = SSTATE_NOT_INTERNED;
9667 }
9668 fprintf(stderr, "total size of all interned strings: "
9669 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9670 "mortal/immortal\n", mortal_size, immortal_size);
9671 Py_DECREF(keys);
9672 PyDict_Clear(interned);
9673 Py_DECREF(interned);
9674 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +00009675}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009676
9677
9678/********************* Unicode Iterator **************************/
9679
9680typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009681 PyObject_HEAD
9682 Py_ssize_t it_index;
9683 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009684} unicodeiterobject;
9685
9686static void
9687unicodeiter_dealloc(unicodeiterobject *it)
9688{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009689 _PyObject_GC_UNTRACK(it);
9690 Py_XDECREF(it->it_seq);
9691 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009692}
9693
9694static int
9695unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9696{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009697 Py_VISIT(it->it_seq);
9698 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009699}
9700
9701static PyObject *
9702unicodeiter_next(unicodeiterobject *it)
9703{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009704 PyUnicodeObject *seq;
9705 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009706
Benjamin Peterson14339b62009-01-31 16:36:08 +00009707 assert(it != NULL);
9708 seq = it->it_seq;
9709 if (seq == NULL)
9710 return NULL;
9711 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009712
Benjamin Peterson14339b62009-01-31 16:36:08 +00009713 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
9714 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +00009715 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009716 if (item != NULL)
9717 ++it->it_index;
9718 return item;
9719 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009720
Benjamin Peterson14339b62009-01-31 16:36:08 +00009721 Py_DECREF(seq);
9722 it->it_seq = NULL;
9723 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009724}
9725
9726static PyObject *
9727unicodeiter_len(unicodeiterobject *it)
9728{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009729 Py_ssize_t len = 0;
9730 if (it->it_seq)
9731 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9732 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009733}
9734
9735PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9736
9737static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009738 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00009739 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +00009740 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009741};
9742
9743PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009744 PyVarObject_HEAD_INIT(&PyType_Type, 0)
9745 "str_iterator", /* tp_name */
9746 sizeof(unicodeiterobject), /* tp_basicsize */
9747 0, /* tp_itemsize */
9748 /* methods */
9749 (destructor)unicodeiter_dealloc, /* tp_dealloc */
9750 0, /* tp_print */
9751 0, /* tp_getattr */
9752 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009753 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009754 0, /* tp_repr */
9755 0, /* tp_as_number */
9756 0, /* tp_as_sequence */
9757 0, /* tp_as_mapping */
9758 0, /* tp_hash */
9759 0, /* tp_call */
9760 0, /* tp_str */
9761 PyObject_GenericGetAttr, /* tp_getattro */
9762 0, /* tp_setattro */
9763 0, /* tp_as_buffer */
9764 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9765 0, /* tp_doc */
9766 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9767 0, /* tp_clear */
9768 0, /* tp_richcompare */
9769 0, /* tp_weaklistoffset */
9770 PyObject_SelfIter, /* tp_iter */
9771 (iternextfunc)unicodeiter_next, /* tp_iternext */
9772 unicodeiter_methods, /* tp_methods */
9773 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009774};
9775
9776static PyObject *
9777unicode_iter(PyObject *seq)
9778{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009779 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009780
Benjamin Peterson14339b62009-01-31 16:36:08 +00009781 if (!PyUnicode_Check(seq)) {
9782 PyErr_BadInternalCall();
9783 return NULL;
9784 }
9785 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9786 if (it == NULL)
9787 return NULL;
9788 it->it_index = 0;
9789 Py_INCREF(seq);
9790 it->it_seq = (PyUnicodeObject *)seq;
9791 _PyObject_GC_TRACK(it);
9792 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009793}
9794
Martin v. Löwis5b222132007-06-10 09:51:05 +00009795size_t
9796Py_UNICODE_strlen(const Py_UNICODE *u)
9797{
9798 int res = 0;
9799 while(*u++)
9800 res++;
9801 return res;
9802}
9803
9804Py_UNICODE*
9805Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9806{
9807 Py_UNICODE *u = s1;
9808 while ((*u++ = *s2++));
9809 return s1;
9810}
9811
9812Py_UNICODE*
9813Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9814{
9815 Py_UNICODE *u = s1;
9816 while ((*u++ = *s2++))
9817 if (n-- == 0)
9818 break;
9819 return s1;
9820}
9821
9822int
9823Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9824{
9825 while (*s1 && *s2 && *s1 == *s2)
9826 s1++, s2++;
9827 if (*s1 && *s2)
9828 return (*s1 < *s2) ? -1 : +1;
9829 if (*s1)
9830 return 1;
9831 if (*s2)
9832 return -1;
9833 return 0;
9834}
9835
9836Py_UNICODE*
9837Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9838{
9839 const Py_UNICODE *p;
9840 for (p = s; *p; p++)
9841 if (*p == c)
9842 return (Py_UNICODE*)p;
9843 return NULL;
9844}
9845
9846
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009847#ifdef __cplusplus
9848}
9849#endif