blob: e5c0751f7b85f889a49f0af19b7142d00d7bc16a [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000049#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000050#include <windows.h>
51#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000052
/* Maximum number of Unicode objects kept on the free list. */

#define PyUnicode_MAXFREELIST 1024

/* Size limit for the free list "keep alive" optimization.

   Objects placed on the free list keep their character buffer allocated
   as long as their length is below this limit; longer buffers are released
   when the object is put on the list.  This reduces malloc() overhead for
   small Unicode objects.

   At worst this results in roughly PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT * sizeof(Py_UNICODE) +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature!  If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Endianness switches; little endian is assumed unless WORDS_BIGENDIAN
   is defined. */

#ifndef WORDS_BIGENDIAN
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Walter Dörwald16807132007-05-25 13:52:07 +000096/* This dictionary holds all interned unicode strings. Note that references
97 to strings in this dictionary are *not* counted in the string's ob_refcnt.
98 When the interned string reaches a refcnt of 0 the string deallocation
99 function will delete the reference from this dictionary.
100
101 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000102 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000103*/
104static PyObject *interned;
105
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000107static PyUnicodeObject *free_list;
108static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000109
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000110/* The empty Unicode object is shared to improve performance. */
111static PyUnicodeObject *unicode_empty;
112
113/* Single character Unicode strings in the Latin-1 range are being
114 shared as well. */
115static PyUnicodeObject *unicode_latin1[256];
116
Fred Drakee4315f52000-05-09 19:53:39 +0000117/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000118 parameter; it is fixed to "utf-8". Always use the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000119 PyUnicode_GetDefaultEncoding() API to access this global.
120
Alexandre Vassalotti3d2fd7f2007-10-16 00:26:33 +0000121 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000122 hard coded default!
123*/
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000124static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000125
/* Fast detection of the most frequent whitespace characters: one entry
   per ASCII code point (0x00-0x7F), non-zero meaning "whitespace". */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: CHARACTER TABULATION (0x09), LINE FEED (0x0A),
       LINE TABULATION (0x0B), FORM FEED (0x0C), CARRIAGE RETURN (0x0D) */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: FILE SEPARATOR (0x1C), GROUP SEPARATOR (0x1D),
       RECORD SEPARATOR (0x1E), UNIT SEPARATOR (0x1F) */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: SPACE (0x20) */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
156
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000157static PyObject *unicode_encode_call_errorhandler(const char *errors,
158 PyObject **errorHandler,const char *encoding, const char *reason,
159 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
160 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
161
Victor Stinner31be90b2010-04-22 19:38:16 +0000162static void raise_encode_exception(PyObject **exceptionObject,
163 const char *encoding,
164 const Py_UNICODE *unicode, Py_ssize_t size,
165 Py_ssize_t startpos, Py_ssize_t endpos,
166 const char *reason);
167
/* Fast detection of line break characters in the ASCII range: one entry
   per ASCII code point (0x00-0x7F), non-zero meaning "line break".
   The table is only ever read, so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: LINE FEED (0x0A), LINE TABULATION (0x0B),
       FORM FEED (0x0C), CARRIAGE RETURN (0x0D) */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: FILE SEPARATOR (0x1C), GROUP SEPARATOR (0x1D),
       RECORD SEPARATOR (0x1E) */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x2F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
195
196
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000197Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000198PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000199{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000200#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000201 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000202#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000203 /* This is actually an illegal character, so it should
204 not be passed to unichr. */
205 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000206#endif
207}
208
Thomas Wouters477c8d52006-05-27 19:21:47 +0000209/* --- Bloom Filters ----------------------------------------------------- */
210
211/* stuff to implement simple "bloom filters" for Unicode characters.
212 to keep things simple, we use a single bitmask, using the least 5
213 bits from each unicode characters as the bit index. */
214
215/* the linebreak mask is set up by Unicode_Init below */
216
Antoine Pitrouf068f942010-01-13 14:19:12 +0000217#if LONG_BIT >= 128
218#define BLOOM_WIDTH 128
219#elif LONG_BIT >= 64
220#define BLOOM_WIDTH 64
221#elif LONG_BIT >= 32
222#define BLOOM_WIDTH 32
223#else
224#error "LONG_BIT is smaller than 32"
225#endif
226
/* Integer type that holds a bloom bitmask. */
#define BLOOM_MASK unsigned long

/* Bloom mask consulted by BLOOM_LINEBREAK() for characters outside the
   ASCII range. */
static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD sets, and BLOOM tests, the mask bit selected by the low bits of
   ch.  Both arguments are fully parenthesized so that expression arguments
   (e.g. BLOOM(a | b, ch)) expand with the intended precedence. */
#define BLOOM_ADD(mask, ch) (((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
#define BLOOM(mask, ch)     (((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Non-zero if ch is a line break: ASCII characters are looked up in
   ascii_linebreak[]; others are pre-filtered through bloom_linebreak before
   Py_UNICODE_ISLINEBREAK() is consulted.  Note that ch is evaluated more
   than once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000237
238Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
239{
240 /* calculate simple bloom-style bitmask for a given unicode string */
241
Antoine Pitrouf068f942010-01-13 14:19:12 +0000242 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000243 Py_ssize_t i;
244
245 mask = 0;
246 for (i = 0; i < len; i++)
Antoine Pitrouf2c54842010-01-13 08:07:53 +0000247 BLOOM_ADD(mask, ptr[i]);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000248
249 return mask;
250}
251
252Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
253{
254 Py_ssize_t i;
255
256 for (i = 0; i < setlen; i++)
257 if (set[i] == chr)
258 return 1;
259
260 return 0;
261}
262
/* Non-zero if chr is a member of set[0..setlen-1]: the bloom mask is checked
   first so that the linear search only runs for plausible candidates.
   The expansion is wrapped in parentheses so it behaves as a single
   expression (e.g. under '!' or next to '||').  chr is evaluated twice. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                            \
    (BLOOM((mask), (chr)) && unicode_member((chr), (set), (setlen)))
265
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266/* --- Unicode Object ----------------------------------------------------- */
267
268static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000269int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +0000270 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000271{
272 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000273
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000274 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000275 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000276 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000278 /* Resizing shared object (unicode_empty or single character
279 objects) in-place is not allowed. Use PyUnicode_Resize()
280 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000281
Benjamin Peterson14339b62009-01-31 16:36:08 +0000282 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000283 (unicode->length == 1 &&
284 unicode->str[0] < 256U &&
285 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000287 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 return -1;
289 }
290
Thomas Wouters477c8d52006-05-27 19:21:47 +0000291 /* We allocate one more byte to make sure the string is Ux0000 terminated.
292 The overallocation is also used by fastsearch, which assumes that it's
293 safe to look at str[length] (without making any assumptions about what
294 it contains). */
295
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000297 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000298 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000300 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000301 PyErr_NoMemory();
302 return -1;
303 }
304 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000305 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000306
Benjamin Peterson29060642009-01-31 22:14:21 +0000307 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000308 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000309 if (unicode->defenc) {
310 Py_DECREF(unicode->defenc);
311 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000312 }
313 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000314
Guido van Rossumd57fd912000-03-10 22:53:23 +0000315 return 0;
316}
317
318/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000319 Ux0000 terminated; some code (e.g. new_identifier)
320 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000321
322 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000323 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000324
325*/
326
327static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000328PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000329{
330 register PyUnicodeObject *unicode;
331
Thomas Wouters477c8d52006-05-27 19:21:47 +0000332 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000333 if (length == 0 && unicode_empty != NULL) {
334 Py_INCREF(unicode_empty);
335 return unicode_empty;
336 }
337
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000338 /* Ensure we won't overflow the size. */
339 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
340 return (PyUnicodeObject *)PyErr_NoMemory();
341 }
342
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000344 if (free_list) {
345 unicode = free_list;
346 free_list = *(PyUnicodeObject **)unicode;
347 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000348 if (unicode->str) {
349 /* Keep-Alive optimization: we only upsize the buffer,
350 never downsize it. */
351 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000352 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000353 PyObject_DEL(unicode->str);
354 unicode->str = NULL;
355 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000356 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000357 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000358 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
359 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000360 }
361 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000362 }
363 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000364 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000365 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000366 if (unicode == NULL)
367 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000368 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
369 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000370 }
371
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000372 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000373 PyErr_NoMemory();
374 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000375 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000376 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000377 * the caller fails before initializing str -- unicode_resize()
378 * reads str[0], and the Keep-Alive optimization can keep memory
379 * allocated for str alive across a call to unicode_dealloc(unicode).
380 * We don't want unicode_resize to read uninitialized memory in
381 * that case.
382 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000383 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000384 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000385 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000386 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000387 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000388 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000390
Benjamin Peterson29060642009-01-31 22:14:21 +0000391 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000392 /* XXX UNREF/NEWREF interface should be more symmetrical */
393 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000394 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000395 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000396 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000397}
398
399static
Guido van Rossum9475a232001-10-05 20:51:39 +0000400void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000401{
Walter Dörwald16807132007-05-25 13:52:07 +0000402 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000403 case SSTATE_NOT_INTERNED:
404 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000405
Benjamin Peterson29060642009-01-31 22:14:21 +0000406 case SSTATE_INTERNED_MORTAL:
407 /* revive dead object temporarily for DelItem */
408 Py_REFCNT(unicode) = 3;
409 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
410 Py_FatalError(
411 "deletion of interned string failed");
412 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000413
Benjamin Peterson29060642009-01-31 22:14:21 +0000414 case SSTATE_INTERNED_IMMORTAL:
415 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000416
Benjamin Peterson29060642009-01-31 22:14:21 +0000417 default:
418 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000419 }
420
Guido van Rossum604ddf82001-12-06 20:03:56 +0000421 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000422 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000423 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000424 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
425 PyObject_DEL(unicode->str);
426 unicode->str = NULL;
427 unicode->length = 0;
428 }
429 if (unicode->defenc) {
430 Py_DECREF(unicode->defenc);
431 unicode->defenc = NULL;
432 }
433 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000434 *(PyUnicodeObject **)unicode = free_list;
435 free_list = unicode;
436 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000437 }
438 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000439 PyObject_DEL(unicode->str);
440 Py_XDECREF(unicode->defenc);
441 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000442 }
443}
444
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000445static
446int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000447{
448 register PyUnicodeObject *v;
449
450 /* Argument checks */
451 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000452 PyErr_BadInternalCall();
453 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000454 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000455 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000456 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000457 PyErr_BadInternalCall();
458 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000459 }
460
461 /* Resizing unicode_empty and single character objects is not
462 possible since these are being shared. We simply return a fresh
463 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000464 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000465 (v == unicode_empty || v->length == 1)) {
466 PyUnicodeObject *w = _PyUnicode_New(length);
467 if (w == NULL)
468 return -1;
469 Py_UNICODE_COPY(w->str, v->str,
470 length < v->length ? length : v->length);
471 Py_DECREF(*unicode);
472 *unicode = w;
473 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000474 }
475
476 /* Note that we don't have to modify *unicode for unshared Unicode
477 objects, since we can modify them in-place. */
478 return unicode_resize(v, length);
479}
480
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000481int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
482{
483 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
484}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000485
Guido van Rossumd57fd912000-03-10 22:53:23 +0000486PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson29060642009-01-31 22:14:21 +0000487 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000488{
489 PyUnicodeObject *unicode;
490
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000491 /* If the Unicode data is known at construction time, we can apply
492 some optimizations which share commonly used objects. */
493 if (u != NULL) {
494
Benjamin Peterson29060642009-01-31 22:14:21 +0000495 /* Optimization for empty strings */
496 if (size == 0 && unicode_empty != NULL) {
497 Py_INCREF(unicode_empty);
498 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000499 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000500
501 /* Single character Unicode objects in the Latin-1 range are
502 shared when using this constructor */
503 if (size == 1 && *u < 256) {
504 unicode = unicode_latin1[*u];
505 if (!unicode) {
506 unicode = _PyUnicode_New(1);
507 if (!unicode)
508 return NULL;
509 unicode->str[0] = *u;
510 unicode_latin1[*u] = unicode;
511 }
512 Py_INCREF(unicode);
513 return (PyObject *)unicode;
514 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000515 }
Tim Petersced69f82003-09-16 20:30:58 +0000516
Guido van Rossumd57fd912000-03-10 22:53:23 +0000517 unicode = _PyUnicode_New(size);
518 if (!unicode)
519 return NULL;
520
521 /* Copy the Unicode data into the new object */
522 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000523 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000524
525 return (PyObject *)unicode;
526}
527
Walter Dörwaldd2034312007-05-18 16:29:38 +0000528PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000529{
530 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000531
Benjamin Peterson14339b62009-01-31 16:36:08 +0000532 if (size < 0) {
533 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000534 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000535 return NULL;
536 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000537
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000538 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000539 some optimizations which share commonly used objects.
540 Also, this means the input must be UTF-8, so fall back to the
541 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000542 if (u != NULL) {
543
Benjamin Peterson29060642009-01-31 22:14:21 +0000544 /* Optimization for empty strings */
545 if (size == 0 && unicode_empty != NULL) {
546 Py_INCREF(unicode_empty);
547 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000548 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000549
550 /* Single characters are shared when using this constructor.
551 Restrict to ASCII, since the input must be UTF-8. */
552 if (size == 1 && Py_CHARMASK(*u) < 128) {
553 unicode = unicode_latin1[Py_CHARMASK(*u)];
554 if (!unicode) {
555 unicode = _PyUnicode_New(1);
556 if (!unicode)
557 return NULL;
558 unicode->str[0] = Py_CHARMASK(*u);
559 unicode_latin1[Py_CHARMASK(*u)] = unicode;
560 }
561 Py_INCREF(unicode);
562 return (PyObject *)unicode;
563 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000564
565 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000566 }
567
Walter Dörwald55507312007-05-18 13:12:10 +0000568 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000569 if (!unicode)
570 return NULL;
571
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000572 return (PyObject *)unicode;
573}
574
Walter Dörwaldd2034312007-05-18 16:29:38 +0000575PyObject *PyUnicode_FromString(const char *u)
576{
577 size_t size = strlen(u);
578 if (size > PY_SSIZE_T_MAX) {
579 PyErr_SetString(PyExc_OverflowError, "input too long");
580 return NULL;
581 }
582
583 return PyUnicode_FromStringAndSize(u, size);
584}
585
Guido van Rossumd57fd912000-03-10 22:53:23 +0000586#ifdef HAVE_WCHAR_H
587
Mark Dickinson081dfee2009-03-18 14:47:41 +0000588#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
589# define CONVERT_WCHAR_TO_SURROGATES
590#endif
591
592#ifdef CONVERT_WCHAR_TO_SURROGATES
593
594/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
595 to convert from UTF32 to UTF16. */
596
597PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
598 Py_ssize_t size)
599{
600 PyUnicodeObject *unicode;
601 register Py_ssize_t i;
602 Py_ssize_t alloc;
603 const wchar_t *orig_w;
604
605 if (w == NULL) {
606 if (size == 0)
607 return PyUnicode_FromStringAndSize(NULL, 0);
608 PyErr_BadInternalCall();
609 return NULL;
610 }
611
612 if (size == -1) {
613 size = wcslen(w);
614 }
615
616 alloc = size;
617 orig_w = w;
618 for (i = size; i > 0; i--) {
619 if (*w > 0xFFFF)
620 alloc++;
621 w++;
622 }
623 w = orig_w;
624 unicode = _PyUnicode_New(alloc);
625 if (!unicode)
626 return NULL;
627
628 /* Copy the wchar_t data into the new object */
629 {
630 register Py_UNICODE *u;
631 u = PyUnicode_AS_UNICODE(unicode);
632 for (i = size; i > 0; i--) {
633 if (*w > 0xFFFF) {
634 wchar_t ordinal = *w++;
635 ordinal -= 0x10000;
636 *u++ = 0xD800 | (ordinal >> 10);
637 *u++ = 0xDC00 | (ordinal & 0x3FF);
638 }
639 else
640 *u++ = *w++;
641 }
642 }
643 return (PyObject *)unicode;
644}
645
646#else
647
Guido van Rossumd57fd912000-03-10 22:53:23 +0000648PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson29060642009-01-31 22:14:21 +0000649 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000650{
651 PyUnicodeObject *unicode;
652
653 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000654 if (size == 0)
655 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000656 PyErr_BadInternalCall();
657 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000658 }
659
Martin v. Löwis790465f2008-04-05 20:41:37 +0000660 if (size == -1) {
661 size = wcslen(w);
662 }
663
Guido van Rossumd57fd912000-03-10 22:53:23 +0000664 unicode = _PyUnicode_New(size);
665 if (!unicode)
666 return NULL;
667
668 /* Copy the wchar_t data into the new object */
669#ifdef HAVE_USABLE_WCHAR_T
670 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000671#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000672 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000673 register Py_UNICODE *u;
674 register Py_ssize_t i;
675 u = PyUnicode_AS_UNICODE(unicode);
676 for (i = size; i > 0; i--)
677 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000678 }
679#endif
680
681 return (PyObject *)unicode;
682}
683
Mark Dickinson081dfee2009-03-18 14:47:41 +0000684#endif /* CONVERT_WCHAR_TO_SURROGATES */
685
686#undef CONVERT_WCHAR_TO_SURROGATES
687
Walter Dörwald346737f2007-05-31 10:44:43 +0000688static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000689makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
690 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +0000691{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000692 *fmt++ = '%';
693 if (width) {
694 if (zeropad)
695 *fmt++ = '0';
696 fmt += sprintf(fmt, "%d", width);
697 }
698 if (precision)
699 fmt += sprintf(fmt, ".%d", precision);
700 if (longflag)
701 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000702 else if (longlongflag) {
703 /* longlongflag should only ever be nonzero on machines with
704 HAVE_LONG_LONG defined */
705#ifdef HAVE_LONG_LONG
706 char *f = PY_FORMAT_LONG_LONG;
707 while (*f)
708 *fmt++ = *f++;
709#else
710 /* we shouldn't ever get here */
711 assert(0);
712 *fmt++ = 'l';
713#endif
714 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000715 else if (size_tflag) {
716 char *f = PY_FORMAT_SIZE_T;
717 while (*f)
718 *fmt++ = *f++;
719 }
720 *fmt++ = c;
721 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000722}
723
/* Copy the NUL-terminated char string `string` to the output cursor `s`,
   one output element per input char, advancing `s` past the copied data.
   Relies on variables named `copy` and `s` being in scope where the macro
   is used. */
#define appendstring(string)                    \
    do {                                        \
        for (copy = (string); *copy; )          \
            *s++ = *copy++;                     \
    } while (0)

/* size of the fixed-size buffer used to format a single argument */
#define ITEM_BUFFER_LEN 21
/* maximum number of characters required for the output of %ld: 21
   characters allow for a 64-bit integer (in decimal) and an optional
   sign. */
#define MAX_LONG_CHARS 21
/* maximum number of characters required for the output of %lld.
   At most ceil(log10(256)*SIZEOF_LONG_LONG) digits are needed, plus 1 for
   the sign; 53/22 is an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
735
Walter Dörwaldd2034312007-05-18 16:29:38 +0000736PyObject *
737PyUnicode_FromFormatV(const char *format, va_list vargs)
738{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000739 va_list count;
740 Py_ssize_t callcount = 0;
741 PyObject **callresults = NULL;
742 PyObject **callresult = NULL;
743 Py_ssize_t n = 0;
744 int width = 0;
745 int precision = 0;
746 int zeropad;
747 const char* f;
748 Py_UNICODE *s;
749 PyObject *string;
750 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000751 char buffer[ITEM_BUFFER_LEN+1];
Benjamin Peterson14339b62009-01-31 16:36:08 +0000752 /* use abuffer instead of buffer, if we need more space
753 * (which can happen if there's a format specifier with width). */
754 char *abuffer = NULL;
755 char *realbuffer;
756 Py_ssize_t abuffersize = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000757 char fmt[61]; /* should be enough for %0width.precisionlld */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000758 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000759
760#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson14339b62009-01-31 16:36:08 +0000761 Py_MEMCPY(count, vargs, sizeof(va_list));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000762#else
763#ifdef __va_copy
Benjamin Peterson14339b62009-01-31 16:36:08 +0000764 __va_copy(count, vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000765#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000766 count = vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000767#endif
768#endif
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000769 /* step 1: count the number of %S/%R/%A/%s format specifications
770 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
771 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
772 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000773 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000774 if (*f == '%') {
775 if (*(f+1)=='%')
776 continue;
777 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')
778 ++callcount;
779 while (ISDIGIT((unsigned)*f))
780 width = (width*10) + *f++ - '0';
781 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
782 ;
783 if (*f == 's')
784 ++callcount;
785 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000786 }
787 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000788 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000789 if (callcount) {
790 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
791 if (!callresults) {
792 PyErr_NoMemory();
793 return NULL;
794 }
795 callresult = callresults;
796 }
797 /* step 3: figure out how large a buffer we need */
798 for (f = format; *f; f++) {
799 if (*f == '%') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000800#ifdef HAVE_LONG_LONG
801 int longlongflag = 0;
802#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000803 const char* p = f;
804 width = 0;
805 while (ISDIGIT((unsigned)*f))
806 width = (width*10) + *f++ - '0';
807 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
808 ;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000809
Benjamin Peterson14339b62009-01-31 16:36:08 +0000810 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
811 * they don't affect the amount of space we reserve.
812 */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000813 if (*f == 'l') {
814 if (f[1] == 'd' || f[1] == 'u') {
815 ++f;
816 }
817#ifdef HAVE_LONG_LONG
818 else if (f[1] == 'l' &&
819 (f[2] == 'd' || f[2] == 'u')) {
820 longlongflag = 1;
821 f += 2;
822 }
823#endif
824 }
825 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000826 ++f;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000827 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000828
Benjamin Peterson14339b62009-01-31 16:36:08 +0000829 switch (*f) {
830 case 'c':
831 (void)va_arg(count, int);
832 /* fall through... */
833 case '%':
834 n++;
835 break;
836 case 'd': case 'u': case 'i': case 'x':
837 (void) va_arg(count, int);
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000838#ifdef HAVE_LONG_LONG
839 if (longlongflag) {
840 if (width < MAX_LONG_LONG_CHARS)
841 width = MAX_LONG_LONG_CHARS;
842 }
843 else
844#endif
845 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
846 including sign. Decimal takes the most space. This
847 isn't enough for octal. If a width is specified we
848 need more (which we allocate later). */
849 if (width < MAX_LONG_CHARS)
850 width = MAX_LONG_CHARS;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000851 n += width;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000852 /* XXX should allow for large precision here too. */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000853 if (abuffersize < width)
854 abuffersize = width;
855 break;
856 case 's':
857 {
858 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000859 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000860 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
861 if (!str)
862 goto fail;
863 n += PyUnicode_GET_SIZE(str);
864 /* Remember the str and switch to the next slot */
865 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000866 break;
867 }
868 case 'U':
869 {
870 PyObject *obj = va_arg(count, PyObject *);
871 assert(obj && PyUnicode_Check(obj));
872 n += PyUnicode_GET_SIZE(obj);
873 break;
874 }
875 case 'V':
876 {
877 PyObject *obj = va_arg(count, PyObject *);
878 const char *str = va_arg(count, const char *);
879 assert(obj || str);
880 assert(!obj || PyUnicode_Check(obj));
881 if (obj)
882 n += PyUnicode_GET_SIZE(obj);
883 else
884 n += strlen(str);
885 break;
886 }
887 case 'S':
888 {
889 PyObject *obj = va_arg(count, PyObject *);
890 PyObject *str;
891 assert(obj);
892 str = PyObject_Str(obj);
893 if (!str)
894 goto fail;
895 n += PyUnicode_GET_SIZE(str);
896 /* Remember the str and switch to the next slot */
897 *callresult++ = str;
898 break;
899 }
900 case 'R':
901 {
902 PyObject *obj = va_arg(count, PyObject *);
903 PyObject *repr;
904 assert(obj);
905 repr = PyObject_Repr(obj);
906 if (!repr)
907 goto fail;
908 n += PyUnicode_GET_SIZE(repr);
909 /* Remember the repr and switch to the next slot */
910 *callresult++ = repr;
911 break;
912 }
913 case 'A':
914 {
915 PyObject *obj = va_arg(count, PyObject *);
916 PyObject *ascii;
917 assert(obj);
918 ascii = PyObject_ASCII(obj);
919 if (!ascii)
920 goto fail;
921 n += PyUnicode_GET_SIZE(ascii);
922 /* Remember the repr and switch to the next slot */
923 *callresult++ = ascii;
924 break;
925 }
926 case 'p':
927 (void) va_arg(count, int);
928 /* maximum 64-bit pointer representation:
929 * 0xffffffffffffffff
930 * so 19 characters is enough.
931 * XXX I count 18 -- what's the extra for?
932 */
933 n += 19;
934 break;
935 default:
936 /* if we stumble upon an unknown
937 formatting code, copy the rest of
938 the format string to the output
939 string. (we cannot just skip the
940 code, since there's no way to know
941 what's in the argument list) */
942 n += strlen(p);
943 goto expand;
944 }
945 } else
946 n++;
947 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000948 expand:
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000949 if (abuffersize > ITEM_BUFFER_LEN) {
950 /* add 1 for sprintf's trailing null byte */
951 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +0000952 if (!abuffer) {
953 PyErr_NoMemory();
954 goto fail;
955 }
956 realbuffer = abuffer;
957 }
958 else
959 realbuffer = buffer;
960 /* step 4: fill the buffer */
961 /* Since we've analyzed how much space we need for the worst case,
962 we don't have to resize the string.
963 There can be no errors beyond this point. */
964 string = PyUnicode_FromUnicode(NULL, n);
965 if (!string)
966 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000967
Benjamin Peterson14339b62009-01-31 16:36:08 +0000968 s = PyUnicode_AS_UNICODE(string);
969 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000970
Benjamin Peterson14339b62009-01-31 16:36:08 +0000971 for (f = format; *f; f++) {
972 if (*f == '%') {
973 const char* p = f++;
974 int longflag = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000975 int longlongflag = 0;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000976 int size_tflag = 0;
977 zeropad = (*f == '0');
978 /* parse the width.precision part */
979 width = 0;
980 while (ISDIGIT((unsigned)*f))
981 width = (width*10) + *f++ - '0';
982 precision = 0;
983 if (*f == '.') {
984 f++;
985 while (ISDIGIT((unsigned)*f))
986 precision = (precision*10) + *f++ - '0';
987 }
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000988 /* Handle %ld, %lu, %lld and %llu. */
989 if (*f == 'l') {
990 if (f[1] == 'd' || f[1] == 'u') {
991 longflag = 1;
992 ++f;
993 }
994#ifdef HAVE_LONG_LONG
995 else if (f[1] == 'l' &&
996 (f[2] == 'd' || f[2] == 'u')) {
997 longlongflag = 1;
998 f += 2;
999 }
1000#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001001 }
1002 /* handle the size_t flag. */
1003 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
1004 size_tflag = 1;
1005 ++f;
1006 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001007
Benjamin Peterson14339b62009-01-31 16:36:08 +00001008 switch (*f) {
1009 case 'c':
1010 *s++ = va_arg(vargs, int);
1011 break;
1012 case 'd':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001013 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1014 width, precision, 'd');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001015 if (longflag)
1016 sprintf(realbuffer, fmt, va_arg(vargs, long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001017#ifdef HAVE_LONG_LONG
1018 else if (longlongflag)
1019 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1020#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001021 else if (size_tflag)
1022 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1023 else
1024 sprintf(realbuffer, fmt, va_arg(vargs, int));
1025 appendstring(realbuffer);
1026 break;
1027 case 'u':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001028 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1029 width, precision, 'u');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001030 if (longflag)
1031 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001032#ifdef HAVE_LONG_LONG
1033 else if (longlongflag)
1034 sprintf(realbuffer, fmt, va_arg(vargs,
1035 unsigned PY_LONG_LONG));
1036#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001037 else if (size_tflag)
1038 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1039 else
1040 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1041 appendstring(realbuffer);
1042 break;
1043 case 'i':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001044 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001045 sprintf(realbuffer, fmt, va_arg(vargs, int));
1046 appendstring(realbuffer);
1047 break;
1048 case 'x':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001049 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001050 sprintf(realbuffer, fmt, va_arg(vargs, int));
1051 appendstring(realbuffer);
1052 break;
1053 case 's':
1054 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001055 /* unused, since we already have the result */
1056 (void) va_arg(vargs, char *);
1057 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1058 PyUnicode_GET_SIZE(*callresult));
1059 s += PyUnicode_GET_SIZE(*callresult);
1060 /* We're done with the unicode()/repr() => forget it */
1061 Py_DECREF(*callresult);
1062 /* switch to next unicode()/repr() result */
1063 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001064 break;
1065 }
1066 case 'U':
1067 {
1068 PyObject *obj = va_arg(vargs, PyObject *);
1069 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1070 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1071 s += size;
1072 break;
1073 }
1074 case 'V':
1075 {
1076 PyObject *obj = va_arg(vargs, PyObject *);
1077 const char *str = va_arg(vargs, const char *);
1078 if (obj) {
1079 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1080 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1081 s += size;
1082 } else {
1083 appendstring(str);
1084 }
1085 break;
1086 }
1087 case 'S':
1088 case 'R':
1089 {
1090 Py_UNICODE *ucopy;
1091 Py_ssize_t usize;
1092 Py_ssize_t upos;
1093 /* unused, since we already have the result */
1094 (void) va_arg(vargs, PyObject *);
1095 ucopy = PyUnicode_AS_UNICODE(*callresult);
1096 usize = PyUnicode_GET_SIZE(*callresult);
1097 for (upos = 0; upos<usize;)
1098 *s++ = ucopy[upos++];
1099 /* We're done with the unicode()/repr() => forget it */
1100 Py_DECREF(*callresult);
1101 /* switch to next unicode()/repr() result */
1102 ++callresult;
1103 break;
1104 }
1105 case 'p':
1106 sprintf(buffer, "%p", va_arg(vargs, void*));
1107 /* %p is ill-defined: ensure leading 0x. */
1108 if (buffer[1] == 'X')
1109 buffer[1] = 'x';
1110 else if (buffer[1] != 'x') {
1111 memmove(buffer+2, buffer, strlen(buffer)+1);
1112 buffer[0] = '0';
1113 buffer[1] = 'x';
1114 }
1115 appendstring(buffer);
1116 break;
1117 case '%':
1118 *s++ = '%';
1119 break;
1120 default:
1121 appendstring(p);
1122 goto end;
1123 }
1124 } else
1125 *s++ = *f;
1126 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001127
Benjamin Peterson29060642009-01-31 22:14:21 +00001128 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001129 if (callresults)
1130 PyObject_Free(callresults);
1131 if (abuffer)
1132 PyObject_Free(abuffer);
1133 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1134 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001135 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001136 if (callresults) {
1137 PyObject **callresult2 = callresults;
1138 while (callresult2 < callresult) {
1139 Py_DECREF(*callresult2);
1140 ++callresult2;
1141 }
1142 PyObject_Free(callresults);
1143 }
1144 if (abuffer)
1145 PyObject_Free(abuffer);
1146 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001147}
1148
1149#undef appendstring
1150
1151PyObject *
1152PyUnicode_FromFormat(const char *format, ...)
1153{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001154 PyObject* ret;
1155 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001156
1157#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001158 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001159#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001160 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001161#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001162 ret = PyUnicode_FromFormatV(format, vargs);
1163 va_end(vargs);
1164 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001165}
1166
Martin v. Löwis18e16552006-02-15 17:27:45 +00001167Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001168 wchar_t *w,
1169 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001170{
1171 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001172 PyErr_BadInternalCall();
1173 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001174 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001175
1176 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001177 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00001178 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001179
Guido van Rossumd57fd912000-03-10 22:53:23 +00001180#ifdef HAVE_USABLE_WCHAR_T
1181 memcpy(w, unicode->str, size * sizeof(wchar_t));
1182#else
1183 {
Benjamin Peterson29060642009-01-31 22:14:21 +00001184 register Py_UNICODE *u;
1185 register Py_ssize_t i;
1186 u = PyUnicode_AS_UNICODE(unicode);
1187 for (i = size; i > 0; i--)
1188 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001189 }
1190#endif
1191
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001192 if (size > PyUnicode_GET_SIZE(unicode))
1193 return PyUnicode_GET_SIZE(unicode);
1194 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001195 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001196}
1197
1198#endif
1199
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001200PyObject *PyUnicode_FromOrdinal(int ordinal)
1201{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001202 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001203
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001204 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001205 PyErr_SetString(PyExc_ValueError,
1206 "chr() arg not in range(0x110000)");
1207 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001208 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001209
1210#ifndef Py_UNICODE_WIDE
1211 if (ordinal > 0xffff) {
1212 ordinal -= 0x10000;
1213 s[0] = 0xD800 | (ordinal >> 10);
1214 s[1] = 0xDC00 | (ordinal & 0x3FF);
1215 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001216 }
1217#endif
1218
Hye-Shik Chang40574832004-04-06 07:24:51 +00001219 s[0] = (Py_UNICODE)ordinal;
1220 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001221}
1222
Guido van Rossumd57fd912000-03-10 22:53:23 +00001223PyObject *PyUnicode_FromObject(register PyObject *obj)
1224{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001225 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001226 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001227 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001228 Py_INCREF(obj);
1229 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001230 }
1231 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001232 /* For a Unicode subtype that's not a Unicode object,
1233 return a true Unicode object with the same data. */
1234 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1235 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001236 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001237 PyErr_Format(PyExc_TypeError,
1238 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001239 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001240 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001241}
1242
1243PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00001244 const char *encoding,
1245 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001246{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001247 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001248 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001249 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001250
Guido van Rossumd57fd912000-03-10 22:53:23 +00001251 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001252 PyErr_BadInternalCall();
1253 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001254 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001255
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001256 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001257 PyErr_SetString(PyExc_TypeError,
1258 "decoding str is not supported");
1259 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001260 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001261
1262 /* Coerce object */
Christian Heimes72b710a2008-05-26 13:28:38 +00001263 if (PyBytes_Check(obj)) {
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001264 s = PyBytes_AS_STRING(obj);
1265 len = PyBytes_GET_SIZE(obj);
1266 }
1267 else if (PyByteArray_Check(obj)) {
1268 s = PyByteArray_AS_STRING(obj);
1269 len = PyByteArray_GET_SIZE(obj);
1270 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001271 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001272 /* Overwrite the error message with something more useful in
1273 case of a TypeError. */
1274 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001275 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001276 "coercing to str: need string or buffer, "
1277 "%.80s found",
1278 Py_TYPE(obj)->tp_name);
1279 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001280 }
Tim Petersced69f82003-09-16 20:30:58 +00001281
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001282 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001283 if (len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001284 Py_INCREF(unicode_empty);
1285 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001286 }
Tim Petersced69f82003-09-16 20:30:58 +00001287 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001288 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001289
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001290 return v;
1291
Benjamin Peterson29060642009-01-31 22:14:21 +00001292 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001293 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001294}
1295
1296PyObject *PyUnicode_Decode(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001297 Py_ssize_t size,
1298 const char *encoding,
1299 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001300{
1301 PyObject *buffer = NULL, *unicode;
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001302 Py_buffer info;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001303 char lower[20]; /* Enough for any encoding name we recognize */
1304 char *l;
1305 const char *e;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001306
1307 if (encoding == NULL)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001308 encoding = PyUnicode_GetDefaultEncoding();
1309
1310 /* Convert encoding to lower case and replace '_' with '-' in order to
1311 catch e.g. UTF_8 */
1312 e = encoding;
1313 l = lower;
1314 while (*e && l < &lower[(sizeof lower) - 2]) {
1315 if (ISUPPER(*e)) {
1316 *l++ = TOLOWER(*e++);
1317 }
1318 else if (*e == '_') {
1319 *l++ = '-';
1320 e++;
1321 }
1322 else {
1323 *l++ = *e++;
1324 }
1325 }
1326 *l = '\0';
Fred Drakee4315f52000-05-09 19:53:39 +00001327
1328 /* Shortcuts for common default encodings */
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001329 if (strcmp(lower, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001330 return PyUnicode_DecodeUTF8(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001331 else if ((strcmp(lower, "latin-1") == 0) ||
1332 (strcmp(lower, "iso-8859-1") == 0))
Fred Drakee4315f52000-05-09 19:53:39 +00001333 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001334#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001335 else if (strcmp(lower, "mbcs") == 0)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001336 return PyUnicode_DecodeMBCS(s, size, errors);
1337#endif
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001338 else if (strcmp(lower, "ascii") == 0)
Fred Drakee4315f52000-05-09 19:53:39 +00001339 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001340 else if (strcmp(lower, "utf-16") == 0)
1341 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1342 else if (strcmp(lower, "utf-32") == 0)
1343 return PyUnicode_DecodeUTF32(s, size, errors, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001344
1345 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001346 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001347 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001348 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001349 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001350 if (buffer == NULL)
1351 goto onError;
1352 unicode = PyCodec_Decode(buffer, encoding, errors);
1353 if (unicode == NULL)
1354 goto onError;
1355 if (!PyUnicode_Check(unicode)) {
1356 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001357 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001358 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001359 Py_DECREF(unicode);
1360 goto onError;
1361 }
1362 Py_DECREF(buffer);
1363 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001364
Benjamin Peterson29060642009-01-31 22:14:21 +00001365 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001366 Py_XDECREF(buffer);
1367 return NULL;
1368}
1369
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001370PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1371 const char *encoding,
1372 const char *errors)
1373{
1374 PyObject *v;
1375
1376 if (!PyUnicode_Check(unicode)) {
1377 PyErr_BadArgument();
1378 goto onError;
1379 }
1380
1381 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001382 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001383
1384 /* Decode via the codec registry */
1385 v = PyCodec_Decode(unicode, encoding, errors);
1386 if (v == NULL)
1387 goto onError;
1388 return v;
1389
Benjamin Peterson29060642009-01-31 22:14:21 +00001390 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001391 return NULL;
1392}
1393
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001394PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1395 const char *encoding,
1396 const char *errors)
1397{
1398 PyObject *v;
1399
1400 if (!PyUnicode_Check(unicode)) {
1401 PyErr_BadArgument();
1402 goto onError;
1403 }
1404
1405 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001406 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001407
1408 /* Decode via the codec registry */
1409 v = PyCodec_Decode(unicode, encoding, errors);
1410 if (v == NULL)
1411 goto onError;
1412 if (!PyUnicode_Check(v)) {
1413 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001414 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001415 Py_TYPE(v)->tp_name);
1416 Py_DECREF(v);
1417 goto onError;
1418 }
1419 return v;
1420
Benjamin Peterson29060642009-01-31 22:14:21 +00001421 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001422 return NULL;
1423}
1424
Guido van Rossumd57fd912000-03-10 22:53:23 +00001425PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001426 Py_ssize_t size,
1427 const char *encoding,
1428 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001429{
1430 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001431
Guido van Rossumd57fd912000-03-10 22:53:23 +00001432 unicode = PyUnicode_FromUnicode(s, size);
1433 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001434 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001435 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1436 Py_DECREF(unicode);
1437 return v;
1438}
1439
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001440PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1441 const char *encoding,
1442 const char *errors)
1443{
1444 PyObject *v;
1445
1446 if (!PyUnicode_Check(unicode)) {
1447 PyErr_BadArgument();
1448 goto onError;
1449 }
1450
1451 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001452 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001453
1454 /* Encode via the codec registry */
1455 v = PyCodec_Encode(unicode, encoding, errors);
1456 if (v == NULL)
1457 goto onError;
1458 return v;
1459
Benjamin Peterson29060642009-01-31 22:14:21 +00001460 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001461 return NULL;
1462}
1463
Guido van Rossumd57fd912000-03-10 22:53:23 +00001464PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1465 const char *encoding,
1466 const char *errors)
1467{
1468 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001469
Guido van Rossumd57fd912000-03-10 22:53:23 +00001470 if (!PyUnicode_Check(unicode)) {
1471 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001472 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001473 }
Fred Drakee4315f52000-05-09 19:53:39 +00001474
Tim Petersced69f82003-09-16 20:30:58 +00001475 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001476 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001477
1478 /* Shortcuts for common default encodings */
1479 if (errors == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001480 if (strcmp(encoding, "utf-8") == 0)
1481 return PyUnicode_AsUTF8String(unicode);
1482 else if (strcmp(encoding, "latin-1") == 0)
1483 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001484#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Benjamin Peterson29060642009-01-31 22:14:21 +00001485 else if (strcmp(encoding, "mbcs") == 0)
1486 return PyUnicode_AsMBCSString(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001487#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00001488 else if (strcmp(encoding, "ascii") == 0)
1489 return PyUnicode_AsASCIIString(unicode);
Christian Heimes6a27efa2008-10-30 21:48:26 +00001490 /* During bootstrap, we may need to find the encodings
1491 package, to load the file system encoding, and require the
1492 file system encoding in order to load the encodings
1493 package.
1494
1495 Break out of this dependency by assuming that the path to
1496 the encodings module is ASCII-only. XXX could try wcstombs
1497 instead, if the file system encoding is the locale's
1498 encoding. */
1499 else if (Py_FileSystemDefaultEncoding &&
1500 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1501 !PyThreadState_GET()->interp->codecs_initialized)
Benjamin Peterson29060642009-01-31 22:14:21 +00001502 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001503 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001504
1505 /* Encode via the codec registry */
1506 v = PyCodec_Encode(unicode, encoding, errors);
1507 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001508 return NULL;
1509
1510 /* The normal path */
1511 if (PyBytes_Check(v))
1512 return v;
1513
1514 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001515 if (PyByteArray_Check(v)) {
1516 char msg[100];
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001517 PyObject *b;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001518 PyOS_snprintf(msg, sizeof(msg),
1519 "encoder %s returned buffer instead of bytes",
1520 encoding);
1521 if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001522 Py_DECREF(v);
1523 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001524 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001525
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001526 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1527 Py_DECREF(v);
1528 return b;
1529 }
1530
1531 PyErr_Format(PyExc_TypeError,
1532 "encoder did not return a bytes object (type=%.400s)",
1533 Py_TYPE(v)->tp_name);
1534 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001535 return NULL;
1536}
1537
1538PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1539 const char *encoding,
1540 const char *errors)
1541{
1542 PyObject *v;
1543
1544 if (!PyUnicode_Check(unicode)) {
1545 PyErr_BadArgument();
1546 goto onError;
1547 }
1548
1549 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001550 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001551
1552 /* Encode via the codec registry */
1553 v = PyCodec_Encode(unicode, encoding, errors);
1554 if (v == NULL)
1555 goto onError;
1556 if (!PyUnicode_Check(v)) {
1557 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001558 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001559 Py_TYPE(v)->tp_name);
1560 Py_DECREF(v);
1561 goto onError;
1562 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001563 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001564
Benjamin Peterson29060642009-01-31 22:14:21 +00001565 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001566 return NULL;
1567}
1568
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001569PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001570 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001571{
1572 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001573 if (v)
1574 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001575 if (errors != NULL)
1576 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001577 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001578 PyUnicode_GET_SIZE(unicode),
1579 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001580 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001581 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001582 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001583 return v;
1584}
1585
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001586PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001587PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001588 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001589 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1590}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001591
Christian Heimes5894ba72007-11-04 11:43:14 +00001592PyObject*
1593PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1594{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001595 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1596 can be undefined. If it is case, decode using UTF-8. The following assumes
1597 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1598 bootstrapping process where the codecs aren't ready yet.
1599 */
1600 if (Py_FileSystemDefaultEncoding) {
1601#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001602 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001603 return PyUnicode_DecodeMBCS(s, size, "replace");
1604 }
1605#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001606 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001607 return PyUnicode_DecodeUTF8(s, size, "replace");
1608 }
1609#endif
1610 return PyUnicode_Decode(s, size,
1611 Py_FileSystemDefaultEncoding,
1612 "replace");
1613 }
1614 else {
1615 return PyUnicode_DecodeUTF8(s, size, "replace");
1616 }
1617}
1618
Martin v. Löwis011e8422009-05-05 04:43:17 +00001619/* Convert the argument to a bytes object, according to the file
Gregory P. Smithcc47d8c2010-02-27 08:33:11 +00001620 system encoding. The addr param must be a PyObject**.
1621 This is designed to be used with "O&" in PyArg_Parse APIs. */
Martin v. Löwis011e8422009-05-05 04:43:17 +00001622
1623int
1624PyUnicode_FSConverter(PyObject* arg, void* addr)
1625{
1626 PyObject *output = NULL;
1627 Py_ssize_t size;
1628 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001629 if (arg == NULL) {
1630 Py_DECREF(*(PyObject**)addr);
1631 return 1;
1632 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00001633 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00001634 output = arg;
1635 Py_INCREF(output);
1636 }
1637 else {
1638 arg = PyUnicode_FromObject(arg);
1639 if (!arg)
1640 return 0;
1641 output = PyUnicode_AsEncodedObject(arg,
1642 Py_FileSystemDefaultEncoding,
Martin v. Löwis43c57782009-05-10 08:15:24 +00001643 "surrogateescape");
Martin v. Löwis011e8422009-05-05 04:43:17 +00001644 Py_DECREF(arg);
1645 if (!output)
1646 return 0;
1647 if (!PyBytes_Check(output)) {
1648 Py_DECREF(output);
1649 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1650 return 0;
1651 }
1652 }
1653 if (PyBytes_Check(output)) {
1654 size = PyBytes_GET_SIZE(output);
1655 data = PyBytes_AS_STRING(output);
1656 }
1657 else {
1658 size = PyByteArray_GET_SIZE(output);
1659 data = PyByteArray_AS_STRING(output);
1660 }
1661 if (size != strlen(data)) {
1662 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1663 Py_DECREF(output);
1664 return 0;
1665 }
1666 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001667 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001668}
1669
1670
Martin v. Löwis5b222132007-06-10 09:51:05 +00001671char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001672_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001673{
Christian Heimesf3863112007-11-22 07:46:41 +00001674 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001675 if (!PyUnicode_Check(unicode)) {
1676 PyErr_BadArgument();
1677 return NULL;
1678 }
Christian Heimesf3863112007-11-22 07:46:41 +00001679 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1680 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001681 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001682 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001683 *psize = PyBytes_GET_SIZE(bytes);
1684 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001685}
1686
1687char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001688_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001689{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001690 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001691}
1692
Guido van Rossumd57fd912000-03-10 22:53:23 +00001693Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1694{
1695 if (!PyUnicode_Check(unicode)) {
1696 PyErr_BadArgument();
1697 goto onError;
1698 }
1699 return PyUnicode_AS_UNICODE(unicode);
1700
Benjamin Peterson29060642009-01-31 22:14:21 +00001701 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001702 return NULL;
1703}
1704
Martin v. Löwis18e16552006-02-15 17:27:45 +00001705Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001706{
1707 if (!PyUnicode_Check(unicode)) {
1708 PyErr_BadArgument();
1709 goto onError;
1710 }
1711 return PyUnicode_GET_SIZE(unicode);
1712
Benjamin Peterson29060642009-01-31 22:14:21 +00001713 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001714 return -1;
1715}
1716
Thomas Wouters78890102000-07-22 19:25:51 +00001717const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001718{
1719 return unicode_default_encoding;
1720}
1721
1722int PyUnicode_SetDefaultEncoding(const char *encoding)
1723{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001724 if (strcmp(encoding, unicode_default_encoding) != 0) {
1725 PyErr_Format(PyExc_ValueError,
1726 "Can only set default encoding to %s",
1727 unicode_default_encoding);
1728 return -1;
1729 }
Fred Drakee4315f52000-05-09 19:53:39 +00001730 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001731}
1732
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001733/* error handling callback helper:
1734 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001735 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001736 and adjust various state variables.
1737 return 0 on success, -1 on error
1738*/
1739
1740static
1741int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00001742 const char *encoding, const char *reason,
1743 const char **input, const char **inend, Py_ssize_t *startinpos,
1744 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1745 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001746{
Benjamin Peterson142957c2008-07-04 19:55:29 +00001747 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001748
1749 PyObject *restuple = NULL;
1750 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001751 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001752 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001753 Py_ssize_t requiredsize;
1754 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001755 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001756 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001757 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001758 int res = -1;
1759
1760 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001761 *errorHandler = PyCodec_LookupError(errors);
1762 if (*errorHandler == NULL)
1763 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001764 }
1765
1766 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001767 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00001768 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
1769 if (*exceptionObject == NULL)
1770 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001771 }
1772 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00001773 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1774 goto onError;
1775 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1776 goto onError;
1777 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1778 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001779 }
1780
1781 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1782 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001783 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001784 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00001785 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00001786 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001787 }
1788 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00001789 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001790
1791 /* Copy back the bytes variables, which might have been modified by the
1792 callback */
1793 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1794 if (!inputobj)
1795 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00001796 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001797 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00001798 }
Christian Heimes72b710a2008-05-26 13:28:38 +00001799 *input = PyBytes_AS_STRING(inputobj);
1800 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001801 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001802 /* we can DECREF safely, as the exception has another reference,
1803 so the object won't go away. */
1804 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001805
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001806 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001807 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001808 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001809 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1810 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001811 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001812
1813 /* need more space? (at least enough for what we
1814 have+the replacement+the rest of the string (starting
1815 at the new input position), so we won't have to check space
1816 when there are no errors in the rest of the string) */
1817 repptr = PyUnicode_AS_UNICODE(repunicode);
1818 repsize = PyUnicode_GET_SIZE(repunicode);
1819 requiredsize = *outpos + repsize + insize-newpos;
1820 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001821 if (requiredsize<2*outsize)
1822 requiredsize = 2*outsize;
1823 if (_PyUnicode_Resize(output, requiredsize) < 0)
1824 goto onError;
1825 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001826 }
1827 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001828 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001829 Py_UNICODE_COPY(*outptr, repptr, repsize);
1830 *outptr += repsize;
1831 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001832
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001833 /* we made it! */
1834 res = 0;
1835
Benjamin Peterson29060642009-01-31 22:14:21 +00001836 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001837 Py_XDECREF(restuple);
1838 return res;
1839}
1840
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001841/* --- UTF-7 Codec -------------------------------------------------------- */
1842
Antoine Pitrou244651a2009-05-04 18:56:13 +00001843/* See RFC2152 for details. We encode conservatively and decode liberally. */
1844
1845/* Three simple macros defining base-64. */
1846
1847/* Is c a base-64 character? */
1848
1849#define IS_BASE64(c) \
1850 (((c) >= 'A' && (c) <= 'Z') || \
1851 ((c) >= 'a' && (c) <= 'z') || \
1852 ((c) >= '0' && (c) <= '9') || \
1853 (c) == '+' || (c) == '/')
1854
1855/* given that c is a base-64 character, what is its base-64 value? */
1856
1857#define FROM_BASE64(c) \
1858 (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \
1859 ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \
1860 ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \
1861 (c) == '+' ? 62 : 63)
1862
1863/* What is the base-64 character of the bottom 6 bits of n? */
1864
1865#define TO_BASE64(n) \
1866 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
1867
1868/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
1869 * decoded as itself. We are permissive on decoding; the only ASCII
1870 * byte not decoding to itself is the + which begins a base64
1871 * string. */
1872
1873#define DECODE_DIRECT(c) \
1874 ((c) <= 127 && (c) != '+')
1875
1876/* The UTF-7 encoder treats ASCII characters differently according to
1877 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
1878 * the above). See RFC2152. This array identifies these different
1879 * sets:
1880 * 0 : "Set D"
1881 * alphanumeric and '(),-./:?
1882 * 1 : "Set O"
1883 * !"#$%&*;<=>@[]^_`{|}
1884 * 2 : "whitespace"
1885 * ht nl cr sp
1886 * 3 : special (must be base64 encoded)
1887 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
1888 */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001889
/* Lookup table indexed by ASCII code (0..127), 16 entries per row; the
   comment above each row names the characters that row covers.  Each
   entry is one of the category codes 0..3 described above. */
Tim Petersced69f82003-09-16 20:30:58 +00001890static
Antoine Pitrou244651a2009-05-04 18:56:13 +00001891char utf7_category[128] = {
1892/* nul soh stx etx eot enq ack bel bs ht nl vt np cr so si */
1893 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
1894/* dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us */
1895 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
1896/* sp ! " # $ % & ' ( ) * + , - . / */
1897 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0,
1898/* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */
1899 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
1900/* @ A B C D E F G H I J K L M N O */
1901 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1902/* P Q R S T U V W X Y Z [ \ ] ^ _ */
1903 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
1904/* ` a b c d e f g h i j k l m n o */
1905 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1906/* p q r s t u v w x y z { | } ~ del */
1907 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001908};
1909
Antoine Pitrou244651a2009-05-04 18:56:13 +00001910/* ENCODE_DIRECT: this character should be encoded as itself. The
1911 * answer depends on whether we are encoding set O as itself, and also
1912 * on whether we are encoding whitespace as itself. RFC2152 makes it
1913 * clear that the answers to these questions vary between
1914 * applications, so this code needs to be flexible. */
Marc-André Lemburge115ec82005-10-19 22:33:31 +00001915
Antoine Pitrou244651a2009-05-04 18:56:13 +00001916#define ENCODE_DIRECT(c, directO, directWS) \
1917 ((c) < 128 && (c) > 0 && \
1918 ((utf7_category[(c)] == 0) || \
1919 (directWS && (utf7_category[(c)] == 2)) || \
1920 (directO && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001921
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001922PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001923 Py_ssize_t size,
1924 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001925{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001926 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1927}
1928
Antoine Pitrou244651a2009-05-04 18:56:13 +00001929/* The decoder. The only state we preserve is our read position,
1930 * i.e. how many characters we have consumed. So if we end in the
1931 * middle of a shift sequence we have to back off the read position
1932 * and the output to the beginning of the sequence, otherwise we lose
1933 * all the shift state (seen bits, number of bits seen, high
1934 * surrogate). */
1935
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001936PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001937 Py_ssize_t size,
1938 const char *errors,
1939 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001940{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001941 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001942 Py_ssize_t startinpos;
1943 Py_ssize_t endinpos;
1944 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001945 const char *e;
1946 PyUnicodeObject *unicode;
1947 Py_UNICODE *p;
1948 const char *errmsg = "";
1949 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001950 Py_UNICODE *shiftOutStart;
1951 unsigned int base64bits = 0;
1952 unsigned long base64buffer = 0;
1953 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001954 PyObject *errorHandler = NULL;
1955 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001956
1957 unicode = _PyUnicode_New(size);
1958 if (!unicode)
1959 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001960 if (size == 0) {
1961 if (consumed)
1962 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001963 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001964 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001965
1966 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001967 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001968 e = s + size;
1969
1970 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001971 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00001972 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00001973 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001974
Antoine Pitrou244651a2009-05-04 18:56:13 +00001975 if (inShift) { /* in a base-64 section */
1976 if (IS_BASE64(ch)) { /* consume a base-64 character */
1977 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1978 base64bits += 6;
1979 s++;
1980 if (base64bits >= 16) {
1981 /* we have enough bits for a UTF-16 value */
1982 Py_UNICODE outCh = (Py_UNICODE)
1983 (base64buffer >> (base64bits-16));
1984 base64bits -= 16;
1985 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
1986 if (surrogate) {
1987 /* expecting a second surrogate */
1988 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1989#ifdef Py_UNICODE_WIDE
1990 *p++ = (((surrogate & 0x3FF)<<10)
1991 | (outCh & 0x3FF)) + 0x10000;
1992#else
1993 *p++ = surrogate;
1994 *p++ = outCh;
1995#endif
1996 surrogate = 0;
1997 }
1998 else {
1999 surrogate = 0;
2000 errmsg = "second surrogate missing";
2001 goto utf7Error;
2002 }
2003 }
2004 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
2005 /* first surrogate */
2006 surrogate = outCh;
2007 }
2008 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2009 errmsg = "unexpected second surrogate";
2010 goto utf7Error;
2011 }
2012 else {
2013 *p++ = outCh;
2014 }
2015 }
2016 }
2017 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002018 inShift = 0;
2019 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002020 if (surrogate) {
2021 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00002022 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002023 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002024 if (base64bits > 0) { /* left-over bits */
2025 if (base64bits >= 6) {
2026 /* We've seen at least one base-64 character */
2027 errmsg = "partial character in shift sequence";
2028 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002029 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002030 else {
2031 /* Some bits remain; they should be zero */
2032 if (base64buffer != 0) {
2033 errmsg = "non-zero padding bits in shift sequence";
2034 goto utf7Error;
2035 }
2036 }
2037 }
2038 if (ch != '-') {
2039 /* '-' is absorbed; other terminating
2040 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002041 *p++ = ch;
2042 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002043 }
2044 }
2045 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002046 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002047 s++; /* consume '+' */
2048 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002049 s++;
2050 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00002051 }
2052 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002053 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002054 shiftOutStart = p;
2055 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002056 }
2057 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002058 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002059 *p++ = ch;
2060 s++;
2061 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002062 else {
2063 startinpos = s-starts;
2064 s++;
2065 errmsg = "unexpected special character";
2066 goto utf7Error;
2067 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002068 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002069utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002070 outpos = p-PyUnicode_AS_UNICODE(unicode);
2071 endinpos = s-starts;
2072 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00002073 errors, &errorHandler,
2074 "utf7", errmsg,
2075 &starts, &e, &startinpos, &endinpos, &exc, &s,
2076 &unicode, &outpos, &p))
2077 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002078 }
2079
Antoine Pitrou244651a2009-05-04 18:56:13 +00002080 /* end of string */
2081
2082 if (inShift && !consumed) { /* in shift sequence, no more to follow */
2083 /* if we're in an inconsistent state, that's an error */
2084 if (surrogate ||
2085 (base64bits >= 6) ||
2086 (base64bits > 0 && base64buffer != 0)) {
2087 outpos = p-PyUnicode_AS_UNICODE(unicode);
2088 endinpos = size;
2089 if (unicode_decode_call_errorhandler(
2090 errors, &errorHandler,
2091 "utf7", "unterminated shift sequence",
2092 &starts, &e, &startinpos, &endinpos, &exc, &s,
2093 &unicode, &outpos, &p))
2094 goto onError;
2095 if (s < e)
2096 goto restart;
2097 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002098 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002099
2100 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002101 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002102 if (inShift) {
2103 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002104 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002105 }
2106 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002107 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002108 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002109 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002110
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002111 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002112 goto onError;
2113
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002114 Py_XDECREF(errorHandler);
2115 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002116 return (PyObject *)unicode;
2117
Benjamin Peterson29060642009-01-31 22:14:21 +00002118 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002119 Py_XDECREF(errorHandler);
2120 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002121 Py_DECREF(unicode);
2122 return NULL;
2123}
2124
2125
2126PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002127 Py_ssize_t size,
Antoine Pitrou244651a2009-05-04 18:56:13 +00002128 int base64SetO,
2129 int base64WhiteSpace,
Benjamin Peterson29060642009-01-31 22:14:21 +00002130 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002131{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002132 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002133 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002134 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002135 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002136 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002137 unsigned int base64bits = 0;
2138 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002139 char * out;
2140 char * start;
2141
2142 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002143 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002144
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002145 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002146 return PyErr_NoMemory();
2147
Antoine Pitrou244651a2009-05-04 18:56:13 +00002148 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002149 if (v == NULL)
2150 return NULL;
2151
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002152 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002153 for (;i < size; ++i) {
2154 Py_UNICODE ch = s[i];
2155
Antoine Pitrou244651a2009-05-04 18:56:13 +00002156 if (inShift) {
2157 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2158 /* shifting out */
2159 if (base64bits) { /* output remaining bits */
2160 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2161 base64buffer = 0;
2162 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002163 }
2164 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002165 /* Characters not in the BASE64 set implicitly unshift the sequence
2166 so no '-' is required, except if the character is itself a '-' */
2167 if (IS_BASE64(ch) || ch == '-') {
2168 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002169 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002170 *out++ = (char) ch;
2171 }
2172 else {
2173 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002174 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002175 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002176 else { /* not in a shift sequence */
2177 if (ch == '+') {
2178 *out++ = '+';
2179 *out++ = '-';
2180 }
2181 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2182 *out++ = (char) ch;
2183 }
2184 else {
2185 *out++ = '+';
2186 inShift = 1;
2187 goto encode_char;
2188 }
2189 }
2190 continue;
2191encode_char:
2192#ifdef Py_UNICODE_WIDE
2193 if (ch >= 0x10000) {
2194 /* code first surrogate */
2195 base64bits += 16;
2196 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2197 while (base64bits >= 6) {
2198 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2199 base64bits -= 6;
2200 }
2201 /* prepare second surrogate */
2202 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2203 }
2204#endif
2205 base64bits += 16;
2206 base64buffer = (base64buffer << 16) | ch;
2207 while (base64bits >= 6) {
2208 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2209 base64bits -= 6;
2210 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002211 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002212 if (base64bits)
2213 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2214 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002215 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002216 if (_PyBytes_Resize(&v, out - start) < 0)
2217 return NULL;
2218 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002219}
2220
Antoine Pitrou244651a2009-05-04 18:56:13 +00002221#undef IS_BASE64
2222#undef FROM_BASE64
2223#undef TO_BASE64
2224#undef DECODE_DIRECT
2225#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002226
Guido van Rossumd57fd912000-03-10 22:53:23 +00002227/* --- UTF-8 Codec -------------------------------------------------------- */
2228
/* Map a UTF-8 encoded prefix (lead) byte to the length of the sequence it
   introduces.  Zero means the byte is an illegal prefix.  See RFC 2279
   for details. */
static char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: not valid as a prefix byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xF7: four bytes; 0xF8 - 0xFB: five; 0xFC - 0xFD: six;
       0xFE - 0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
2250
Guido van Rossumd57fd912000-03-10 22:53:23 +00002251PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002252 Py_ssize_t size,
2253 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002254{
Walter Dörwald69652032004-09-07 20:24:22 +00002255 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2256}
2257
Antoine Pitrouab868312009-01-10 15:40:25 +00002258/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2259#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2260
2261/* Mask to quickly check whether a C 'long' contains a
2262 non-ASCII, UTF8-encoded char. */
2263#if (SIZEOF_LONG == 8)
2264# define ASCII_CHAR_MASK 0x8080808080808080L
2265#elif (SIZEOF_LONG == 4)
2266# define ASCII_CHAR_MASK 0x80808080L
2267#else
2268# error C 'long' size should be either 4 or 8!
2269#endif
2270
Walter Dörwald69652032004-09-07 20:24:22 +00002271PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002272 Py_ssize_t size,
2273 const char *errors,
2274 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002275{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002276 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002277 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002278 Py_ssize_t startinpos;
2279 Py_ssize_t endinpos;
2280 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002281 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002282 PyUnicodeObject *unicode;
2283 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002284 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002285 PyObject *errorHandler = NULL;
2286 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002287
2288 /* Note: size will always be longer than the resulting Unicode
2289 character count */
2290 unicode = _PyUnicode_New(size);
2291 if (!unicode)
2292 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002293 if (size == 0) {
2294 if (consumed)
2295 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002296 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002297 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002298
2299 /* Unpack UTF-8 encoded data */
2300 p = unicode->str;
2301 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002302 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002303
2304 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002305 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002306
2307 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002308 /* Fast path for runs of ASCII characters. Given that common UTF-8
2309 input will consist of an overwhelming majority of ASCII
2310 characters, we try to optimize for this case by checking
2311 as many characters as a C 'long' can contain.
2312 First, check if we can do an aligned read, as most CPUs have
2313 a penalty for unaligned reads.
2314 */
2315 if (!((size_t) s & LONG_PTR_MASK)) {
2316 /* Help register allocation */
2317 register const char *_s = s;
2318 register Py_UNICODE *_p = p;
2319 while (_s < aligned_end) {
2320 /* Read a whole long at a time (either 4 or 8 bytes),
2321 and do a fast unrolled copy if it only contains ASCII
2322 characters. */
2323 unsigned long data = *(unsigned long *) _s;
2324 if (data & ASCII_CHAR_MASK)
2325 break;
2326 _p[0] = (unsigned char) _s[0];
2327 _p[1] = (unsigned char) _s[1];
2328 _p[2] = (unsigned char) _s[2];
2329 _p[3] = (unsigned char) _s[3];
2330#if (SIZEOF_LONG == 8)
2331 _p[4] = (unsigned char) _s[4];
2332 _p[5] = (unsigned char) _s[5];
2333 _p[6] = (unsigned char) _s[6];
2334 _p[7] = (unsigned char) _s[7];
2335#endif
2336 _s += SIZEOF_LONG;
2337 _p += SIZEOF_LONG;
2338 }
2339 s = _s;
2340 p = _p;
2341 if (s == e)
2342 break;
2343 ch = (unsigned char)*s;
2344 }
2345 }
2346
2347 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002348 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002349 s++;
2350 continue;
2351 }
2352
2353 n = utf8_code_length[ch];
2354
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002355 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002356 if (consumed)
2357 break;
2358 else {
2359 errmsg = "unexpected end of data";
2360 startinpos = s-starts;
2361 endinpos = size;
2362 goto utf8Error;
2363 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002364 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002365
2366 switch (n) {
2367
2368 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002369 errmsg = "unexpected code byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002370 startinpos = s-starts;
2371 endinpos = startinpos+1;
2372 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002373
2374 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002375 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002376 startinpos = s-starts;
2377 endinpos = startinpos+1;
2378 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002379
2380 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002381 if ((s[1] & 0xc0) != 0x80) {
2382 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002383 startinpos = s-starts;
2384 endinpos = startinpos+2;
2385 goto utf8Error;
2386 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002387 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002388 if (ch < 0x80) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002389 startinpos = s-starts;
2390 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002391 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002392 goto utf8Error;
2393 }
2394 else
2395 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002396 break;
2397
2398 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00002399 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002400 (s[2] & 0xc0) != 0x80) {
2401 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002402 startinpos = s-starts;
2403 endinpos = startinpos+3;
2404 goto utf8Error;
2405 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002406 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002407 if (ch < 0x0800 || (ch >= 0xd800 && ch <= 0xDFFF)) {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002408 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002409 startinpos = s-starts;
2410 endinpos = startinpos+3;
2411 goto utf8Error;
2412 }
2413 else
2414 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002415 break;
2416
2417 case 4:
2418 if ((s[1] & 0xc0) != 0x80 ||
2419 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002420 (s[3] & 0xc0) != 0x80) {
2421 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002422 startinpos = s-starts;
2423 endinpos = startinpos+4;
2424 goto utf8Error;
2425 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002426 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Benjamin Peterson29060642009-01-31 22:14:21 +00002427 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002428 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002429 if ((ch < 0x10000) /* minimum value allowed for 4
Benjamin Peterson29060642009-01-31 22:14:21 +00002430 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002431 || (ch > 0x10ffff)) /* maximum value allowed for
Benjamin Peterson29060642009-01-31 22:14:21 +00002432 UTF-16 */
2433 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002434 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002435 startinpos = s-starts;
2436 endinpos = startinpos+4;
2437 goto utf8Error;
2438 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00002439#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002440 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002441#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002442 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002443
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002444 /* translate from 10000..10FFFF to 0..FFFF */
2445 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002446
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002447 /* high surrogate = top 10 bits added to D800 */
2448 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002449
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002450 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002451 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002452#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002453 break;
2454
2455 default:
2456 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002457 errmsg = "unsupported Unicode code range";
Benjamin Peterson29060642009-01-31 22:14:21 +00002458 startinpos = s-starts;
2459 endinpos = startinpos+n;
2460 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002461 }
2462 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002463 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002464
Benjamin Peterson29060642009-01-31 22:14:21 +00002465 utf8Error:
2466 outpos = p-PyUnicode_AS_UNICODE(unicode);
2467 if (unicode_decode_call_errorhandler(
2468 errors, &errorHandler,
2469 "utf8", errmsg,
2470 &starts, &e, &startinpos, &endinpos, &exc, &s,
2471 &unicode, &outpos, &p))
2472 goto onError;
2473 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002474 }
Walter Dörwald69652032004-09-07 20:24:22 +00002475 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002476 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002477
2478 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002479 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002480 goto onError;
2481
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002482 Py_XDECREF(errorHandler);
2483 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002484 return (PyObject *)unicode;
2485
Benjamin Peterson29060642009-01-31 22:14:21 +00002486 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002487 Py_XDECREF(errorHandler);
2488 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002489 Py_DECREF(unicode);
2490 return NULL;
2491}
2492
Antoine Pitrouab868312009-01-10 15:40:25 +00002493#undef ASCII_CHAR_MASK
2494
2495
Tim Peters602f7402002-04-27 18:03:26 +00002496/* Allocation strategy: if the string is short, convert into a stack buffer
2497 and allocate exactly as much space needed at the end. Else allocate the
2498 maximum possible needed (4 result bytes per Unicode character), and return
2499 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002500*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002501PyObject *
2502PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002503 Py_ssize_t size,
2504 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002505{
Tim Peters602f7402002-04-27 18:03:26 +00002506#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002507
Guido van Rossum98297ee2007-11-06 21:34:58 +00002508 Py_ssize_t i; /* index into s of next input byte */
2509 PyObject *result; /* result string object */
2510 char *p; /* next free byte in output buffer */
2511 Py_ssize_t nallocated; /* number of result bytes allocated */
2512 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002513 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002514 PyObject *errorHandler = NULL;
2515 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002516
Tim Peters602f7402002-04-27 18:03:26 +00002517 assert(s != NULL);
2518 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002519
Tim Peters602f7402002-04-27 18:03:26 +00002520 if (size <= MAX_SHORT_UNICHARS) {
2521 /* Write into the stack buffer; nallocated can't overflow.
2522 * At the end, we'll allocate exactly as much heap space as it
2523 * turns out we need.
2524 */
2525 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002526 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002527 p = stackbuf;
2528 }
2529 else {
2530 /* Overallocate on the heap, and give the excess back at the end. */
2531 nallocated = size * 4;
2532 if (nallocated / 4 != size) /* overflow! */
2533 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002534 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002535 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002536 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002537 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002538 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002539
Tim Peters602f7402002-04-27 18:03:26 +00002540 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002541 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002542
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002543 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002544 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002545 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002546
Guido van Rossumd57fd912000-03-10 22:53:23 +00002547 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002548 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002549 *p++ = (char)(0xc0 | (ch >> 6));
2550 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002551 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002552#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002553 /* Special case: check for high and low surrogate */
2554 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2555 Py_UCS4 ch2 = s[i];
2556 /* Combine the two surrogates to form a UCS4 value */
2557 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2558 i++;
2559
2560 /* Encode UCS4 Unicode ordinals */
2561 *p++ = (char)(0xf0 | (ch >> 18));
2562 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Tim Peters602f7402002-04-27 18:03:26 +00002563 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2564 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002565
Benjamin Petersonadf6a6c2009-11-10 21:23:15 +00002566#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002567 } else {
2568 Py_ssize_t newpos;
2569 PyObject *rep;
2570 Py_ssize_t repsize, k;
2571 rep = unicode_encode_call_errorhandler
2572 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2573 s, size, &exc, i-1, i, &newpos);
2574 if (!rep)
2575 goto error;
2576
2577 if (PyBytes_Check(rep))
2578 repsize = PyBytes_GET_SIZE(rep);
2579 else
2580 repsize = PyUnicode_GET_SIZE(rep);
2581
2582 if (repsize > 4) {
2583 Py_ssize_t offset;
2584
2585 if (result == NULL)
2586 offset = p - stackbuf;
2587 else
2588 offset = p - PyBytes_AS_STRING(result);
2589
2590 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
2591 /* integer overflow */
2592 PyErr_NoMemory();
2593 goto error;
2594 }
2595 nallocated += repsize - 4;
2596 if (result != NULL) {
2597 if (_PyBytes_Resize(&result, nallocated) < 0)
2598 goto error;
2599 } else {
2600 result = PyBytes_FromStringAndSize(NULL, nallocated);
2601 if (result == NULL)
2602 goto error;
2603 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
2604 }
2605 p = PyBytes_AS_STRING(result) + offset;
2606 }
2607
2608 if (PyBytes_Check(rep)) {
2609 char *prep = PyBytes_AS_STRING(rep);
2610 for(k = repsize; k > 0; k--)
2611 *p++ = *prep++;
2612 } else /* rep is unicode */ {
2613 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
2614 Py_UNICODE c;
2615
2616 for(k=0; k<repsize; k++) {
2617 c = prep[k];
2618 if (0x80 <= c) {
2619 raise_encode_exception(&exc, "utf-8", s, size,
2620 i-1, i, "surrogates not allowed");
2621 goto error;
2622 }
2623 *p++ = (char)prep[k];
2624 }
2625 }
2626 Py_DECREF(rep);
2627 }
2628 } else if (ch < 0x10000) {
2629 *p++ = (char)(0xe0 | (ch >> 12));
2630 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2631 *p++ = (char)(0x80 | (ch & 0x3f));
2632 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00002633 /* Encode UCS4 Unicode ordinals */
2634 *p++ = (char)(0xf0 | (ch >> 18));
2635 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2636 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2637 *p++ = (char)(0x80 | (ch & 0x3f));
2638 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002639 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002640
Guido van Rossum98297ee2007-11-06 21:34:58 +00002641 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002642 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002643 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002644 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002645 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002646 }
2647 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002648 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00002649 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002650 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002651 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002652 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002653 Py_XDECREF(errorHandler);
2654 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002655 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002656 error:
2657 Py_XDECREF(errorHandler);
2658 Py_XDECREF(exc);
2659 Py_XDECREF(result);
2660 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002661
Tim Peters602f7402002-04-27 18:03:26 +00002662#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002663}
2664
Guido van Rossumd57fd912000-03-10 22:53:23 +00002665PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2666{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002667 if (!PyUnicode_Check(unicode)) {
2668 PyErr_BadArgument();
2669 return NULL;
2670 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002671 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002672 PyUnicode_GET_SIZE(unicode),
2673 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002674}
2675
Walter Dörwald41980ca2007-08-16 21:55:45 +00002676/* --- UTF-32 Codec ------------------------------------------------------- */
2677
2678PyObject *
2679PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002680 Py_ssize_t size,
2681 const char *errors,
2682 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002683{
2684 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2685}
2686
2687PyObject *
2688PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002689 Py_ssize_t size,
2690 const char *errors,
2691 int *byteorder,
2692 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002693{
2694 const char *starts = s;
2695 Py_ssize_t startinpos;
2696 Py_ssize_t endinpos;
2697 Py_ssize_t outpos;
2698 PyUnicodeObject *unicode;
2699 Py_UNICODE *p;
2700#ifndef Py_UNICODE_WIDE
2701 int i, pairs;
2702#else
2703 const int pairs = 0;
2704#endif
2705 const unsigned char *q, *e;
2706 int bo = 0; /* assume native ordering by default */
2707 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002708 /* Offsets from q for retrieving bytes in the right order. */
2709#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2710 int iorder[] = {0, 1, 2, 3};
2711#else
2712 int iorder[] = {3, 2, 1, 0};
2713#endif
2714 PyObject *errorHandler = NULL;
2715 PyObject *exc = NULL;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002716 /* On narrow builds we split characters outside the BMP into two
2717 codepoints => count how much extra space we need. */
2718#ifndef Py_UNICODE_WIDE
2719 for (i = pairs = 0; i < size/4; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002720 if (((Py_UCS4 *)s)[i] >= 0x10000)
2721 pairs++;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002722#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00002723
2724 /* This might be one to much, because of a BOM */
2725 unicode = _PyUnicode_New((size+3)/4+pairs);
2726 if (!unicode)
2727 return NULL;
2728 if (size == 0)
2729 return (PyObject *)unicode;
2730
2731 /* Unpack UTF-32 encoded data */
2732 p = unicode->str;
2733 q = (unsigned char *)s;
2734 e = q + size;
2735
2736 if (byteorder)
2737 bo = *byteorder;
2738
2739 /* Check for BOM marks (U+FEFF) in the input and adjust current
2740 byte order setting accordingly. In native mode, the leading BOM
2741 mark is skipped, in all other modes, it is copied to the output
2742 stream as-is (giving a ZWNBSP character). */
2743 if (bo == 0) {
2744 if (size >= 4) {
2745 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00002746 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002747#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002748 if (bom == 0x0000FEFF) {
2749 q += 4;
2750 bo = -1;
2751 }
2752 else if (bom == 0xFFFE0000) {
2753 q += 4;
2754 bo = 1;
2755 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002756#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002757 if (bom == 0x0000FEFF) {
2758 q += 4;
2759 bo = 1;
2760 }
2761 else if (bom == 0xFFFE0000) {
2762 q += 4;
2763 bo = -1;
2764 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002765#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002766 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002767 }
2768
2769 if (bo == -1) {
2770 /* force LE */
2771 iorder[0] = 0;
2772 iorder[1] = 1;
2773 iorder[2] = 2;
2774 iorder[3] = 3;
2775 }
2776 else if (bo == 1) {
2777 /* force BE */
2778 iorder[0] = 3;
2779 iorder[1] = 2;
2780 iorder[2] = 1;
2781 iorder[3] = 0;
2782 }
2783
2784 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002785 Py_UCS4 ch;
2786 /* remaining bytes at the end? (size should be divisible by 4) */
2787 if (e-q<4) {
2788 if (consumed)
2789 break;
2790 errmsg = "truncated data";
2791 startinpos = ((const char *)q)-starts;
2792 endinpos = ((const char *)e)-starts;
2793 goto utf32Error;
2794 /* The remaining input chars are ignored if the callback
2795 chooses to skip the input */
2796 }
2797 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2798 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002799
Benjamin Peterson29060642009-01-31 22:14:21 +00002800 if (ch >= 0x110000)
2801 {
2802 errmsg = "codepoint not in range(0x110000)";
2803 startinpos = ((const char *)q)-starts;
2804 endinpos = startinpos+4;
2805 goto utf32Error;
2806 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002807#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002808 if (ch >= 0x10000)
2809 {
2810 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2811 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2812 }
2813 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00002814#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002815 *p++ = ch;
2816 q += 4;
2817 continue;
2818 utf32Error:
2819 outpos = p-PyUnicode_AS_UNICODE(unicode);
2820 if (unicode_decode_call_errorhandler(
2821 errors, &errorHandler,
2822 "utf32", errmsg,
2823 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2824 &unicode, &outpos, &p))
2825 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002826 }
2827
2828 if (byteorder)
2829 *byteorder = bo;
2830
2831 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002832 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002833
2834 /* Adjust length */
2835 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2836 goto onError;
2837
2838 Py_XDECREF(errorHandler);
2839 Py_XDECREF(exc);
2840 return (PyObject *)unicode;
2841
Benjamin Peterson29060642009-01-31 22:14:21 +00002842 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00002843 Py_DECREF(unicode);
2844 Py_XDECREF(errorHandler);
2845 Py_XDECREF(exc);
2846 return NULL;
2847}
2848
2849PyObject *
2850PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002851 Py_ssize_t size,
2852 const char *errors,
2853 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002854{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002855 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002856 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002857 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002858#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002859 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002860#else
2861 const int pairs = 0;
2862#endif
2863 /* Offsets from p for storing byte pairs in the right order. */
2864#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2865 int iorder[] = {0, 1, 2, 3};
2866#else
2867 int iorder[] = {3, 2, 1, 0};
2868#endif
2869
Benjamin Peterson29060642009-01-31 22:14:21 +00002870#define STORECHAR(CH) \
2871 do { \
2872 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2873 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2874 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2875 p[iorder[0]] = (CH) & 0xff; \
2876 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00002877 } while(0)
2878
2879 /* In narrow builds we can output surrogate pairs as one codepoint,
2880 so we need less space. */
2881#ifndef Py_UNICODE_WIDE
2882 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002883 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2884 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2885 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002886#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002887 nsize = (size - pairs + (byteorder == 0));
2888 bytesize = nsize * 4;
2889 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00002890 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002891 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002892 if (v == NULL)
2893 return NULL;
2894
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002895 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002896 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002897 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002898 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002899 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002900
2901 if (byteorder == -1) {
2902 /* force LE */
2903 iorder[0] = 0;
2904 iorder[1] = 1;
2905 iorder[2] = 2;
2906 iorder[3] = 3;
2907 }
2908 else if (byteorder == 1) {
2909 /* force BE */
2910 iorder[0] = 3;
2911 iorder[1] = 2;
2912 iorder[2] = 1;
2913 iorder[3] = 0;
2914 }
2915
2916 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002917 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002918#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002919 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2920 Py_UCS4 ch2 = *s;
2921 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2922 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2923 s++;
2924 size--;
2925 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002926 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002927#endif
2928 STORECHAR(ch);
2929 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002930
2931 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002932 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002933#undef STORECHAR
2934}
2935
2936PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2937{
2938 if (!PyUnicode_Check(unicode)) {
2939 PyErr_BadArgument();
2940 return NULL;
2941 }
2942 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002943 PyUnicode_GET_SIZE(unicode),
2944 NULL,
2945 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002946}
2947
Guido van Rossumd57fd912000-03-10 22:53:23 +00002948/* --- UTF-16 Codec ------------------------------------------------------- */
2949
Tim Peters772747b2001-08-09 22:21:55 +00002950PyObject *
2951PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002952 Py_ssize_t size,
2953 const char *errors,
2954 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002955{
Walter Dörwald69652032004-09-07 20:24:22 +00002956 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2957}
2958
Antoine Pitrouab868312009-01-10 15:40:25 +00002959/* Two masks for fast checking of whether a C 'long' may contain
2960 UTF16-encoded surrogate characters. This is an efficient heuristic,
2961 assuming that non-surrogate characters with a code point >= 0x8000 are
2962 rare in most input.
2963 FAST_CHAR_MASK is used when the input is in native byte ordering,
2964 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00002965*/
Antoine Pitrouab868312009-01-10 15:40:25 +00002966#if (SIZEOF_LONG == 8)
2967# define FAST_CHAR_MASK 0x8000800080008000L
2968# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
2969#elif (SIZEOF_LONG == 4)
2970# define FAST_CHAR_MASK 0x80008000L
2971# define SWAPPED_FAST_CHAR_MASK 0x00800080L
2972#else
2973# error C 'long' size should be either 4 or 8!
2974#endif
2975
Walter Dörwald69652032004-09-07 20:24:22 +00002976PyObject *
2977PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002978 Py_ssize_t size,
2979 const char *errors,
2980 int *byteorder,
2981 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002982{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002983 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002984 Py_ssize_t startinpos;
2985 Py_ssize_t endinpos;
2986 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002987 PyUnicodeObject *unicode;
2988 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00002989 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00002990 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00002991 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002992 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002993 /* Offsets from q for retrieving byte pairs in the right order. */
2994#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2995 int ihi = 1, ilo = 0;
2996#else
2997 int ihi = 0, ilo = 1;
2998#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002999 PyObject *errorHandler = NULL;
3000 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003001
3002 /* Note: size will always be longer than the resulting Unicode
3003 character count */
3004 unicode = _PyUnicode_New(size);
3005 if (!unicode)
3006 return NULL;
3007 if (size == 0)
3008 return (PyObject *)unicode;
3009
3010 /* Unpack UTF-16 encoded data */
3011 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00003012 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00003013 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003014
3015 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00003016 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003017
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003018 /* Check for BOM marks (U+FEFF) in the input and adjust current
3019 byte order setting accordingly. In native mode, the leading BOM
3020 mark is skipped, in all other modes, it is copied to the output
3021 stream as-is (giving a ZWNBSP character). */
3022 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00003023 if (size >= 2) {
3024 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003025#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003026 if (bom == 0xFEFF) {
3027 q += 2;
3028 bo = -1;
3029 }
3030 else if (bom == 0xFFFE) {
3031 q += 2;
3032 bo = 1;
3033 }
Tim Petersced69f82003-09-16 20:30:58 +00003034#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003035 if (bom == 0xFEFF) {
3036 q += 2;
3037 bo = 1;
3038 }
3039 else if (bom == 0xFFFE) {
3040 q += 2;
3041 bo = -1;
3042 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003043#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003044 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003045 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003046
Tim Peters772747b2001-08-09 22:21:55 +00003047 if (bo == -1) {
3048 /* force LE */
3049 ihi = 1;
3050 ilo = 0;
3051 }
3052 else if (bo == 1) {
3053 /* force BE */
3054 ihi = 0;
3055 ilo = 1;
3056 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003057#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3058 native_ordering = ilo < ihi;
3059#else
3060 native_ordering = ilo > ihi;
3061#endif
Tim Peters772747b2001-08-09 22:21:55 +00003062
Antoine Pitrouab868312009-01-10 15:40:25 +00003063 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00003064 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003065 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00003066 /* First check for possible aligned read of a C 'long'. Unaligned
3067 reads are more expensive, better to defer to another iteration. */
3068 if (!((size_t) q & LONG_PTR_MASK)) {
3069 /* Fast path for runs of non-surrogate chars. */
3070 register const unsigned char *_q = q;
3071 Py_UNICODE *_p = p;
3072 if (native_ordering) {
3073 /* Native ordering is simple: as long as the input cannot
3074 possibly contain a surrogate char, do an unrolled copy
3075 of several 16-bit code points to the target object.
3076 The non-surrogate check is done on several input bytes
3077 at a time (as many as a C 'long' can contain). */
3078 while (_q < aligned_end) {
3079 unsigned long data = * (unsigned long *) _q;
3080 if (data & FAST_CHAR_MASK)
3081 break;
3082 _p[0] = ((unsigned short *) _q)[0];
3083 _p[1] = ((unsigned short *) _q)[1];
3084#if (SIZEOF_LONG == 8)
3085 _p[2] = ((unsigned short *) _q)[2];
3086 _p[3] = ((unsigned short *) _q)[3];
3087#endif
3088 _q += SIZEOF_LONG;
3089 _p += SIZEOF_LONG / 2;
3090 }
3091 }
3092 else {
3093 /* Byteswapped ordering is similar, but we must decompose
3094 the copy bytewise, and take care of zero'ing out the
3095 upper bytes if the target object is in 32-bit units
3096 (that is, in UCS-4 builds). */
3097 while (_q < aligned_end) {
3098 unsigned long data = * (unsigned long *) _q;
3099 if (data & SWAPPED_FAST_CHAR_MASK)
3100 break;
3101 /* Zero upper bytes in UCS-4 builds */
3102#if (Py_UNICODE_SIZE > 2)
3103 _p[0] = 0;
3104 _p[1] = 0;
3105#if (SIZEOF_LONG == 8)
3106 _p[2] = 0;
3107 _p[3] = 0;
3108#endif
3109#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003110 /* Issue #4916; UCS-4 builds on big endian machines must
3111 fill the two last bytes of each 4-byte unit. */
3112#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3113# define OFF 2
3114#else
3115# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00003116#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003117 ((unsigned char *) _p)[OFF + 1] = _q[0];
3118 ((unsigned char *) _p)[OFF + 0] = _q[1];
3119 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3120 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3121#if (SIZEOF_LONG == 8)
3122 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3123 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3124 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3125 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3126#endif
3127#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003128 _q += SIZEOF_LONG;
3129 _p += SIZEOF_LONG / 2;
3130 }
3131 }
3132 p = _p;
3133 q = _q;
3134 if (q >= e)
3135 break;
3136 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003137 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003138
Benjamin Peterson14339b62009-01-31 16:36:08 +00003139 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003140
3141 if (ch < 0xD800 || ch > 0xDFFF) {
3142 *p++ = ch;
3143 continue;
3144 }
3145
3146 /* UTF-16 code pair: */
3147 if (q > e) {
3148 errmsg = "unexpected end of data";
3149 startinpos = (((const char *)q) - 2) - starts;
3150 endinpos = ((const char *)e) + 1 - starts;
3151 goto utf16Error;
3152 }
3153 if (0xD800 <= ch && ch <= 0xDBFF) {
3154 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3155 q += 2;
3156 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003157#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003158 *p++ = ch;
3159 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003160#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003161 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003162#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003163 continue;
3164 }
3165 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003166 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003167 startinpos = (((const char *)q)-4)-starts;
3168 endinpos = startinpos+2;
3169 goto utf16Error;
3170 }
3171
Benjamin Peterson14339b62009-01-31 16:36:08 +00003172 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003173 errmsg = "illegal encoding";
3174 startinpos = (((const char *)q)-2)-starts;
3175 endinpos = startinpos+2;
3176 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003177
Benjamin Peterson29060642009-01-31 22:14:21 +00003178 utf16Error:
3179 outpos = p - PyUnicode_AS_UNICODE(unicode);
3180 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003181 errors,
3182 &errorHandler,
3183 "utf16", errmsg,
3184 &starts,
3185 (const char **)&e,
3186 &startinpos,
3187 &endinpos,
3188 &exc,
3189 (const char **)&q,
3190 &unicode,
3191 &outpos,
3192 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003193 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003194 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003195 /* remaining byte at the end? (size should be even) */
3196 if (e == q) {
3197 if (!consumed) {
3198 errmsg = "truncated data";
3199 startinpos = ((const char *)q) - starts;
3200 endinpos = ((const char *)e) + 1 - starts;
3201 outpos = p - PyUnicode_AS_UNICODE(unicode);
3202 if (unicode_decode_call_errorhandler(
3203 errors,
3204 &errorHandler,
3205 "utf16", errmsg,
3206 &starts,
3207 (const char **)&e,
3208 &startinpos,
3209 &endinpos,
3210 &exc,
3211 (const char **)&q,
3212 &unicode,
3213 &outpos,
3214 &p))
3215 goto onError;
3216 /* The remaining input chars are ignored if the callback
3217 chooses to skip the input */
3218 }
3219 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003220
3221 if (byteorder)
3222 *byteorder = bo;
3223
Walter Dörwald69652032004-09-07 20:24:22 +00003224 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003225 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003226
Guido van Rossumd57fd912000-03-10 22:53:23 +00003227 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003228 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003229 goto onError;
3230
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003231 Py_XDECREF(errorHandler);
3232 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003233 return (PyObject *)unicode;
3234
Benjamin Peterson29060642009-01-31 22:14:21 +00003235 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003236 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003237 Py_XDECREF(errorHandler);
3238 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003239 return NULL;
3240}
3241
Antoine Pitrouab868312009-01-10 15:40:25 +00003242#undef FAST_CHAR_MASK
3243#undef SWAPPED_FAST_CHAR_MASK
3244
Tim Peters772747b2001-08-09 22:21:55 +00003245PyObject *
3246PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003247 Py_ssize_t size,
3248 const char *errors,
3249 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003250{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003251 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003252 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003253 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003254#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003255 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003256#else
3257 const int pairs = 0;
3258#endif
Tim Peters772747b2001-08-09 22:21:55 +00003259 /* Offsets from p for storing byte pairs in the right order. */
3260#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3261 int ihi = 1, ilo = 0;
3262#else
3263 int ihi = 0, ilo = 1;
3264#endif
3265
Benjamin Peterson29060642009-01-31 22:14:21 +00003266#define STORECHAR(CH) \
3267 do { \
3268 p[ihi] = ((CH) >> 8) & 0xff; \
3269 p[ilo] = (CH) & 0xff; \
3270 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003271 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003272
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003273#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003274 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003275 if (s[i] >= 0x10000)
3276 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003277#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003278 /* 2 * (size + pairs + (byteorder == 0)) */
3279 if (size > PY_SSIZE_T_MAX ||
3280 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003281 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003282 nsize = size + pairs + (byteorder == 0);
3283 bytesize = nsize * 2;
3284 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003285 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003286 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003287 if (v == NULL)
3288 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003289
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003290 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003291 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003292 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003293 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003294 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003295
3296 if (byteorder == -1) {
3297 /* force LE */
3298 ihi = 1;
3299 ilo = 0;
3300 }
3301 else if (byteorder == 1) {
3302 /* force BE */
3303 ihi = 0;
3304 ilo = 1;
3305 }
3306
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003307 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003308 Py_UNICODE ch = *s++;
3309 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003310#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003311 if (ch >= 0x10000) {
3312 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3313 ch = 0xD800 | ((ch-0x10000) >> 10);
3314 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003315#endif
Tim Peters772747b2001-08-09 22:21:55 +00003316 STORECHAR(ch);
3317 if (ch2)
3318 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003319 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003320
3321 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003322 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003323#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003324}
3325
3326PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3327{
3328 if (!PyUnicode_Check(unicode)) {
3329 PyErr_BadArgument();
3330 return NULL;
3331 }
3332 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003333 PyUnicode_GET_SIZE(unicode),
3334 NULL,
3335 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003336}
3337
3338/* --- Unicode Escape Codec ----------------------------------------------- */
3339
Fredrik Lundh06d12682001-01-24 07:59:11 +00003340static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003341
Guido van Rossumd57fd912000-03-10 22:53:23 +00003342PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003343 Py_ssize_t size,
3344 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003345{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003346 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003347 Py_ssize_t startinpos;
3348 Py_ssize_t endinpos;
3349 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003350 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003351 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003352 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003353 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003354 char* message;
3355 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003356 PyObject *errorHandler = NULL;
3357 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003358
Guido van Rossumd57fd912000-03-10 22:53:23 +00003359 /* Escaped strings will always be longer than the resulting
3360 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003361 length after conversion to the true value.
3362 (but if the error callback returns a long replacement string
3363 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003364 v = _PyUnicode_New(size);
3365 if (v == NULL)
3366 goto onError;
3367 if (size == 0)
3368 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003369
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003370 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003371 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003372
Guido van Rossumd57fd912000-03-10 22:53:23 +00003373 while (s < end) {
3374 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003375 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003376 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003377
3378 /* Non-escape characters are interpreted as Unicode ordinals */
3379 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003380 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003381 continue;
3382 }
3383
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003384 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003385 /* \ - Escapes */
3386 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003387 c = *s++;
3388 if (s > end)
3389 c = '\0'; /* Invalid after \ */
3390 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003391
Benjamin Peterson29060642009-01-31 22:14:21 +00003392 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003393 case '\n': break;
3394 case '\\': *p++ = '\\'; break;
3395 case '\'': *p++ = '\''; break;
3396 case '\"': *p++ = '\"'; break;
3397 case 'b': *p++ = '\b'; break;
3398 case 'f': *p++ = '\014'; break; /* FF */
3399 case 't': *p++ = '\t'; break;
3400 case 'n': *p++ = '\n'; break;
3401 case 'r': *p++ = '\r'; break;
3402 case 'v': *p++ = '\013'; break; /* VT */
3403 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3404
Benjamin Peterson29060642009-01-31 22:14:21 +00003405 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003406 case '0': case '1': case '2': case '3':
3407 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003408 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003409 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003410 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003411 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003412 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003413 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003414 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003415 break;
3416
Benjamin Peterson29060642009-01-31 22:14:21 +00003417 /* hex escapes */
3418 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003419 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003420 digits = 2;
3421 message = "truncated \\xXX escape";
3422 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003423
Benjamin Peterson29060642009-01-31 22:14:21 +00003424 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003425 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003426 digits = 4;
3427 message = "truncated \\uXXXX escape";
3428 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003429
Benjamin Peterson29060642009-01-31 22:14:21 +00003430 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003431 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003432 digits = 8;
3433 message = "truncated \\UXXXXXXXX escape";
3434 hexescape:
3435 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003436 outpos = p-PyUnicode_AS_UNICODE(v);
3437 if (s+digits>end) {
3438 endinpos = size;
3439 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003440 errors, &errorHandler,
3441 "unicodeescape", "end of string in escape sequence",
3442 &starts, &end, &startinpos, &endinpos, &exc, &s,
3443 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003444 goto onError;
3445 goto nextByte;
3446 }
3447 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003448 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003449 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003450 endinpos = (s+i+1)-starts;
3451 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003452 errors, &errorHandler,
3453 "unicodeescape", message,
3454 &starts, &end, &startinpos, &endinpos, &exc, &s,
3455 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003456 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003457 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003458 }
3459 chr = (chr<<4) & ~0xF;
3460 if (c >= '0' && c <= '9')
3461 chr += c - '0';
3462 else if (c >= 'a' && c <= 'f')
3463 chr += 10 + c - 'a';
3464 else
3465 chr += 10 + c - 'A';
3466 }
3467 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003468 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003469 /* _decoding_error will have already written into the
3470 target buffer. */
3471 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003472 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003473 /* when we get here, chr is a 32-bit unicode character */
3474 if (chr <= 0xffff)
3475 /* UCS-2 character */
3476 *p++ = (Py_UNICODE) chr;
3477 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003478 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003479 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003480#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003481 *p++ = chr;
3482#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003483 chr -= 0x10000L;
3484 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003485 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003486#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003487 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003488 endinpos = s-starts;
3489 outpos = p-PyUnicode_AS_UNICODE(v);
3490 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003491 errors, &errorHandler,
3492 "unicodeescape", "illegal Unicode character",
3493 &starts, &end, &startinpos, &endinpos, &exc, &s,
3494 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003495 goto onError;
3496 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003497 break;
3498
Benjamin Peterson29060642009-01-31 22:14:21 +00003499 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003500 case 'N':
3501 message = "malformed \\N character escape";
3502 if (ucnhash_CAPI == NULL) {
3503 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003504 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003505 if (ucnhash_CAPI == NULL)
3506 goto ucnhashError;
3507 }
3508 if (*s == '{') {
3509 const char *start = s+1;
3510 /* look for the closing brace */
3511 while (*s != '}' && s < end)
3512 s++;
3513 if (s > start && s < end && *s == '}') {
3514 /* found a name. look it up in the unicode database */
3515 message = "unknown Unicode character name";
3516 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003517 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003518 goto store;
3519 }
3520 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003521 endinpos = s-starts;
3522 outpos = p-PyUnicode_AS_UNICODE(v);
3523 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003524 errors, &errorHandler,
3525 "unicodeescape", message,
3526 &starts, &end, &startinpos, &endinpos, &exc, &s,
3527 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003528 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003529 break;
3530
3531 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003532 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003533 message = "\\ at end of string";
3534 s--;
3535 endinpos = s-starts;
3536 outpos = p-PyUnicode_AS_UNICODE(v);
3537 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003538 errors, &errorHandler,
3539 "unicodeescape", message,
3540 &starts, &end, &startinpos, &endinpos, &exc, &s,
3541 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003542 goto onError;
3543 }
3544 else {
3545 *p++ = '\\';
3546 *p++ = (unsigned char)s[-1];
3547 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003548 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003549 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003550 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003551 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003552 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003553 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003554 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003555 Py_XDECREF(errorHandler);
3556 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003557 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003558
Benjamin Peterson29060642009-01-31 22:14:21 +00003559 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003560 PyErr_SetString(
3561 PyExc_UnicodeError,
3562 "\\N escapes not supported (can't load unicodedata module)"
3563 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003564 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003565 Py_XDECREF(errorHandler);
3566 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003567 return NULL;
3568
Benjamin Peterson29060642009-01-31 22:14:21 +00003569 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003570 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003571 Py_XDECREF(errorHandler);
3572 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003573 return NULL;
3574}
3575
/* Return a Unicode-Escape string version of the Unicode object.

   The result contains only the escaped characters; it is not enclosed
   in quotes.

*/
3582
Thomas Wouters477c8d52006-05-27 19:21:47 +00003583Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003584 Py_ssize_t size,
3585 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00003586{
3587 /* like wcschr, but doesn't stop at NULL characters */
3588
3589 while (size-- > 0) {
3590 if (*s == ch)
3591 return s;
3592 s++;
3593 }
3594
3595 return NULL;
3596}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003597
Walter Dörwald79e913e2007-05-12 11:08:06 +00003598static const char *hexdigits = "0123456789abcdef";
3599
3600PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003601 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003602{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003603 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003604 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003605
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003606#ifdef Py_UNICODE_WIDE
3607 const Py_ssize_t expandsize = 10;
3608#else
3609 const Py_ssize_t expandsize = 6;
3610#endif
3611
Thomas Wouters89f507f2006-12-13 04:49:30 +00003612 /* XXX(nnorwitz): rather than over-allocating, it would be
3613 better to choose a different scheme. Perhaps scan the
3614 first N-chars of the string and allocate based on that size.
3615 */
3616 /* Initial allocation is based on the longest-possible unichr
3617 escape.
3618
3619 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3620 unichr, so in this case it's the longest unichr escape. In
3621 narrow (UTF-16) builds this is five chars per source unichr
3622 since there are two unichrs in the surrogate pair, so in narrow
3623 (UTF-16) builds it's not the longest unichr escape.
3624
3625 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3626 so in the narrow (UTF-16) build case it's the longest unichr
3627 escape.
3628 */
3629
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003630 if (size == 0)
3631 return PyBytes_FromStringAndSize(NULL, 0);
3632
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003633 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003634 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003635
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003636 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00003637 2
3638 + expandsize*size
3639 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003640 if (repr == NULL)
3641 return NULL;
3642
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003643 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003644
Guido van Rossumd57fd912000-03-10 22:53:23 +00003645 while (size-- > 0) {
3646 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003647
Walter Dörwald79e913e2007-05-12 11:08:06 +00003648 /* Escape backslashes */
3649 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003650 *p++ = '\\';
3651 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003652 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003653 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003654
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003655#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003656 /* Map 21-bit characters to '\U00xxxxxx' */
3657 else if (ch >= 0x10000) {
3658 *p++ = '\\';
3659 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003660 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3661 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3662 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3663 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3664 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3665 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3666 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3667 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00003668 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003669 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003670#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003671 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3672 else if (ch >= 0xD800 && ch < 0xDC00) {
3673 Py_UNICODE ch2;
3674 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003675
Benjamin Peterson29060642009-01-31 22:14:21 +00003676 ch2 = *s++;
3677 size--;
3678 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3679 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3680 *p++ = '\\';
3681 *p++ = 'U';
3682 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3683 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3684 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3685 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3686 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3687 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3688 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3689 *p++ = hexdigits[ucs & 0x0000000F];
3690 continue;
3691 }
3692 /* Fall through: isolated surrogates are copied as-is */
3693 s--;
3694 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003695 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003696#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003697
Guido van Rossumd57fd912000-03-10 22:53:23 +00003698 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003699 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003700 *p++ = '\\';
3701 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003702 *p++ = hexdigits[(ch >> 12) & 0x000F];
3703 *p++ = hexdigits[(ch >> 8) & 0x000F];
3704 *p++ = hexdigits[(ch >> 4) & 0x000F];
3705 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003706 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003707
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003708 /* Map special whitespace to '\t', \n', '\r' */
3709 else if (ch == '\t') {
3710 *p++ = '\\';
3711 *p++ = 't';
3712 }
3713 else if (ch == '\n') {
3714 *p++ = '\\';
3715 *p++ = 'n';
3716 }
3717 else if (ch == '\r') {
3718 *p++ = '\\';
3719 *p++ = 'r';
3720 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003721
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003722 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003723 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003724 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003725 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003726 *p++ = hexdigits[(ch >> 4) & 0x000F];
3727 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003728 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003729
Guido van Rossumd57fd912000-03-10 22:53:23 +00003730 /* Copy everything else as-is */
3731 else
3732 *p++ = (char) ch;
3733 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003734
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003735 assert(p - PyBytes_AS_STRING(repr) > 0);
3736 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
3737 return NULL;
3738 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003739}
3740
Alexandre Vassalotti2056bed2008-12-27 19:46:35 +00003741PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003742{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003743 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003744 if (!PyUnicode_Check(unicode)) {
3745 PyErr_BadArgument();
3746 return NULL;
3747 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003748 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3749 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003750 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003751}
3752
3753/* --- Raw Unicode Escape Codec ------------------------------------------- */
3754
3755PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003756 Py_ssize_t size,
3757 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003758{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003759 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003760 Py_ssize_t startinpos;
3761 Py_ssize_t endinpos;
3762 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003763 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003764 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003765 const char *end;
3766 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003767 PyObject *errorHandler = NULL;
3768 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003769
Guido van Rossumd57fd912000-03-10 22:53:23 +00003770 /* Escaped strings will always be longer than the resulting
3771 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003772 length after conversion to the true value. (But decoding error
3773 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003774 v = _PyUnicode_New(size);
3775 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003776 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003777 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003778 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003779 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003780 end = s + size;
3781 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003782 unsigned char c;
3783 Py_UCS4 x;
3784 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003785 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003786
Benjamin Peterson29060642009-01-31 22:14:21 +00003787 /* Non-escape characters are interpreted as Unicode ordinals */
3788 if (*s != '\\') {
3789 *p++ = (unsigned char)*s++;
3790 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003791 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003792 startinpos = s-starts;
3793
3794 /* \u-escapes are only interpreted iff the number of leading
3795 backslashes if odd */
3796 bs = s;
3797 for (;s < end;) {
3798 if (*s != '\\')
3799 break;
3800 *p++ = (unsigned char)*s++;
3801 }
3802 if (((s - bs) & 1) == 0 ||
3803 s >= end ||
3804 (*s != 'u' && *s != 'U')) {
3805 continue;
3806 }
3807 p--;
3808 count = *s=='u' ? 4 : 8;
3809 s++;
3810
3811 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3812 outpos = p-PyUnicode_AS_UNICODE(v);
3813 for (x = 0, i = 0; i < count; ++i, ++s) {
3814 c = (unsigned char)*s;
3815 if (!ISXDIGIT(c)) {
3816 endinpos = s-starts;
3817 if (unicode_decode_call_errorhandler(
3818 errors, &errorHandler,
3819 "rawunicodeescape", "truncated \\uXXXX",
3820 &starts, &end, &startinpos, &endinpos, &exc, &s,
3821 &v, &outpos, &p))
3822 goto onError;
3823 goto nextByte;
3824 }
3825 x = (x<<4) & ~0xF;
3826 if (c >= '0' && c <= '9')
3827 x += c - '0';
3828 else if (c >= 'a' && c <= 'f')
3829 x += 10 + c - 'a';
3830 else
3831 x += 10 + c - 'A';
3832 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00003833 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00003834 /* UCS-2 character */
3835 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003836 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003837 /* UCS-4 character. Either store directly, or as
3838 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00003839#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003840 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003841#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003842 x -= 0x10000L;
3843 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3844 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00003845#endif
3846 } else {
3847 endinpos = s-starts;
3848 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003849 if (unicode_decode_call_errorhandler(
3850 errors, &errorHandler,
3851 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00003852 &starts, &end, &startinpos, &endinpos, &exc, &s,
3853 &v, &outpos, &p))
3854 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003855 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003856 nextByte:
3857 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003858 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003859 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003860 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003861 Py_XDECREF(errorHandler);
3862 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003863 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003864
Benjamin Peterson29060642009-01-31 22:14:21 +00003865 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003866 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003867 Py_XDECREF(errorHandler);
3868 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003869 return NULL;
3870}
3871
3872PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003873 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003874{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003875 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003876 char *p;
3877 char *q;
3878
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003879#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003880 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003881#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003882 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003883#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003884
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003885 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003886 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00003887
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003888 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003889 if (repr == NULL)
3890 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003891 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003892 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003893
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003894 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003895 while (size-- > 0) {
3896 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003897#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003898 /* Map 32-bit characters to '\Uxxxxxxxx' */
3899 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003900 *p++ = '\\';
3901 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003902 *p++ = hexdigits[(ch >> 28) & 0xf];
3903 *p++ = hexdigits[(ch >> 24) & 0xf];
3904 *p++ = hexdigits[(ch >> 20) & 0xf];
3905 *p++ = hexdigits[(ch >> 16) & 0xf];
3906 *p++ = hexdigits[(ch >> 12) & 0xf];
3907 *p++ = hexdigits[(ch >> 8) & 0xf];
3908 *p++ = hexdigits[(ch >> 4) & 0xf];
3909 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003910 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003911 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00003912#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003913 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3914 if (ch >= 0xD800 && ch < 0xDC00) {
3915 Py_UNICODE ch2;
3916 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003917
Benjamin Peterson29060642009-01-31 22:14:21 +00003918 ch2 = *s++;
3919 size--;
3920 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3921 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3922 *p++ = '\\';
3923 *p++ = 'U';
3924 *p++ = hexdigits[(ucs >> 28) & 0xf];
3925 *p++ = hexdigits[(ucs >> 24) & 0xf];
3926 *p++ = hexdigits[(ucs >> 20) & 0xf];
3927 *p++ = hexdigits[(ucs >> 16) & 0xf];
3928 *p++ = hexdigits[(ucs >> 12) & 0xf];
3929 *p++ = hexdigits[(ucs >> 8) & 0xf];
3930 *p++ = hexdigits[(ucs >> 4) & 0xf];
3931 *p++ = hexdigits[ucs & 0xf];
3932 continue;
3933 }
3934 /* Fall through: isolated surrogates are copied as-is */
3935 s--;
3936 size++;
3937 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003938#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003939 /* Map 16-bit characters to '\uxxxx' */
3940 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003941 *p++ = '\\';
3942 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003943 *p++ = hexdigits[(ch >> 12) & 0xf];
3944 *p++ = hexdigits[(ch >> 8) & 0xf];
3945 *p++ = hexdigits[(ch >> 4) & 0xf];
3946 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003947 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003948 /* Copy everything else as-is */
3949 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003950 *p++ = (char) ch;
3951 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003952 size = p - q;
3953
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003954 assert(size > 0);
3955 if (_PyBytes_Resize(&repr, size) < 0)
3956 return NULL;
3957 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003958}
3959
3960PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3961{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003962 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003963 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003964 PyErr_BadArgument();
3965 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003966 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003967 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3968 PyUnicode_GET_SIZE(unicode));
3969
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003970 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003971}
3972
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003973/* --- Unicode Internal Codec ------------------------------------------- */
3974
3975PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003976 Py_ssize_t size,
3977 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003978{
3979 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003980 Py_ssize_t startinpos;
3981 Py_ssize_t endinpos;
3982 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003983 PyUnicodeObject *v;
3984 Py_UNICODE *p;
3985 const char *end;
3986 const char *reason;
3987 PyObject *errorHandler = NULL;
3988 PyObject *exc = NULL;
3989
Neal Norwitzd43069c2006-01-08 01:12:10 +00003990#ifdef Py_UNICODE_WIDE
3991 Py_UNICODE unimax = PyUnicode_GetMax();
3992#endif
3993
Thomas Wouters89f507f2006-12-13 04:49:30 +00003994 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003995 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3996 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003997 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003998 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003999 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004000 p = PyUnicode_AS_UNICODE(v);
4001 end = s + size;
4002
4003 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004004 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004005 /* We have to sanity check the raw data, otherwise doom looms for
4006 some malformed UCS-4 data. */
4007 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00004008#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004009 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00004010#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004011 end-s < Py_UNICODE_SIZE
4012 )
Benjamin Peterson29060642009-01-31 22:14:21 +00004013 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004014 startinpos = s - starts;
4015 if (end-s < Py_UNICODE_SIZE) {
4016 endinpos = end-starts;
4017 reason = "truncated input";
4018 }
4019 else {
4020 endinpos = s - starts + Py_UNICODE_SIZE;
4021 reason = "illegal code point (> 0x10FFFF)";
4022 }
4023 outpos = p - PyUnicode_AS_UNICODE(v);
4024 if (unicode_decode_call_errorhandler(
4025 errors, &errorHandler,
4026 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00004027 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00004028 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004029 goto onError;
4030 }
4031 }
4032 else {
4033 p++;
4034 s += Py_UNICODE_SIZE;
4035 }
4036 }
4037
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004038 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004039 goto onError;
4040 Py_XDECREF(errorHandler);
4041 Py_XDECREF(exc);
4042 return (PyObject *)v;
4043
Benjamin Peterson29060642009-01-31 22:14:21 +00004044 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004045 Py_XDECREF(v);
4046 Py_XDECREF(errorHandler);
4047 Py_XDECREF(exc);
4048 return NULL;
4049}
4050
Guido van Rossumd57fd912000-03-10 22:53:23 +00004051/* --- Latin-1 Codec ------------------------------------------------------ */
4052
4053PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004054 Py_ssize_t size,
4055 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004056{
4057 PyUnicodeObject *v;
4058 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004059 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00004060
Guido van Rossumd57fd912000-03-10 22:53:23 +00004061 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004062 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004063 Py_UNICODE r = *(unsigned char*)s;
4064 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004065 }
4066
Guido van Rossumd57fd912000-03-10 22:53:23 +00004067 v = _PyUnicode_New(size);
4068 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004069 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004070 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004071 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004072 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00004073 e = s + size;
4074 /* Unrolling the copy makes it much faster by reducing the looping
4075 overhead. This is similar to what many memcpy() implementations do. */
4076 unrolled_end = e - 4;
4077 while (s < unrolled_end) {
4078 p[0] = (unsigned char) s[0];
4079 p[1] = (unsigned char) s[1];
4080 p[2] = (unsigned char) s[2];
4081 p[3] = (unsigned char) s[3];
4082 s += 4;
4083 p += 4;
4084 }
4085 while (s < e)
4086 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004087 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004088
Benjamin Peterson29060642009-01-31 22:14:21 +00004089 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004090 Py_XDECREF(v);
4091 return NULL;
4092}
4093
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004094/* create or adjust a UnicodeEncodeError */
4095static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004096 const char *encoding,
4097 const Py_UNICODE *unicode, Py_ssize_t size,
4098 Py_ssize_t startpos, Py_ssize_t endpos,
4099 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004100{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004101 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004102 *exceptionObject = PyUnicodeEncodeError_Create(
4103 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004104 }
4105 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004106 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4107 goto onError;
4108 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4109 goto onError;
4110 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4111 goto onError;
4112 return;
4113 onError:
4114 Py_DECREF(*exceptionObject);
4115 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004116 }
4117}
4118
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004119/* raises a UnicodeEncodeError */
4120static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004121 const char *encoding,
4122 const Py_UNICODE *unicode, Py_ssize_t size,
4123 Py_ssize_t startpos, Py_ssize_t endpos,
4124 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004125{
4126 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004127 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004128 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004129 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004130}
4131
4132/* error handling callback helper:
4133 build arguments, call the callback and check the arguments,
4134 put the result into newpos and return the replacement string, which
4135 has to be freed by the caller */
4136static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00004137 PyObject **errorHandler,
4138 const char *encoding, const char *reason,
4139 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4140 Py_ssize_t startpos, Py_ssize_t endpos,
4141 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004142{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004143 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004144
4145 PyObject *restuple;
4146 PyObject *resunicode;
4147
4148 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004149 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004150 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004151 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004152 }
4153
4154 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004155 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004156 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004157 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004158
4159 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004160 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004161 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004162 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004163 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004164 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004165 Py_DECREF(restuple);
4166 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004167 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004168 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004169 &resunicode, newpos)) {
4170 Py_DECREF(restuple);
4171 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004172 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004173 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4174 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4175 Py_DECREF(restuple);
4176 return NULL;
4177 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004178 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004179 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004180 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004181 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4182 Py_DECREF(restuple);
4183 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004184 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004185 Py_INCREF(resunicode);
4186 Py_DECREF(restuple);
4187 return resunicode;
4188}
4189
4190static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004191 Py_ssize_t size,
4192 const char *errors,
4193 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004194{
4195 /* output object */
4196 PyObject *res;
4197 /* pointers to the beginning and end+1 of input */
4198 const Py_UNICODE *startp = p;
4199 const Py_UNICODE *endp = p + size;
4200 /* pointer to the beginning of the unencodable characters */
4201 /* const Py_UNICODE *badp = NULL; */
4202 /* pointer into the output */
4203 char *str;
4204 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004205 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004206 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4207 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004208 PyObject *errorHandler = NULL;
4209 PyObject *exc = NULL;
4210 /* the following variable is used for caching string comparisons
4211 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4212 int known_errorHandler = -1;
4213
4214 /* allocate enough for a simple encoding without
4215 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004216 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004217 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004218 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004219 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004220 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004221 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004222 ressize = size;
4223
4224 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004225 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004226
Benjamin Peterson29060642009-01-31 22:14:21 +00004227 /* can we encode this? */
4228 if (c<limit) {
4229 /* no overflow check, because we know that the space is enough */
4230 *str++ = (char)c;
4231 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004232 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004233 else {
4234 Py_ssize_t unicodepos = p-startp;
4235 Py_ssize_t requiredsize;
4236 PyObject *repunicode;
4237 Py_ssize_t repsize;
4238 Py_ssize_t newpos;
4239 Py_ssize_t respos;
4240 Py_UNICODE *uni2;
4241 /* startpos for collecting unencodable chars */
4242 const Py_UNICODE *collstart = p;
4243 const Py_UNICODE *collend = p;
4244 /* find all unecodable characters */
4245 while ((collend < endp) && ((*collend)>=limit))
4246 ++collend;
4247 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4248 if (known_errorHandler==-1) {
4249 if ((errors==NULL) || (!strcmp(errors, "strict")))
4250 known_errorHandler = 1;
4251 else if (!strcmp(errors, "replace"))
4252 known_errorHandler = 2;
4253 else if (!strcmp(errors, "ignore"))
4254 known_errorHandler = 3;
4255 else if (!strcmp(errors, "xmlcharrefreplace"))
4256 known_errorHandler = 4;
4257 else
4258 known_errorHandler = 0;
4259 }
4260 switch (known_errorHandler) {
4261 case 1: /* strict */
4262 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4263 goto onError;
4264 case 2: /* replace */
4265 while (collstart++<collend)
4266 *str++ = '?'; /* fall through */
4267 case 3: /* ignore */
4268 p = collend;
4269 break;
4270 case 4: /* xmlcharrefreplace */
4271 respos = str - PyBytes_AS_STRING(res);
4272 /* determine replacement size (temporarily (mis)uses p) */
4273 for (p = collstart, repsize = 0; p < collend; ++p) {
4274 if (*p<10)
4275 repsize += 2+1+1;
4276 else if (*p<100)
4277 repsize += 2+2+1;
4278 else if (*p<1000)
4279 repsize += 2+3+1;
4280 else if (*p<10000)
4281 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004282#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004283 else
4284 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004285#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004286 else if (*p<100000)
4287 repsize += 2+5+1;
4288 else if (*p<1000000)
4289 repsize += 2+6+1;
4290 else
4291 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004292#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004293 }
4294 requiredsize = respos+repsize+(endp-collend);
4295 if (requiredsize > ressize) {
4296 if (requiredsize<2*ressize)
4297 requiredsize = 2*ressize;
4298 if (_PyBytes_Resize(&res, requiredsize))
4299 goto onError;
4300 str = PyBytes_AS_STRING(res) + respos;
4301 ressize = requiredsize;
4302 }
4303 /* generate replacement (temporarily (mis)uses p) */
4304 for (p = collstart; p < collend; ++p) {
4305 str += sprintf(str, "&#%d;", (int)*p);
4306 }
4307 p = collend;
4308 break;
4309 default:
4310 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4311 encoding, reason, startp, size, &exc,
4312 collstart-startp, collend-startp, &newpos);
4313 if (repunicode == NULL)
4314 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004315 if (PyBytes_Check(repunicode)) {
4316 /* Directly copy bytes result to output. */
4317 repsize = PyBytes_Size(repunicode);
4318 if (repsize > 1) {
4319 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004320 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004321 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4322 Py_DECREF(repunicode);
4323 goto onError;
4324 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004325 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004326 ressize += repsize-1;
4327 }
4328 memcpy(str, PyBytes_AsString(repunicode), repsize);
4329 str += repsize;
4330 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004331 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004332 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004333 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004334 /* need more space? (at least enough for what we
4335 have+the replacement+the rest of the string, so
4336 we won't have to check space for encodable characters) */
4337 respos = str - PyBytes_AS_STRING(res);
4338 repsize = PyUnicode_GET_SIZE(repunicode);
4339 requiredsize = respos+repsize+(endp-collend);
4340 if (requiredsize > ressize) {
4341 if (requiredsize<2*ressize)
4342 requiredsize = 2*ressize;
4343 if (_PyBytes_Resize(&res, requiredsize)) {
4344 Py_DECREF(repunicode);
4345 goto onError;
4346 }
4347 str = PyBytes_AS_STRING(res) + respos;
4348 ressize = requiredsize;
4349 }
4350 /* check if there is anything unencodable in the replacement
4351 and copy it to the output */
4352 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4353 c = *uni2;
4354 if (c >= limit) {
4355 raise_encode_exception(&exc, encoding, startp, size,
4356 unicodepos, unicodepos+1, reason);
4357 Py_DECREF(repunicode);
4358 goto onError;
4359 }
4360 *str = (char)c;
4361 }
4362 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004363 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004364 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004365 }
4366 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004367 /* Resize if we allocated to much */
4368 size = str - PyBytes_AS_STRING(res);
4369 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004370 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004371 if (_PyBytes_Resize(&res, size) < 0)
4372 goto onError;
4373 }
4374
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004375 Py_XDECREF(errorHandler);
4376 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004377 return res;
4378
4379 onError:
4380 Py_XDECREF(res);
4381 Py_XDECREF(errorHandler);
4382 Py_XDECREF(exc);
4383 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004384}
4385
Guido van Rossumd57fd912000-03-10 22:53:23 +00004386PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004387 Py_ssize_t size,
4388 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004389{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004390 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004391}
4392
4393PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4394{
4395 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004396 PyErr_BadArgument();
4397 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004398 }
4399 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004400 PyUnicode_GET_SIZE(unicode),
4401 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004402}
4403
4404/* --- 7-bit ASCII Codec -------------------------------------------------- */
4405
Guido van Rossumd57fd912000-03-10 22:53:23 +00004406PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004407 Py_ssize_t size,
4408 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004409{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004410 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004411 PyUnicodeObject *v;
4412 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004413 Py_ssize_t startinpos;
4414 Py_ssize_t endinpos;
4415 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004416 const char *e;
4417 PyObject *errorHandler = NULL;
4418 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004419
Guido van Rossumd57fd912000-03-10 22:53:23 +00004420 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004421 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004422 Py_UNICODE r = *(unsigned char*)s;
4423 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004424 }
Tim Petersced69f82003-09-16 20:30:58 +00004425
Guido van Rossumd57fd912000-03-10 22:53:23 +00004426 v = _PyUnicode_New(size);
4427 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004428 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004429 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004430 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004431 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004432 e = s + size;
4433 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004434 register unsigned char c = (unsigned char)*s;
4435 if (c < 128) {
4436 *p++ = c;
4437 ++s;
4438 }
4439 else {
4440 startinpos = s-starts;
4441 endinpos = startinpos + 1;
4442 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4443 if (unicode_decode_call_errorhandler(
4444 errors, &errorHandler,
4445 "ascii", "ordinal not in range(128)",
4446 &starts, &e, &startinpos, &endinpos, &exc, &s,
4447 &v, &outpos, &p))
4448 goto onError;
4449 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004450 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004451 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004452 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4453 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004454 Py_XDECREF(errorHandler);
4455 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004456 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004457
Benjamin Peterson29060642009-01-31 22:14:21 +00004458 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004459 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004460 Py_XDECREF(errorHandler);
4461 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004462 return NULL;
4463}
4464
Guido van Rossumd57fd912000-03-10 22:53:23 +00004465PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004466 Py_ssize_t size,
4467 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004468{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004469 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004470}
4471
4472PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4473{
4474 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004475 PyErr_BadArgument();
4476 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004477 }
4478 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004479 PyUnicode_GET_SIZE(unicode),
4480 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004481}
4482
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004483#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004484
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004485/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004486
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004487#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004488#define NEED_RETRY
4489#endif
4490
4491/* XXX This code is limited to "true" double-byte encodings, as
4492 a) it assumes an incomplete character consists of a single byte, and
4493 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004494 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004495
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead
   byte, 0 otherwise.  A byte that IsDBCSLeadByte() accepts counts only
   when one of the following holds for the character CharPrev() finds
   before it: there is none (CharPrev() returned the same position), it
   does not itself start with a lead byte, or it is exactly two bytes
   long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
4506
4507/*
4508 * Decode MBCS string into unicode object. If 'final' is set, converts
4509 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4510 */
4511static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson29060642009-01-31 22:14:21 +00004512 const char *s, /* MBCS string */
4513 int size, /* sizeof MBCS string */
4514 int final)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004515{
4516 Py_UNICODE *p;
4517 Py_ssize_t n = 0;
4518 int usize = 0;
4519
4520 assert(size >= 0);
4521
4522 /* Skip trailing lead-byte unless 'final' is set */
4523 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004524 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004525
4526 /* First get the size of the result */
4527 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004528 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
4529 if (usize == 0) {
4530 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4531 return -1;
4532 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004533 }
4534
4535 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004536 /* Create unicode object */
4537 *v = _PyUnicode_New(usize);
4538 if (*v == NULL)
4539 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004540 }
4541 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004542 /* Extend unicode object */
4543 n = PyUnicode_GET_SIZE(*v);
4544 if (_PyUnicode_Resize(v, n + usize) < 0)
4545 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004546 }
4547
4548 /* Do the conversion */
4549 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004550 p = PyUnicode_AS_UNICODE(*v) + n;
4551 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
4552 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4553 return -1;
4554 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004555 }
4556
4557 return size;
4558}
4559
4560PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004561 Py_ssize_t size,
4562 const char *errors,
4563 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004564{
4565 PyUnicodeObject *v = NULL;
4566 int done;
4567
4568 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004569 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004570
4571#ifdef NEED_RETRY
4572 retry:
4573 if (size > INT_MAX)
Benjamin Peterson29060642009-01-31 22:14:21 +00004574 done = decode_mbcs(&v, s, INT_MAX, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004575 else
4576#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004577 done = decode_mbcs(&v, s, (int)size, !consumed);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004578
4579 if (done < 0) {
4580 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00004581 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004582 }
4583
4584 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004585 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004586
4587#ifdef NEED_RETRY
4588 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004589 s += done;
4590 size -= done;
4591 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004592 }
4593#endif
4594
4595 return (PyObject *)v;
4596}
4597
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004598PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004599 Py_ssize_t size,
4600 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004601{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004602 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4603}
4604
4605/*
4606 * Convert unicode into string object (MBCS).
4607 * Returns 0 if succeed, -1 otherwise.
4608 */
4609static int encode_mbcs(PyObject **repr,
Benjamin Peterson29060642009-01-31 22:14:21 +00004610 const Py_UNICODE *p, /* unicode */
4611 int size) /* size of unicode */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004612{
4613 int mbcssize = 0;
4614 Py_ssize_t n = 0;
4615
4616 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004617
4618 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004619 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004620 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4621 if (mbcssize == 0) {
4622 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4623 return -1;
4624 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004625 }
4626
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004627 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004628 /* Create string object */
4629 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
4630 if (*repr == NULL)
4631 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004632 }
4633 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004634 /* Extend string object */
4635 n = PyBytes_Size(*repr);
4636 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
4637 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004638 }
4639
4640 /* Do the conversion */
4641 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004642 char *s = PyBytes_AS_STRING(*repr) + n;
4643 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4644 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4645 return -1;
4646 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004647 }
4648
4649 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004650}
4651
4652PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004653 Py_ssize_t size,
4654 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004655{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004656 PyObject *repr = NULL;
4657 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004658
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004659#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00004660 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004661 if (size > INT_MAX)
Benjamin Peterson29060642009-01-31 22:14:21 +00004662 ret = encode_mbcs(&repr, p, INT_MAX);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004663 else
4664#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004665 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004666
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004667 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004668 Py_XDECREF(repr);
4669 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004670 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004671
4672#ifdef NEED_RETRY
4673 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004674 p += INT_MAX;
4675 size -= INT_MAX;
4676 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004677 }
4678#endif
4679
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004680 return repr;
4681}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004682
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004683PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4684{
4685 if (!PyUnicode_Check(unicode)) {
4686 PyErr_BadArgument();
4687 return NULL;
4688 }
4689 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004690 PyUnicode_GET_SIZE(unicode),
4691 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004692}
4693
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004694#undef NEED_RETRY
4695
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004696#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004697
Guido van Rossumd57fd912000-03-10 22:53:23 +00004698/* --- Character Mapping Codec -------------------------------------------- */
4699
Guido van Rossumd57fd912000-03-10 22:53:23 +00004700PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004701 Py_ssize_t size,
4702 PyObject *mapping,
4703 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004704{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004705 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004706 Py_ssize_t startinpos;
4707 Py_ssize_t endinpos;
4708 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004709 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004710 PyUnicodeObject *v;
4711 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004712 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004713 PyObject *errorHandler = NULL;
4714 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004715 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004716 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004717
Guido van Rossumd57fd912000-03-10 22:53:23 +00004718 /* Default to Latin-1 */
4719 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004720 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004721
4722 v = _PyUnicode_New(size);
4723 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004724 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004725 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004726 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004727 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004728 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004729 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004730 mapstring = PyUnicode_AS_UNICODE(mapping);
4731 maplen = PyUnicode_GET_SIZE(mapping);
4732 while (s < e) {
4733 unsigned char ch = *s;
4734 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004735
Benjamin Peterson29060642009-01-31 22:14:21 +00004736 if (ch < maplen)
4737 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004738
Benjamin Peterson29060642009-01-31 22:14:21 +00004739 if (x == 0xfffe) {
4740 /* undefined mapping */
4741 outpos = p-PyUnicode_AS_UNICODE(v);
4742 startinpos = s-starts;
4743 endinpos = startinpos+1;
4744 if (unicode_decode_call_errorhandler(
4745 errors, &errorHandler,
4746 "charmap", "character maps to <undefined>",
4747 &starts, &e, &startinpos, &endinpos, &exc, &s,
4748 &v, &outpos, &p)) {
4749 goto onError;
4750 }
4751 continue;
4752 }
4753 *p++ = x;
4754 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004755 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004756 }
4757 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004758 while (s < e) {
4759 unsigned char ch = *s;
4760 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004761
Benjamin Peterson29060642009-01-31 22:14:21 +00004762 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4763 w = PyLong_FromLong((long)ch);
4764 if (w == NULL)
4765 goto onError;
4766 x = PyObject_GetItem(mapping, w);
4767 Py_DECREF(w);
4768 if (x == NULL) {
4769 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4770 /* No mapping found means: mapping is undefined. */
4771 PyErr_Clear();
4772 x = Py_None;
4773 Py_INCREF(x);
4774 } else
4775 goto onError;
4776 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004777
Benjamin Peterson29060642009-01-31 22:14:21 +00004778 /* Apply mapping */
4779 if (PyLong_Check(x)) {
4780 long value = PyLong_AS_LONG(x);
4781 if (value < 0 || value > 65535) {
4782 PyErr_SetString(PyExc_TypeError,
4783 "character mapping must be in range(65536)");
4784 Py_DECREF(x);
4785 goto onError;
4786 }
4787 *p++ = (Py_UNICODE)value;
4788 }
4789 else if (x == Py_None) {
4790 /* undefined mapping */
4791 outpos = p-PyUnicode_AS_UNICODE(v);
4792 startinpos = s-starts;
4793 endinpos = startinpos+1;
4794 if (unicode_decode_call_errorhandler(
4795 errors, &errorHandler,
4796 "charmap", "character maps to <undefined>",
4797 &starts, &e, &startinpos, &endinpos, &exc, &s,
4798 &v, &outpos, &p)) {
4799 Py_DECREF(x);
4800 goto onError;
4801 }
4802 Py_DECREF(x);
4803 continue;
4804 }
4805 else if (PyUnicode_Check(x)) {
4806 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004807
Benjamin Peterson29060642009-01-31 22:14:21 +00004808 if (targetsize == 1)
4809 /* 1-1 mapping */
4810 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004811
Benjamin Peterson29060642009-01-31 22:14:21 +00004812 else if (targetsize > 1) {
4813 /* 1-n mapping */
4814 if (targetsize > extrachars) {
4815 /* resize first */
4816 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4817 Py_ssize_t needed = (targetsize - extrachars) + \
4818 (targetsize << 2);
4819 extrachars += needed;
4820 /* XXX overflow detection missing */
4821 if (_PyUnicode_Resize(&v,
4822 PyUnicode_GET_SIZE(v) + needed) < 0) {
4823 Py_DECREF(x);
4824 goto onError;
4825 }
4826 p = PyUnicode_AS_UNICODE(v) + oldpos;
4827 }
4828 Py_UNICODE_COPY(p,
4829 PyUnicode_AS_UNICODE(x),
4830 targetsize);
4831 p += targetsize;
4832 extrachars -= targetsize;
4833 }
4834 /* 1-0 mapping: skip the character */
4835 }
4836 else {
4837 /* wrong return value */
4838 PyErr_SetString(PyExc_TypeError,
4839 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00004840 Py_DECREF(x);
4841 goto onError;
4842 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004843 Py_DECREF(x);
4844 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004845 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004846 }
4847 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004848 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4849 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004850 Py_XDECREF(errorHandler);
4851 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004852 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004853
Benjamin Peterson29060642009-01-31 22:14:21 +00004854 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004855 Py_XDECREF(errorHandler);
4856 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004857 Py_XDECREF(v);
4858 return NULL;
4859}
4860
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004861/* Charmap encoding: the lookup table */
4862
4863struct encoding_map{
Benjamin Peterson29060642009-01-31 22:14:21 +00004864 PyObject_HEAD
4865 unsigned char level1[32];
4866 int count2, count3;
4867 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004868};
4869
4870static PyObject*
4871encoding_map_size(PyObject *obj, PyObject* args)
4872{
4873 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004874 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00004875 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004876}
4877
4878static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00004879 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00004880 PyDoc_STR("Return the size (in bytes) of this object") },
4881 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004882};
4883
4884static void
4885encoding_map_dealloc(PyObject* o)
4886{
Benjamin Peterson14339b62009-01-31 16:36:08 +00004887 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004888}
4889
4890static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00004891 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004892 "EncodingMap", /*tp_name*/
4893 sizeof(struct encoding_map), /*tp_basicsize*/
4894 0, /*tp_itemsize*/
4895 /* methods */
4896 encoding_map_dealloc, /*tp_dealloc*/
4897 0, /*tp_print*/
4898 0, /*tp_getattr*/
4899 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00004900 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00004901 0, /*tp_repr*/
4902 0, /*tp_as_number*/
4903 0, /*tp_as_sequence*/
4904 0, /*tp_as_mapping*/
4905 0, /*tp_hash*/
4906 0, /*tp_call*/
4907 0, /*tp_str*/
4908 0, /*tp_getattro*/
4909 0, /*tp_setattro*/
4910 0, /*tp_as_buffer*/
4911 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4912 0, /*tp_doc*/
4913 0, /*tp_traverse*/
4914 0, /*tp_clear*/
4915 0, /*tp_richcompare*/
4916 0, /*tp_weaklistoffset*/
4917 0, /*tp_iter*/
4918 0, /*tp_iternext*/
4919 encoding_map_methods, /*tp_methods*/
4920 0, /*tp_members*/
4921 0, /*tp_getset*/
4922 0, /*tp_base*/
4923 0, /*tp_dict*/
4924 0, /*tp_descr_get*/
4925 0, /*tp_descr_set*/
4926 0, /*tp_dictoffset*/
4927 0, /*tp_init*/
4928 0, /*tp_alloc*/
4929 0, /*tp_new*/
4930 0, /*tp_free*/
4931 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004932};
4933
4934PyObject*
4935PyUnicode_BuildEncodingMap(PyObject* string)
4936{
4937 Py_UNICODE *decode;
4938 PyObject *result;
4939 struct encoding_map *mresult;
4940 int i;
4941 int need_dict = 0;
4942 unsigned char level1[32];
4943 unsigned char level2[512];
4944 unsigned char *mlevel1, *mlevel2, *mlevel3;
4945 int count2 = 0, count3 = 0;
4946
4947 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4948 PyErr_BadArgument();
4949 return NULL;
4950 }
4951 decode = PyUnicode_AS_UNICODE(string);
4952 memset(level1, 0xFF, sizeof level1);
4953 memset(level2, 0xFF, sizeof level2);
4954
4955 /* If there isn't a one-to-one mapping of NULL to \0,
4956 or if there are non-BMP characters, we need to use
4957 a mapping dictionary. */
4958 if (decode[0] != 0)
4959 need_dict = 1;
4960 for (i = 1; i < 256; i++) {
4961 int l1, l2;
4962 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00004963#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004964 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00004965#endif
4966 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004967 need_dict = 1;
4968 break;
4969 }
4970 if (decode[i] == 0xFFFE)
4971 /* unmapped character */
4972 continue;
4973 l1 = decode[i] >> 11;
4974 l2 = decode[i] >> 7;
4975 if (level1[l1] == 0xFF)
4976 level1[l1] = count2++;
4977 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00004978 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004979 }
4980
4981 if (count2 >= 0xFF || count3 >= 0xFF)
4982 need_dict = 1;
4983
4984 if (need_dict) {
4985 PyObject *result = PyDict_New();
4986 PyObject *key, *value;
4987 if (!result)
4988 return NULL;
4989 for (i = 0; i < 256; i++) {
4990 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00004991 key = PyLong_FromLong(decode[i]);
4992 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004993 if (!key || !value)
4994 goto failed1;
4995 if (PyDict_SetItem(result, key, value) == -1)
4996 goto failed1;
4997 Py_DECREF(key);
4998 Py_DECREF(value);
4999 }
5000 return result;
5001 failed1:
5002 Py_XDECREF(key);
5003 Py_XDECREF(value);
5004 Py_DECREF(result);
5005 return NULL;
5006 }
5007
5008 /* Create a three-level trie */
5009 result = PyObject_MALLOC(sizeof(struct encoding_map) +
5010 16*count2 + 128*count3 - 1);
5011 if (!result)
5012 return PyErr_NoMemory();
5013 PyObject_Init(result, &EncodingMapType);
5014 mresult = (struct encoding_map*)result;
5015 mresult->count2 = count2;
5016 mresult->count3 = count3;
5017 mlevel1 = mresult->level1;
5018 mlevel2 = mresult->level23;
5019 mlevel3 = mresult->level23 + 16*count2;
5020 memcpy(mlevel1, level1, 32);
5021 memset(mlevel2, 0xFF, 16*count2);
5022 memset(mlevel3, 0, 128*count3);
5023 count3 = 0;
5024 for (i = 1; i < 256; i++) {
5025 int o1, o2, o3, i2, i3;
5026 if (decode[i] == 0xFFFE)
5027 /* unmapped character */
5028 continue;
5029 o1 = decode[i]>>11;
5030 o2 = (decode[i]>>7) & 0xF;
5031 i2 = 16*mlevel1[o1] + o2;
5032 if (mlevel2[i2] == 0xFF)
5033 mlevel2[i2] = count3++;
5034 o3 = decode[i] & 0x7F;
5035 i3 = 128*mlevel2[i2] + o3;
5036 mlevel3[i3] = i;
5037 }
5038 return result;
5039}
5040
5041static int
5042encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
5043{
5044 struct encoding_map *map = (struct encoding_map*)mapping;
5045 int l1 = c>>11;
5046 int l2 = (c>>7) & 0xF;
5047 int l3 = c & 0x7F;
5048 int i;
5049
5050#ifdef Py_UNICODE_WIDE
5051 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005052 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005053 }
5054#endif
5055 if (c == 0)
5056 return 0;
5057 /* level 1*/
5058 i = map->level1[l1];
5059 if (i == 0xFF) {
5060 return -1;
5061 }
5062 /* level 2*/
5063 i = map->level23[16*i+l2];
5064 if (i == 0xFF) {
5065 return -1;
5066 }
5067 /* level 3 */
5068 i = map->level23[16*map->count2 + 128*i + l3];
5069 if (i == 0) {
5070 return -1;
5071 }
5072 return i;
5073}
5074
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005075/* Lookup the character ch in the mapping. If the character
5076 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00005077 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005078static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005079{
Christian Heimes217cfd12007-12-02 14:31:20 +00005080 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005081 PyObject *x;
5082
5083 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005084 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005085 x = PyObject_GetItem(mapping, w);
5086 Py_DECREF(w);
5087 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005088 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5089 /* No mapping found means: mapping is undefined. */
5090 PyErr_Clear();
5091 x = Py_None;
5092 Py_INCREF(x);
5093 return x;
5094 } else
5095 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005096 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00005097 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005098 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00005099 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005100 long value = PyLong_AS_LONG(x);
5101 if (value < 0 || value > 255) {
5102 PyErr_SetString(PyExc_TypeError,
5103 "character mapping must be in range(256)");
5104 Py_DECREF(x);
5105 return NULL;
5106 }
5107 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005108 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005109 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00005110 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005111 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005112 /* wrong return value */
5113 PyErr_Format(PyExc_TypeError,
5114 "character mapping must return integer, bytes or None, not %.400s",
5115 x->ob_type->tp_name);
5116 Py_DECREF(x);
5117 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005118 }
5119}
5120
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005121static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005122charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005123{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005124 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5125 /* exponentially overallocate to minimize reallocations */
5126 if (requiredsize < 2*outsize)
5127 requiredsize = 2*outsize;
5128 if (_PyBytes_Resize(outobj, requiredsize))
5129 return -1;
5130 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005131}
5132
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the replacement was written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* the lookup or the output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005136/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005137 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005138 space is available. Return a new reference to the object that
5139 was put in the output buffer, or Py_None, if the mapping was undefined
5140 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005141 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005142static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005143charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00005144 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005145{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005146 PyObject *rep;
5147 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005148 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005149
Christian Heimes90aa7642007-12-19 02:45:37 +00005150 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005151 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005152 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005153 if (res == -1)
5154 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005155 if (outsize<requiredsize)
5156 if (charmapencode_resize(outobj, outpos, requiredsize))
5157 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005158 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005159 outstart[(*outpos)++] = (char)res;
5160 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005161 }
5162
5163 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005164 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005165 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005166 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005167 Py_DECREF(rep);
5168 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005169 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005170 if (PyLong_Check(rep)) {
5171 Py_ssize_t requiredsize = *outpos+1;
5172 if (outsize<requiredsize)
5173 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5174 Py_DECREF(rep);
5175 return enc_EXCEPTION;
5176 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005177 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005178 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005179 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005180 else {
5181 const char *repchars = PyBytes_AS_STRING(rep);
5182 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5183 Py_ssize_t requiredsize = *outpos+repsize;
5184 if (outsize<requiredsize)
5185 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5186 Py_DECREF(rep);
5187 return enc_EXCEPTION;
5188 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005189 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005190 memcpy(outstart + *outpos, repchars, repsize);
5191 *outpos += repsize;
5192 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005193 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005194 Py_DECREF(rep);
5195 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005196}
5197
5198/* handle an error in PyUnicode_EncodeCharmap
5199 Return 0 on success, -1 on error */
5200static
5201int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005202 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005203 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005204 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005205 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005206{
5207 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005208 Py_ssize_t repsize;
5209 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005210 Py_UNICODE *uni2;
5211 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005212 Py_ssize_t collstartpos = *inpos;
5213 Py_ssize_t collendpos = *inpos+1;
5214 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005215 char *encoding = "charmap";
5216 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005217 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005218
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005219 /* find all unencodable characters */
5220 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005221 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005222 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005223 int res = encoding_map_lookup(p[collendpos], mapping);
5224 if (res != -1)
5225 break;
5226 ++collendpos;
5227 continue;
5228 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005229
Benjamin Peterson29060642009-01-31 22:14:21 +00005230 rep = charmapencode_lookup(p[collendpos], mapping);
5231 if (rep==NULL)
5232 return -1;
5233 else if (rep!=Py_None) {
5234 Py_DECREF(rep);
5235 break;
5236 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005237 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005238 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005239 }
5240 /* cache callback name lookup
5241 * (if not done yet, i.e. it's the first error) */
5242 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005243 if ((errors==NULL) || (!strcmp(errors, "strict")))
5244 *known_errorHandler = 1;
5245 else if (!strcmp(errors, "replace"))
5246 *known_errorHandler = 2;
5247 else if (!strcmp(errors, "ignore"))
5248 *known_errorHandler = 3;
5249 else if (!strcmp(errors, "xmlcharrefreplace"))
5250 *known_errorHandler = 4;
5251 else
5252 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005253 }
5254 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005255 case 1: /* strict */
5256 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5257 return -1;
5258 case 2: /* replace */
5259 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005260 x = charmapencode_output('?', mapping, res, respos);
5261 if (x==enc_EXCEPTION) {
5262 return -1;
5263 }
5264 else if (x==enc_FAILED) {
5265 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5266 return -1;
5267 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005268 }
5269 /* fall through */
5270 case 3: /* ignore */
5271 *inpos = collendpos;
5272 break;
5273 case 4: /* xmlcharrefreplace */
5274 /* generate replacement (temporarily (mis)uses p) */
5275 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005276 char buffer[2+29+1+1];
5277 char *cp;
5278 sprintf(buffer, "&#%d;", (int)p[collpos]);
5279 for (cp = buffer; *cp; ++cp) {
5280 x = charmapencode_output(*cp, mapping, res, respos);
5281 if (x==enc_EXCEPTION)
5282 return -1;
5283 else if (x==enc_FAILED) {
5284 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5285 return -1;
5286 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005287 }
5288 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005289 *inpos = collendpos;
5290 break;
5291 default:
5292 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005293 encoding, reason, p, size, exceptionObject,
5294 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005295 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005296 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005297 if (PyBytes_Check(repunicode)) {
5298 /* Directly copy bytes result to output. */
5299 Py_ssize_t outsize = PyBytes_Size(*res);
5300 Py_ssize_t requiredsize;
5301 repsize = PyBytes_Size(repunicode);
5302 requiredsize = *respos + repsize;
5303 if (requiredsize > outsize)
5304 /* Make room for all additional bytes. */
5305 if (charmapencode_resize(res, respos, requiredsize)) {
5306 Py_DECREF(repunicode);
5307 return -1;
5308 }
5309 memcpy(PyBytes_AsString(*res) + *respos,
5310 PyBytes_AsString(repunicode), repsize);
5311 *respos += repsize;
5312 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005313 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005314 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005315 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005316 /* generate replacement */
5317 repsize = PyUnicode_GET_SIZE(repunicode);
5318 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005319 x = charmapencode_output(*uni2, mapping, res, respos);
5320 if (x==enc_EXCEPTION) {
5321 return -1;
5322 }
5323 else if (x==enc_FAILED) {
5324 Py_DECREF(repunicode);
5325 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5326 return -1;
5327 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005328 }
5329 *inpos = newpos;
5330 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005331 }
5332 return 0;
5333}
5334
Guido van Rossumd57fd912000-03-10 22:53:23 +00005335PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005336 Py_ssize_t size,
5337 PyObject *mapping,
5338 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005339{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005340 /* output object */
5341 PyObject *res = NULL;
5342 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005343 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005344 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005345 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005346 PyObject *errorHandler = NULL;
5347 PyObject *exc = NULL;
5348 /* the following variable is used for caching string comparisons
5349 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5350 * 3=ignore, 4=xmlcharrefreplace */
5351 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005352
5353 /* Default to Latin-1 */
5354 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005355 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005356
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005357 /* allocate enough for a simple encoding without
5358 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005359 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005360 if (res == NULL)
5361 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005362 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005363 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005364
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005365 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005366 /* try to encode it */
5367 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5368 if (x==enc_EXCEPTION) /* error */
5369 goto onError;
5370 if (x==enc_FAILED) { /* unencodable character */
5371 if (charmap_encoding_error(p, size, &inpos, mapping,
5372 &exc,
5373 &known_errorHandler, &errorHandler, errors,
5374 &res, &respos)) {
5375 goto onError;
5376 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005377 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005378 else
5379 /* done with this character => adjust input position */
5380 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005381 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005382
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005383 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005384 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005385 if (_PyBytes_Resize(&res, respos) < 0)
5386 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005387
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005388 Py_XDECREF(exc);
5389 Py_XDECREF(errorHandler);
5390 return res;
5391
Benjamin Peterson29060642009-01-31 22:14:21 +00005392 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005393 Py_XDECREF(res);
5394 Py_XDECREF(exc);
5395 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005396 return NULL;
5397}
5398
5399PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00005400 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005401{
5402 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005403 PyErr_BadArgument();
5404 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005405 }
5406 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005407 PyUnicode_GET_SIZE(unicode),
5408 mapping,
5409 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005410}
5411
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005412/* create or adjust a UnicodeTranslateError */
5413static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005414 const Py_UNICODE *unicode, Py_ssize_t size,
5415 Py_ssize_t startpos, Py_ssize_t endpos,
5416 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005417{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005418 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005419 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005420 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005421 }
5422 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005423 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5424 goto onError;
5425 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5426 goto onError;
5427 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5428 goto onError;
5429 return;
5430 onError:
5431 Py_DECREF(*exceptionObject);
5432 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005433 }
5434}
5435
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005436/* raises a UnicodeTranslateError */
5437static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005438 const Py_UNICODE *unicode, Py_ssize_t size,
5439 Py_ssize_t startpos, Py_ssize_t endpos,
5440 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005441{
5442 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005443 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005444 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005445 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005446}
5447
5448/* error handling callback helper:
5449 build arguments, call the callback and check the arguments,
5450 put the result into newpos and return the replacement string, which
5451 has to be freed by the caller */
5452static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00005453 PyObject **errorHandler,
5454 const char *reason,
5455 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5456 Py_ssize_t startpos, Py_ssize_t endpos,
5457 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005458{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005459 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005460
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005461 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005462 PyObject *restuple;
5463 PyObject *resunicode;
5464
5465 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005466 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005467 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005468 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005469 }
5470
5471 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005472 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005473 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005474 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005475
5476 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005477 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005478 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005479 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005480 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005481 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005482 Py_DECREF(restuple);
5483 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005484 }
5485 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00005486 &resunicode, &i_newpos)) {
5487 Py_DECREF(restuple);
5488 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005489 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005490 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005491 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005492 else
5493 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005494 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005495 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5496 Py_DECREF(restuple);
5497 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005498 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005499 Py_INCREF(resunicode);
5500 Py_DECREF(restuple);
5501 return resunicode;
5502}
5503
5504/* Lookup the character ch in the mapping and put the result in result,
5505 which must be decrefed by the caller.
5506 Return 0 on success, -1 on error */
5507static
5508int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5509{
Christian Heimes217cfd12007-12-02 14:31:20 +00005510 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005511 PyObject *x;
5512
5513 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005514 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005515 x = PyObject_GetItem(mapping, w);
5516 Py_DECREF(w);
5517 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005518 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5519 /* No mapping found means: use 1:1 mapping. */
5520 PyErr_Clear();
5521 *result = NULL;
5522 return 0;
5523 } else
5524 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005525 }
5526 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005527 *result = x;
5528 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005529 }
Christian Heimes217cfd12007-12-02 14:31:20 +00005530 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005531 long value = PyLong_AS_LONG(x);
5532 long max = PyUnicode_GetMax();
5533 if (value < 0 || value > max) {
5534 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00005535 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00005536 Py_DECREF(x);
5537 return -1;
5538 }
5539 *result = x;
5540 return 0;
5541 }
5542 else if (PyUnicode_Check(x)) {
5543 *result = x;
5544 return 0;
5545 }
5546 else {
5547 /* wrong return value */
5548 PyErr_SetString(PyExc_TypeError,
5549 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005550 Py_DECREF(x);
5551 return -1;
5552 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005553}
5554/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00005555 if not reallocate and adjust various state variables.
5556 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005557static
Walter Dörwald4894c302003-10-24 14:25:28 +00005558int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005559 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005560{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005561 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00005562 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005563 /* remember old output position */
5564 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
5565 /* exponentially overallocate to minimize reallocations */
5566 if (requiredsize < 2 * oldsize)
5567 requiredsize = 2 * oldsize;
5568 if (PyUnicode_Resize(outobj, requiredsize) < 0)
5569 return -1;
5570 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005571 }
5572 return 0;
5573}
5574/* lookup the character, put the result in the output string and adjust
5575 various state variables. Return a new reference to the object that
5576 was put in the output buffer in *result, or Py_None, if the mapping was
5577 undefined (in which case no character was written).
5578 The called must decref result.
5579 Return 0 on success, -1 on error. */
5580static
Walter Dörwald4894c302003-10-24 14:25:28 +00005581int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005582 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5583 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005584{
Walter Dörwald4894c302003-10-24 14:25:28 +00005585 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00005586 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005587 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005588 /* not found => default to 1:1 mapping */
5589 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005590 }
5591 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005592 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00005593 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005594 /* no overflow check, because we know that the space is enough */
5595 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005596 }
5597 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005598 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5599 if (repsize==1) {
5600 /* no overflow check, because we know that the space is enough */
5601 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5602 }
5603 else if (repsize!=0) {
5604 /* more than one character */
5605 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5606 (insize - (curinp-startinp)) +
5607 repsize - 1;
5608 if (charmaptranslate_makespace(outobj, outp, requiredsize))
5609 return -1;
5610 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5611 *outp += repsize;
5612 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005613 }
5614 else
Benjamin Peterson29060642009-01-31 22:14:21 +00005615 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005616 return 0;
5617}
5618
5619PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005620 Py_ssize_t size,
5621 PyObject *mapping,
5622 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005623{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005624 /* output object */
5625 PyObject *res = NULL;
5626 /* pointers to the beginning and end+1 of input */
5627 const Py_UNICODE *startp = p;
5628 const Py_UNICODE *endp = p + size;
5629 /* pointer into the output */
5630 Py_UNICODE *str;
5631 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005632 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005633 char *reason = "character maps to <undefined>";
5634 PyObject *errorHandler = NULL;
5635 PyObject *exc = NULL;
5636 /* the following variable is used for caching string comparisons
5637 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5638 * 3=ignore, 4=xmlcharrefreplace */
5639 int known_errorHandler = -1;
5640
Guido van Rossumd57fd912000-03-10 22:53:23 +00005641 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005642 PyErr_BadArgument();
5643 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005644 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005645
5646 /* allocate enough for a simple 1:1 translation without
5647 replacements, if we need more, we'll resize */
5648 res = PyUnicode_FromUnicode(NULL, size);
5649 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005650 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005651 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005652 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005653 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005654
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005655 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005656 /* try to encode it */
5657 PyObject *x = NULL;
5658 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5659 Py_XDECREF(x);
5660 goto onError;
5661 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005662 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00005663 if (x!=Py_None) /* it worked => adjust input pointer */
5664 ++p;
5665 else { /* untranslatable character */
5666 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5667 Py_ssize_t repsize;
5668 Py_ssize_t newpos;
5669 Py_UNICODE *uni2;
5670 /* startpos for collecting untranslatable chars */
5671 const Py_UNICODE *collstart = p;
5672 const Py_UNICODE *collend = p+1;
5673 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005674
Benjamin Peterson29060642009-01-31 22:14:21 +00005675 /* find all untranslatable characters */
5676 while (collend < endp) {
5677 if (charmaptranslate_lookup(*collend, mapping, &x))
5678 goto onError;
5679 Py_XDECREF(x);
5680 if (x!=Py_None)
5681 break;
5682 ++collend;
5683 }
5684 /* cache callback name lookup
5685 * (if not done yet, i.e. it's the first error) */
5686 if (known_errorHandler==-1) {
5687 if ((errors==NULL) || (!strcmp(errors, "strict")))
5688 known_errorHandler = 1;
5689 else if (!strcmp(errors, "replace"))
5690 known_errorHandler = 2;
5691 else if (!strcmp(errors, "ignore"))
5692 known_errorHandler = 3;
5693 else if (!strcmp(errors, "xmlcharrefreplace"))
5694 known_errorHandler = 4;
5695 else
5696 known_errorHandler = 0;
5697 }
5698 switch (known_errorHandler) {
5699 case 1: /* strict */
5700 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005701 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005702 case 2: /* replace */
5703 /* No need to check for space, this is a 1:1 replacement */
5704 for (coll = collstart; coll<collend; ++coll)
5705 *str++ = '?';
5706 /* fall through */
5707 case 3: /* ignore */
5708 p = collend;
5709 break;
5710 case 4: /* xmlcharrefreplace */
5711 /* generate replacement (temporarily (mis)uses p) */
5712 for (p = collstart; p < collend; ++p) {
5713 char buffer[2+29+1+1];
5714 char *cp;
5715 sprintf(buffer, "&#%d;", (int)*p);
5716 if (charmaptranslate_makespace(&res, &str,
5717 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5718 goto onError;
5719 for (cp = buffer; *cp; ++cp)
5720 *str++ = *cp;
5721 }
5722 p = collend;
5723 break;
5724 default:
5725 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5726 reason, startp, size, &exc,
5727 collstart-startp, collend-startp, &newpos);
5728 if (repunicode == NULL)
5729 goto onError;
5730 /* generate replacement */
5731 repsize = PyUnicode_GET_SIZE(repunicode);
5732 if (charmaptranslate_makespace(&res, &str,
5733 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5734 Py_DECREF(repunicode);
5735 goto onError;
5736 }
5737 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5738 *str++ = *uni2;
5739 p = startp + newpos;
5740 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005741 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005742 }
5743 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005744 /* Resize if we allocated to much */
5745 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005746 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005747 if (PyUnicode_Resize(&res, respos) < 0)
5748 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005749 }
5750 Py_XDECREF(exc);
5751 Py_XDECREF(errorHandler);
5752 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005753
Benjamin Peterson29060642009-01-31 22:14:21 +00005754 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005755 Py_XDECREF(res);
5756 Py_XDECREF(exc);
5757 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005758 return NULL;
5759}
5760
5761PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005762 PyObject *mapping,
5763 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005764{
5765 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005766
Guido van Rossumd57fd912000-03-10 22:53:23 +00005767 str = PyUnicode_FromObject(str);
5768 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005769 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005770 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00005771 PyUnicode_GET_SIZE(str),
5772 mapping,
5773 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005774 Py_DECREF(str);
5775 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005776
Benjamin Peterson29060642009-01-31 22:14:21 +00005777 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005778 Py_XDECREF(str);
5779 return NULL;
5780}
Tim Petersced69f82003-09-16 20:30:58 +00005781
Guido van Rossum9e896b32000-04-05 20:11:21 +00005782/* --- Decimal Encoder ---------------------------------------------------- */
5783
5784int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005785 Py_ssize_t length,
5786 char *output,
5787 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005788{
5789 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005790 PyObject *errorHandler = NULL;
5791 PyObject *exc = NULL;
5792 const char *encoding = "decimal";
5793 const char *reason = "invalid decimal Unicode string";
5794 /* the following variable is used for caching string comparisons
5795 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5796 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005797
5798 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005799 PyErr_BadArgument();
5800 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005801 }
5802
5803 p = s;
5804 end = s + length;
5805 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005806 register Py_UNICODE ch = *p;
5807 int decimal;
5808 PyObject *repunicode;
5809 Py_ssize_t repsize;
5810 Py_ssize_t newpos;
5811 Py_UNICODE *uni2;
5812 Py_UNICODE *collstart;
5813 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005814
Benjamin Peterson29060642009-01-31 22:14:21 +00005815 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005816 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00005817 ++p;
5818 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005819 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005820 decimal = Py_UNICODE_TODECIMAL(ch);
5821 if (decimal >= 0) {
5822 *output++ = '0' + decimal;
5823 ++p;
5824 continue;
5825 }
5826 if (0 < ch && ch < 256) {
5827 *output++ = (char)ch;
5828 ++p;
5829 continue;
5830 }
5831 /* All other characters are considered unencodable */
5832 collstart = p;
5833 collend = p+1;
5834 while (collend < end) {
5835 if ((0 < *collend && *collend < 256) ||
5836 !Py_UNICODE_ISSPACE(*collend) ||
5837 Py_UNICODE_TODECIMAL(*collend))
5838 break;
5839 }
5840 /* cache callback name lookup
5841 * (if not done yet, i.e. it's the first error) */
5842 if (known_errorHandler==-1) {
5843 if ((errors==NULL) || (!strcmp(errors, "strict")))
5844 known_errorHandler = 1;
5845 else if (!strcmp(errors, "replace"))
5846 known_errorHandler = 2;
5847 else if (!strcmp(errors, "ignore"))
5848 known_errorHandler = 3;
5849 else if (!strcmp(errors, "xmlcharrefreplace"))
5850 known_errorHandler = 4;
5851 else
5852 known_errorHandler = 0;
5853 }
5854 switch (known_errorHandler) {
5855 case 1: /* strict */
5856 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5857 goto onError;
5858 case 2: /* replace */
5859 for (p = collstart; p < collend; ++p)
5860 *output++ = '?';
5861 /* fall through */
5862 case 3: /* ignore */
5863 p = collend;
5864 break;
5865 case 4: /* xmlcharrefreplace */
5866 /* generate replacement (temporarily (mis)uses p) */
5867 for (p = collstart; p < collend; ++p)
5868 output += sprintf(output, "&#%d;", (int)*p);
5869 p = collend;
5870 break;
5871 default:
5872 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5873 encoding, reason, s, length, &exc,
5874 collstart-s, collend-s, &newpos);
5875 if (repunicode == NULL)
5876 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005877 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00005878 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005879 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
5880 Py_DECREF(repunicode);
5881 goto onError;
5882 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005883 /* generate replacement */
5884 repsize = PyUnicode_GET_SIZE(repunicode);
5885 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5886 Py_UNICODE ch = *uni2;
5887 if (Py_UNICODE_ISSPACE(ch))
5888 *output++ = ' ';
5889 else {
5890 decimal = Py_UNICODE_TODECIMAL(ch);
5891 if (decimal >= 0)
5892 *output++ = '0' + decimal;
5893 else if (0 < ch && ch < 256)
5894 *output++ = (char)ch;
5895 else {
5896 Py_DECREF(repunicode);
5897 raise_encode_exception(&exc, encoding,
5898 s, length, collstart-s, collend-s, reason);
5899 goto onError;
5900 }
5901 }
5902 }
5903 p = s + newpos;
5904 Py_DECREF(repunicode);
5905 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005906 }
5907 /* 0-terminate the output string */
5908 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005909 Py_XDECREF(exc);
5910 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005911 return 0;
5912
Benjamin Peterson29060642009-01-31 22:14:21 +00005913 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005914 Py_XDECREF(exc);
5915 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005916 return -1;
5917}
5918
Guido van Rossumd57fd912000-03-10 22:53:23 +00005919/* --- Helpers ------------------------------------------------------------ */
5920
Eric Smith8c663262007-08-25 02:26:07 +00005921#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005922#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00005923
Thomas Wouters477c8d52006-05-27 19:21:47 +00005924#include "stringlib/count.h"
5925#include "stringlib/find.h"
5926#include "stringlib/partition.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00005927#include "stringlib/split.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005928
Eric Smith5807c412008-05-11 21:00:57 +00005929#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00005930#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00005931#include "stringlib/localeutil.h"
5932
/* Helper macro to fix up start/end slice values in place.
 *
 * `start` and `end` must be modifiable lvalues; `len` is the length of the
 * sequence being sliced.  An `end` greater than `len` is clamped to `len`;
 * a negative `start` or `end` is taken relative to `len` and clamped to 0
 * if it is still negative.  `start` is never clamped against `len`, so it
 * may still be larger than `end` (or `len`) afterwards.
 *
 * The body is wrapped in do { ... } while (0) so that the macro expands to
 * exactly one statement and can be used as the unbraced body of an if/else.
 * Every argument is evaluated more than once: do not pass expressions that
 * have side effects.
 */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00005947
Martin v. Löwis18e16552006-02-15 17:27:45 +00005948Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005949 PyObject *substr,
5950 Py_ssize_t start,
5951 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005952{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005953 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005954 PyUnicodeObject* str_obj;
5955 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005956
Thomas Wouters477c8d52006-05-27 19:21:47 +00005957 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5958 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00005959 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005960 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5961 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005962 Py_DECREF(str_obj);
5963 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005964 }
Tim Petersced69f82003-09-16 20:30:58 +00005965
Antoine Pitrouf2c54842010-01-13 08:07:53 +00005966 ADJUST_INDICES(start, end, str_obj->length);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005967 result = stringlib_count(
Antoine Pitrouf2c54842010-01-13 08:07:53 +00005968 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
5969 PY_SSIZE_T_MAX
Thomas Wouters477c8d52006-05-27 19:21:47 +00005970 );
5971
5972 Py_DECREF(sub_obj);
5973 Py_DECREF(str_obj);
5974
Guido van Rossumd57fd912000-03-10 22:53:23 +00005975 return result;
5976}
5977
Martin v. Löwis18e16552006-02-15 17:27:45 +00005978Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005979 PyObject *sub,
5980 Py_ssize_t start,
5981 Py_ssize_t end,
5982 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005983{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005984 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005985
Guido van Rossumd57fd912000-03-10 22:53:23 +00005986 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005987 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00005988 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005989 sub = PyUnicode_FromObject(sub);
5990 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005991 Py_DECREF(str);
5992 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005993 }
Tim Petersced69f82003-09-16 20:30:58 +00005994
Thomas Wouters477c8d52006-05-27 19:21:47 +00005995 if (direction > 0)
5996 result = stringlib_find_slice(
5997 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5998 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5999 start, end
6000 );
6001 else
6002 result = stringlib_rfind_slice(
6003 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6004 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6005 start, end
6006 );
6007
Guido van Rossumd57fd912000-03-10 22:53:23 +00006008 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006009 Py_DECREF(sub);
6010
Guido van Rossumd57fd912000-03-10 22:53:23 +00006011 return result;
6012}
6013
Tim Petersced69f82003-09-16 20:30:58 +00006014static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006015int tailmatch(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006016 PyUnicodeObject *substring,
6017 Py_ssize_t start,
6018 Py_ssize_t end,
6019 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006020{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006021 if (substring->length == 0)
6022 return 1;
6023
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006024 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006025 end -= substring->length;
6026 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00006027 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028
6029 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006030 if (Py_UNICODE_MATCH(self, end, substring))
6031 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006032 } else {
6033 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00006034 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006035 }
6036
6037 return 0;
6038}
6039
Martin v. Löwis18e16552006-02-15 17:27:45 +00006040Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006041 PyObject *substr,
6042 Py_ssize_t start,
6043 Py_ssize_t end,
6044 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006045{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006046 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006047
Guido van Rossumd57fd912000-03-10 22:53:23 +00006048 str = PyUnicode_FromObject(str);
6049 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006050 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006051 substr = PyUnicode_FromObject(substr);
6052 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006053 Py_DECREF(str);
6054 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006055 }
Tim Petersced69f82003-09-16 20:30:58 +00006056
Guido van Rossumd57fd912000-03-10 22:53:23 +00006057 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006058 (PyUnicodeObject *)substr,
6059 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006060 Py_DECREF(str);
6061 Py_DECREF(substr);
6062 return result;
6063}
6064
Guido van Rossumd57fd912000-03-10 22:53:23 +00006065/* Apply fixfct filter to the Unicode object self and return a
6066 reference to the modified object */
6067
Tim Petersced69f82003-09-16 20:30:58 +00006068static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006069PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006070 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006071{
6072
6073 PyUnicodeObject *u;
6074
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006075 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006076 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006077 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006078
6079 Py_UNICODE_COPY(u->str, self->str, self->length);
6080
Tim Peters7a29bd52001-09-12 03:03:31 +00006081 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006082 /* fixfct should return TRUE if it modified the buffer. If
6083 FALSE, return a reference to the original buffer instead
6084 (to save space, not time) */
6085 Py_INCREF(self);
6086 Py_DECREF(u);
6087 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006088 }
6089 return (PyObject*) u;
6090}
6091
Tim Petersced69f82003-09-16 20:30:58 +00006092static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006093int fixupper(PyUnicodeObject *self)
6094{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006095 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006096 Py_UNICODE *s = self->str;
6097 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006098
Guido van Rossumd57fd912000-03-10 22:53:23 +00006099 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006100 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006101
Benjamin Peterson29060642009-01-31 22:14:21 +00006102 ch = Py_UNICODE_TOUPPER(*s);
6103 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006104 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006105 *s = ch;
6106 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006107 s++;
6108 }
6109
6110 return status;
6111}
6112
Tim Petersced69f82003-09-16 20:30:58 +00006113static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006114int fixlower(PyUnicodeObject *self)
6115{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006116 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006117 Py_UNICODE *s = self->str;
6118 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006119
Guido van Rossumd57fd912000-03-10 22:53:23 +00006120 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006121 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006122
Benjamin Peterson29060642009-01-31 22:14:21 +00006123 ch = Py_UNICODE_TOLOWER(*s);
6124 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006125 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006126 *s = ch;
6127 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006128 s++;
6129 }
6130
6131 return status;
6132}
6133
Tim Petersced69f82003-09-16 20:30:58 +00006134static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006135int fixswapcase(PyUnicodeObject *self)
6136{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006137 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006138 Py_UNICODE *s = self->str;
6139 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006140
Guido van Rossumd57fd912000-03-10 22:53:23 +00006141 while (len-- > 0) {
6142 if (Py_UNICODE_ISUPPER(*s)) {
6143 *s = Py_UNICODE_TOLOWER(*s);
6144 status = 1;
6145 } else if (Py_UNICODE_ISLOWER(*s)) {
6146 *s = Py_UNICODE_TOUPPER(*s);
6147 status = 1;
6148 }
6149 s++;
6150 }
6151
6152 return status;
6153}
6154
Tim Petersced69f82003-09-16 20:30:58 +00006155static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006156int fixcapitalize(PyUnicodeObject *self)
6157{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006158 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006159 Py_UNICODE *s = self->str;
6160 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006161
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006162 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006163 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006164 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006165 *s = Py_UNICODE_TOUPPER(*s);
6166 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006167 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006168 s++;
6169 while (--len > 0) {
6170 if (Py_UNICODE_ISUPPER(*s)) {
6171 *s = Py_UNICODE_TOLOWER(*s);
6172 status = 1;
6173 }
6174 s++;
6175 }
6176 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006177}
6178
6179static
6180int fixtitle(PyUnicodeObject *self)
6181{
6182 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6183 register Py_UNICODE *e;
6184 int previous_is_cased;
6185
6186 /* Shortcut for single character strings */
6187 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006188 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6189 if (*p != ch) {
6190 *p = ch;
6191 return 1;
6192 }
6193 else
6194 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006195 }
Tim Petersced69f82003-09-16 20:30:58 +00006196
Guido van Rossumd57fd912000-03-10 22:53:23 +00006197 e = p + PyUnicode_GET_SIZE(self);
6198 previous_is_cased = 0;
6199 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006200 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006201
Benjamin Peterson29060642009-01-31 22:14:21 +00006202 if (previous_is_cased)
6203 *p = Py_UNICODE_TOLOWER(ch);
6204 else
6205 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006206
Benjamin Peterson29060642009-01-31 22:14:21 +00006207 if (Py_UNICODE_ISLOWER(ch) ||
6208 Py_UNICODE_ISUPPER(ch) ||
6209 Py_UNICODE_ISTITLE(ch))
6210 previous_is_cased = 1;
6211 else
6212 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006213 }
6214 return 1;
6215}
6216
Tim Peters8ce9f162004-08-27 01:49:32 +00006217PyObject *
6218PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006219{
Skip Montanaro6543b452004-09-16 03:28:13 +00006220 const Py_UNICODE blank = ' ';
6221 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006222 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006223 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006224 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6225 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006226 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6227 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006228 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006229 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006230
Tim Peters05eba1f2004-08-27 21:32:02 +00006231 fseq = PySequence_Fast(seq, "");
6232 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006233 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006234 }
6235
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006236 /* NOTE: the following code can't call back into Python code,
6237 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006238 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006239
Tim Peters05eba1f2004-08-27 21:32:02 +00006240 seqlen = PySequence_Fast_GET_SIZE(fseq);
6241 /* If empty sequence, return u"". */
6242 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006243 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6244 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006245 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006246 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006247 /* If singleton sequence with an exact Unicode, return that. */
6248 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006249 item = items[0];
6250 if (PyUnicode_CheckExact(item)) {
6251 Py_INCREF(item);
6252 res = (PyUnicodeObject *)item;
6253 goto Done;
6254 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006255 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006256 else {
6257 /* Set up sep and seplen */
6258 if (separator == NULL) {
6259 sep = &blank;
6260 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006261 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006262 else {
6263 if (!PyUnicode_Check(separator)) {
6264 PyErr_Format(PyExc_TypeError,
6265 "separator: expected str instance,"
6266 " %.80s found",
6267 Py_TYPE(separator)->tp_name);
6268 goto onError;
6269 }
6270 sep = PyUnicode_AS_UNICODE(separator);
6271 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006272 }
6273 }
6274
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006275 /* There are at least two things to join, or else we have a subclass
6276 * of str in the sequence.
6277 * Do a pre-pass to figure out the total amount of space we'll
6278 * need (sz), and see whether all argument are strings.
6279 */
6280 sz = 0;
6281 for (i = 0; i < seqlen; i++) {
6282 const Py_ssize_t old_sz = sz;
6283 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006284 if (!PyUnicode_Check(item)) {
6285 PyErr_Format(PyExc_TypeError,
6286 "sequence item %zd: expected str instance,"
6287 " %.80s found",
6288 i, Py_TYPE(item)->tp_name);
6289 goto onError;
6290 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006291 sz += PyUnicode_GET_SIZE(item);
6292 if (i != 0)
6293 sz += seplen;
6294 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6295 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006296 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006297 goto onError;
6298 }
6299 }
Tim Petersced69f82003-09-16 20:30:58 +00006300
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006301 res = _PyUnicode_New(sz);
6302 if (res == NULL)
6303 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006304
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006305 /* Catenate everything. */
6306 res_p = PyUnicode_AS_UNICODE(res);
6307 for (i = 0; i < seqlen; ++i) {
6308 Py_ssize_t itemlen;
6309 item = items[i];
6310 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006311 /* Copy item, and maybe the separator. */
6312 if (i) {
6313 Py_UNICODE_COPY(res_p, sep, seplen);
6314 res_p += seplen;
6315 }
6316 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6317 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006318 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006319
Benjamin Peterson29060642009-01-31 22:14:21 +00006320 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006321 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006322 return (PyObject *)res;
6323
Benjamin Peterson29060642009-01-31 22:14:21 +00006324 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006325 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006326 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006327 return NULL;
6328}
6329
Tim Petersced69f82003-09-16 20:30:58 +00006330static
6331PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006332 Py_ssize_t left,
6333 Py_ssize_t right,
6334 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006335{
6336 PyUnicodeObject *u;
6337
6338 if (left < 0)
6339 left = 0;
6340 if (right < 0)
6341 right = 0;
6342
Tim Peters7a29bd52001-09-12 03:03:31 +00006343 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006344 Py_INCREF(self);
6345 return self;
6346 }
6347
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006348 if (left > PY_SSIZE_T_MAX - self->length ||
6349 right > PY_SSIZE_T_MAX - (left + self->length)) {
6350 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6351 return NULL;
6352 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006353 u = _PyUnicode_New(left + self->length + right);
6354 if (u) {
6355 if (left)
6356 Py_UNICODE_FILL(u->str, fill, left);
6357 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6358 if (right)
6359 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6360 }
6361
6362 return u;
6363}
6364
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006365PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006366{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006367 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006368
6369 string = PyUnicode_FromObject(string);
6370 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006371 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006372
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006373 list = stringlib_splitlines(
6374 (PyObject*) string, PyUnicode_AS_UNICODE(string),
6375 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006376
6377 Py_DECREF(string);
6378 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006379}
6380
Tim Petersced69f82003-09-16 20:30:58 +00006381static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006382PyObject *split(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006383 PyUnicodeObject *substring,
6384 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006385{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006386 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006387 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006388
Guido van Rossumd57fd912000-03-10 22:53:23 +00006389 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006390 return stringlib_split_whitespace(
6391 (PyObject*) self, self->str, self->length, maxcount
6392 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006393
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006394 return stringlib_split(
6395 (PyObject*) self, self->str, self->length,
6396 substring->str, substring->length,
6397 maxcount
6398 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006399}
6400
Tim Petersced69f82003-09-16 20:30:58 +00006401static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006402PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006403 PyUnicodeObject *substring,
6404 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006405{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006406 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006407 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006408
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006409 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006410 return stringlib_rsplit_whitespace(
6411 (PyObject*) self, self->str, self->length, maxcount
6412 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006413
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006414 return stringlib_rsplit(
6415 (PyObject*) self, self->str, self->length,
6416 substring->str, substring->length,
6417 maxcount
6418 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006419}
6420
6421static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006422PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006423 PyUnicodeObject *str1,
6424 PyUnicodeObject *str2,
6425 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006426{
6427 PyUnicodeObject *u;
6428
6429 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006430 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006431 else if (maxcount == 0 || self->length == 0)
6432 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006433
Thomas Wouters477c8d52006-05-27 19:21:47 +00006434 if (str1->length == str2->length) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00006435 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006436 /* same length */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006437 if (str1->length == 0)
6438 goto nothing;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006439 if (str1->length == 1) {
6440 /* replace characters */
6441 Py_UNICODE u1, u2;
6442 if (!findchar(self->str, self->length, str1->str[0]))
6443 goto nothing;
6444 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6445 if (!u)
6446 return NULL;
6447 Py_UNICODE_COPY(u->str, self->str, self->length);
6448 u1 = str1->str[0];
6449 u2 = str2->str[0];
6450 for (i = 0; i < u->length; i++)
6451 if (u->str[i] == u1) {
6452 if (--maxcount < 0)
6453 break;
6454 u->str[i] = u2;
6455 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006456 } else {
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006457 i = stringlib_find(
6458 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006460 if (i < 0)
6461 goto nothing;
6462 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6463 if (!u)
6464 return NULL;
6465 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006466
6467 /* change everything in-place, starting with this one */
6468 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6469 i += str1->length;
6470
6471 while ( --maxcount > 0) {
6472 i = stringlib_find(self->str+i, self->length-i,
6473 str1->str, str1->length,
6474 i);
6475 if (i == -1)
6476 break;
6477 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6478 i += str1->length;
6479 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006480 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006481 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006482
6483 Py_ssize_t n, i, j, e;
6484 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006485 Py_UNICODE *p;
6486
6487 /* replace strings */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006488 n = stringlib_count(self->str, self->length, str1->str, str1->length,
6489 maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006490 if (n == 0)
6491 goto nothing;
6492 /* new_size = self->length + n * (str2->length - str1->length)); */
6493 delta = (str2->length - str1->length);
6494 if (delta == 0) {
6495 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006496 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006497 product = n * (str2->length - str1->length);
6498 if ((product / (str2->length - str1->length)) != n) {
6499 PyErr_SetString(PyExc_OverflowError,
6500 "replace string is too long");
6501 return NULL;
6502 }
6503 new_size = self->length + product;
6504 if (new_size < 0) {
6505 PyErr_SetString(PyExc_OverflowError,
6506 "replace string is too long");
6507 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006508 }
6509 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006510 u = _PyUnicode_New(new_size);
6511 if (!u)
6512 return NULL;
6513 i = 0;
6514 p = u->str;
6515 e = self->length - str1->length;
6516 if (str1->length > 0) {
6517 while (n-- > 0) {
6518 /* look for next match */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006519 j = stringlib_find(self->str+i, self->length-i,
6520 str1->str, str1->length,
6521 i);
6522 if (j == -1)
6523 break;
6524 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006525 /* copy unchanged part [i:j] */
6526 Py_UNICODE_COPY(p, self->str+i, j-i);
6527 p += j - i;
6528 }
6529 /* copy substitution string */
6530 if (str2->length > 0) {
6531 Py_UNICODE_COPY(p, str2->str, str2->length);
6532 p += str2->length;
6533 }
6534 i = j + str1->length;
6535 }
6536 if (i < self->length)
6537 /* copy tail [i:] */
6538 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6539 } else {
6540 /* interleave */
6541 while (n > 0) {
6542 Py_UNICODE_COPY(p, str2->str, str2->length);
6543 p += str2->length;
6544 if (--n <= 0)
6545 break;
6546 *p++ = self->str[i++];
6547 }
6548 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6549 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006550 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006551 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006552
Benjamin Peterson29060642009-01-31 22:14:21 +00006553 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00006554 /* nothing to replace; return original string (when possible) */
6555 if (PyUnicode_CheckExact(self)) {
6556 Py_INCREF(self);
6557 return (PyObject *) self;
6558 }
6559 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006560}
6561
6562/* --- Unicode Object Methods --------------------------------------------- */
6563
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006564PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006565 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006566\n\
6567Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006568characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006569
6570static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006571unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006572{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006573 return fixup(self, fixtitle);
6574}
6575
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006576PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006577 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006578\n\
6579Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006580have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006581
6582static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006583unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006584{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006585 return fixup(self, fixcapitalize);
6586}
6587
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the surrounding #if 0).
 *
 * Splits self on whitespace, replaces every word in the resulting list by
 * its fixcapitalize'd version and joins the words with the default
 * separator of PyUnicode_Join().  Returns NULL if any step fails.
 */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t i;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word */
    for (i = 0; i < PyList_GET_SIZE(words); i++) {
        result = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, i),
                       fixcapitalize);
        if (result == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* Drop the list's reference to the old word, then store the new
           one (PyList_SET_ITEM does not release the previous item). */
        Py_DECREF(PyList_GET_ITEM(words, i));
        PyList_SET_ITEM(words, i, result);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
6625
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006626/* Argument converter. Coerces to a single unicode character */
6627
6628static int
6629convert_uc(PyObject *obj, void *addr)
6630{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006631 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6632 PyObject *uniobj;
6633 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006634
Benjamin Peterson14339b62009-01-31 16:36:08 +00006635 uniobj = PyUnicode_FromObject(obj);
6636 if (uniobj == NULL) {
6637 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006638 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006639 return 0;
6640 }
6641 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6642 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006643 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006644 Py_DECREF(uniobj);
6645 return 0;
6646 }
6647 unistr = PyUnicode_AS_UNICODE(uniobj);
6648 *fillcharloc = unistr[0];
6649 Py_DECREF(uniobj);
6650 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006651}
6652
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006653PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006654 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006655\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006656Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006657done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006658
6659static PyObject *
6660unicode_center(PyUnicodeObject *self, PyObject *args)
6661{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006662 Py_ssize_t marg, left;
6663 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006664 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006665
Thomas Woutersde017742006-02-16 19:34:37 +00006666 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006667 return NULL;
6668
Tim Peters7a29bd52001-09-12 03:03:31 +00006669 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006670 Py_INCREF(self);
6671 return (PyObject*) self;
6672 }
6673
6674 marg = width - self->length;
6675 left = marg / 2 + (marg & width & 1);
6676
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006677 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006678}
6679
Marc-André Lemburge5034372000-08-08 08:04:29 +00006680#if 0
6681
6682/* This code should go into some future Unicode collation support
6683 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00006684 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006685
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006686/* speedy UTF-16 code point order comparison */
6687/* gleaned from: */
6688/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6689
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006690static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006691{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006692 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006693 0, 0, 0, 0, 0, 0, 0, 0,
6694 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006695 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006696};
6697
Guido van Rossumd57fd912000-03-10 22:53:23 +00006698static int
6699unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6700{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006701 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006702
Guido van Rossumd57fd912000-03-10 22:53:23 +00006703 Py_UNICODE *s1 = str1->str;
6704 Py_UNICODE *s2 = str2->str;
6705
6706 len1 = str1->length;
6707 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006708
Guido van Rossumd57fd912000-03-10 22:53:23 +00006709 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006710 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006711
6712 c1 = *s1++;
6713 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006714
Benjamin Peterson29060642009-01-31 22:14:21 +00006715 if (c1 > (1<<11) * 26)
6716 c1 += utf16Fixup[c1>>11];
6717 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006718 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006719 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006720
6721 if (c1 != c2)
6722 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006723
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006724 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006725 }
6726
6727 return (len1 < len2) ? -1 : (len1 != len2);
6728}
6729
Marc-André Lemburge5034372000-08-08 08:04:29 +00006730#else
6731
6732static int
6733unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6734{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006735 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006736
6737 Py_UNICODE *s1 = str1->str;
6738 Py_UNICODE *s2 = str2->str;
6739
6740 len1 = str1->length;
6741 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006742
Marc-André Lemburge5034372000-08-08 08:04:29 +00006743 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006744 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006745
Fredrik Lundh45714e92001-06-26 16:39:36 +00006746 c1 = *s1++;
6747 c2 = *s2++;
6748
6749 if (c1 != c2)
6750 return (c1 < c2) ? -1 : 1;
6751
Marc-André Lemburge5034372000-08-08 08:04:29 +00006752 len1--; len2--;
6753 }
6754
6755 return (len1 < len2) ? -1 : (len1 != len2);
6756}
6757
6758#endif
6759
Guido van Rossumd57fd912000-03-10 22:53:23 +00006760int PyUnicode_Compare(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00006761 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006762{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006763 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6764 return unicode_compare((PyUnicodeObject *)left,
6765 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006766 PyErr_Format(PyExc_TypeError,
6767 "Can't compare %.100s and %.100s",
6768 left->ob_type->tp_name,
6769 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006770 return -1;
6771}
6772
Martin v. Löwis5b222132007-06-10 09:51:05 +00006773int
6774PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6775{
6776 int i;
6777 Py_UNICODE *id;
6778 assert(PyUnicode_Check(uni));
6779 id = PyUnicode_AS_UNICODE(uni);
6780 /* Compare Unicode string and source character set string */
6781 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00006782 if (id[i] != str[i])
6783 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00006784 /* This check keeps Python strings that end in '\0' from comparing equal
6785 to C strings identical up to that point. */
6786 if (PyUnicode_GET_SIZE(uni) != i)
6787 /* We'll say the Python string is longer. */
6788 return 1;
Martin v. Löwis5b222132007-06-10 09:51:05 +00006789 if (id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006790 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006791 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006792 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006793 return 0;
6794}
6795
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006796
Benjamin Peterson29060642009-01-31 22:14:21 +00006797#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00006798 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006799
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006800PyObject *PyUnicode_RichCompare(PyObject *left,
6801 PyObject *right,
6802 int op)
6803{
6804 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006805
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006806 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
6807 PyObject *v;
6808 if (((PyUnicodeObject *) left)->length !=
6809 ((PyUnicodeObject *) right)->length) {
6810 if (op == Py_EQ) {
6811 Py_INCREF(Py_False);
6812 return Py_False;
6813 }
6814 if (op == Py_NE) {
6815 Py_INCREF(Py_True);
6816 return Py_True;
6817 }
6818 }
6819 if (left == right)
6820 result = 0;
6821 else
6822 result = unicode_compare((PyUnicodeObject *)left,
6823 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006824
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006825 /* Convert the return value to a Boolean */
6826 switch (op) {
6827 case Py_EQ:
6828 v = TEST_COND(result == 0);
6829 break;
6830 case Py_NE:
6831 v = TEST_COND(result != 0);
6832 break;
6833 case Py_LE:
6834 v = TEST_COND(result <= 0);
6835 break;
6836 case Py_GE:
6837 v = TEST_COND(result >= 0);
6838 break;
6839 case Py_LT:
6840 v = TEST_COND(result == -1);
6841 break;
6842 case Py_GT:
6843 v = TEST_COND(result == 1);
6844 break;
6845 default:
6846 PyErr_BadArgument();
6847 return NULL;
6848 }
6849 Py_INCREF(v);
6850 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006851 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006852
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006853 Py_INCREF(Py_NotImplemented);
6854 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006855}
6856
Guido van Rossum403d68b2000-03-13 15:55:09 +00006857int PyUnicode_Contains(PyObject *container,
Benjamin Peterson29060642009-01-31 22:14:21 +00006858 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00006859{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006860 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006861 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006862
6863 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006864 sub = PyUnicode_FromObject(element);
6865 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006866 PyErr_Format(PyExc_TypeError,
6867 "'in <string>' requires string as left operand, not %s",
6868 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006869 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006870 }
6871
Thomas Wouters477c8d52006-05-27 19:21:47 +00006872 str = PyUnicode_FromObject(container);
6873 if (!str) {
6874 Py_DECREF(sub);
6875 return -1;
6876 }
6877
6878 result = stringlib_contains_obj(str, sub);
6879
6880 Py_DECREF(str);
6881 Py_DECREF(sub);
6882
Guido van Rossum403d68b2000-03-13 15:55:09 +00006883 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006884}
6885
Guido van Rossumd57fd912000-03-10 22:53:23 +00006886/* Concat to string or Unicode object giving a new Unicode object. */
6887
6888PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00006889 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006890{
6891 PyUnicodeObject *u = NULL, *v = NULL, *w;
6892
6893 /* Coerce the two arguments */
6894 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6895 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006896 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006897 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6898 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006899 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006900
6901 /* Shortcuts */
6902 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006903 Py_DECREF(v);
6904 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006905 }
6906 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006907 Py_DECREF(u);
6908 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006909 }
6910
6911 /* Concat the two Unicode strings */
6912 w = _PyUnicode_New(u->length + v->length);
6913 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006914 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006915 Py_UNICODE_COPY(w->str, u->str, u->length);
6916 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6917
6918 Py_DECREF(u);
6919 Py_DECREF(v);
6920 return (PyObject *)w;
6921
Benjamin Peterson29060642009-01-31 22:14:21 +00006922 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006923 Py_XDECREF(u);
6924 Py_XDECREF(v);
6925 return NULL;
6926}
6927
Walter Dörwald1ab83302007-05-18 17:15:44 +00006928void
6929PyUnicode_Append(PyObject **pleft, PyObject *right)
6930{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006931 PyObject *new;
6932 if (*pleft == NULL)
6933 return;
6934 if (right == NULL || !PyUnicode_Check(*pleft)) {
6935 Py_DECREF(*pleft);
6936 *pleft = NULL;
6937 return;
6938 }
6939 new = PyUnicode_Concat(*pleft, right);
6940 Py_DECREF(*pleft);
6941 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00006942}
6943
6944void
6945PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6946{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006947 PyUnicode_Append(pleft, right);
6948 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00006949}
6950
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006951PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006952 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006953\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006954Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006955string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006956interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006957
6958static PyObject *
6959unicode_count(PyUnicodeObject *self, PyObject *args)
6960{
6961 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006962 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006963 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006964 PyObject *result;
6965
Guido van Rossumb8872e62000-05-09 14:14:27 +00006966 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson29060642009-01-31 22:14:21 +00006967 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006968 return NULL;
6969
6970 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006971 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006972 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006973 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006974
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006975 ADJUST_INDICES(start, end, self->length);
Christian Heimes217cfd12007-12-02 14:31:20 +00006976 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006977 stringlib_count(self->str + start, end - start,
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006978 substring->str, substring->length,
6979 PY_SSIZE_T_MAX)
Thomas Wouters477c8d52006-05-27 19:21:47 +00006980 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006981
6982 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006983
Guido van Rossumd57fd912000-03-10 22:53:23 +00006984 return result;
6985}
6986
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006987PyDoc_STRVAR(encode__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006988 "S.encode([encoding[, errors]]) -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006989\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00006990Encode S using the codec registered for encoding. encoding defaults\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006991to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006992handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006993a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6994'xmlcharrefreplace' as well as any other name registered with\n\
6995codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006996
6997static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00006998unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006999{
Benjamin Peterson308d6372009-09-18 21:42:35 +00007000 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00007001 char *encoding = NULL;
7002 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007003 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00007004
Benjamin Peterson308d6372009-09-18 21:42:35 +00007005 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
7006 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007007 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00007008 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007009 if (v == NULL)
7010 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00007011 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007012 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007013 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007014 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00007015 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007016 Py_DECREF(v);
7017 return NULL;
7018 }
7019 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007020
Benjamin Peterson29060642009-01-31 22:14:21 +00007021 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007022 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007023}
7024
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007025PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007026 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007027\n\
7028Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007029If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007030
7031static PyObject*
7032unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7033{
7034 Py_UNICODE *e;
7035 Py_UNICODE *p;
7036 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007037 Py_UNICODE *qe;
7038 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007039 PyUnicodeObject *u;
7040 int tabsize = 8;
7041
7042 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007043 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007044
Thomas Wouters7e474022000-07-16 12:04:32 +00007045 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007046 i = 0; /* chars up to and including most recent \n or \r */
7047 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7048 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007049 for (p = self->str; p < e; p++)
7050 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007051 if (tabsize > 0) {
7052 incr = tabsize - (j % tabsize); /* cannot overflow */
7053 if (j > PY_SSIZE_T_MAX - incr)
7054 goto overflow1;
7055 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007056 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007057 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007058 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007059 if (j > PY_SSIZE_T_MAX - 1)
7060 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007061 j++;
7062 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007063 if (i > PY_SSIZE_T_MAX - j)
7064 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007065 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007066 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007067 }
7068 }
7069
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007070 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007071 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007072
Guido van Rossumd57fd912000-03-10 22:53:23 +00007073 /* Second pass: create output string and fill it */
7074 u = _PyUnicode_New(i + j);
7075 if (!u)
7076 return NULL;
7077
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007078 j = 0; /* same as in first pass */
7079 q = u->str; /* next output char */
7080 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007081
7082 for (p = self->str; p < e; p++)
7083 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007084 if (tabsize > 0) {
7085 i = tabsize - (j % tabsize);
7086 j += i;
7087 while (i--) {
7088 if (q >= qe)
7089 goto overflow2;
7090 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007091 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007092 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007093 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007094 else {
7095 if (q >= qe)
7096 goto overflow2;
7097 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007098 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007099 if (*p == '\n' || *p == '\r')
7100 j = 0;
7101 }
7102
7103 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007104
7105 overflow2:
7106 Py_DECREF(u);
7107 overflow1:
7108 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7109 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007110}
7111
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007112PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007113 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007114\n\
7115Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007116such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007117arguments start and end are interpreted as in slice notation.\n\
7118\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007119Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007120
7121static PyObject *
7122unicode_find(PyUnicodeObject *self, PyObject *args)
7123{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007124 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007125 Py_ssize_t start;
7126 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007127 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007128
Christian Heimes9cd17752007-11-18 19:35:23 +00007129 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007130 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007131
Thomas Wouters477c8d52006-05-27 19:21:47 +00007132 result = stringlib_find_slice(
7133 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7134 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7135 start, end
7136 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007137
7138 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007139
Christian Heimes217cfd12007-12-02 14:31:20 +00007140 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007141}
7142
7143static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007144unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007145{
7146 if (index < 0 || index >= self->length) {
7147 PyErr_SetString(PyExc_IndexError, "string index out of range");
7148 return NULL;
7149 }
7150
7151 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7152}
7153
Guido van Rossumc2504932007-09-18 19:42:40 +00007154/* Believe it or not, this produces the same value for ASCII strings
7155 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007156static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007157unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007158{
Guido van Rossumc2504932007-09-18 19:42:40 +00007159 Py_ssize_t len;
7160 Py_UNICODE *p;
7161 long x;
7162
7163 if (self->hash != -1)
7164 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007165 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007166 p = self->str;
7167 x = *p << 7;
7168 while (--len >= 0)
7169 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007170 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007171 if (x == -1)
7172 x = -2;
7173 self->hash = x;
7174 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007175}
7176
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007177PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007178 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007179\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007180Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007181
7182static PyObject *
7183unicode_index(PyUnicodeObject *self, PyObject *args)
7184{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007185 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007186 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007187 Py_ssize_t start;
7188 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007189
Christian Heimes9cd17752007-11-18 19:35:23 +00007190 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007191 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007192
Thomas Wouters477c8d52006-05-27 19:21:47 +00007193 result = stringlib_find_slice(
7194 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7195 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7196 start, end
7197 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007198
7199 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007200
Guido van Rossumd57fd912000-03-10 22:53:23 +00007201 if (result < 0) {
7202 PyErr_SetString(PyExc_ValueError, "substring not found");
7203 return NULL;
7204 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007205
Christian Heimes217cfd12007-12-02 14:31:20 +00007206 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007207}
7208
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007209PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007210 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007211\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007212Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007213at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007214
7215static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007216unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007217{
7218 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7219 register const Py_UNICODE *e;
7220 int cased;
7221
Guido van Rossumd57fd912000-03-10 22:53:23 +00007222 /* Shortcut for single character strings */
7223 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007224 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007225
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007226 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007227 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007228 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007229
Guido van Rossumd57fd912000-03-10 22:53:23 +00007230 e = p + PyUnicode_GET_SIZE(self);
7231 cased = 0;
7232 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007233 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007234
Benjamin Peterson29060642009-01-31 22:14:21 +00007235 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7236 return PyBool_FromLong(0);
7237 else if (!cased && Py_UNICODE_ISLOWER(ch))
7238 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007239 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007240 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007241}
7242
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007243PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007244 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007245\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007246Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007247at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007248
7249static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007250unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007251{
7252 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7253 register const Py_UNICODE *e;
7254 int cased;
7255
Guido van Rossumd57fd912000-03-10 22:53:23 +00007256 /* Shortcut for single character strings */
7257 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007258 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007259
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007260 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007261 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007262 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007263
Guido van Rossumd57fd912000-03-10 22:53:23 +00007264 e = p + PyUnicode_GET_SIZE(self);
7265 cased = 0;
7266 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007267 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007268
Benjamin Peterson29060642009-01-31 22:14:21 +00007269 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7270 return PyBool_FromLong(0);
7271 else if (!cased && Py_UNICODE_ISUPPER(ch))
7272 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007273 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007274 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007275}
7276
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007277PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007278 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007279\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007280Return True if S is a titlecased string and there is at least one\n\
7281character in S, i.e. upper- and titlecase characters may only\n\
7282follow uncased characters and lowercase characters only cased ones.\n\
7283Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007284
7285static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007286unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007287{
7288 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7289 register const Py_UNICODE *e;
7290 int cased, previous_is_cased;
7291
Guido van Rossumd57fd912000-03-10 22:53:23 +00007292 /* Shortcut for single character strings */
7293 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007294 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7295 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007296
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007297 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007298 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007299 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007300
Guido van Rossumd57fd912000-03-10 22:53:23 +00007301 e = p + PyUnicode_GET_SIZE(self);
7302 cased = 0;
7303 previous_is_cased = 0;
7304 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007305 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007306
Benjamin Peterson29060642009-01-31 22:14:21 +00007307 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7308 if (previous_is_cased)
7309 return PyBool_FromLong(0);
7310 previous_is_cased = 1;
7311 cased = 1;
7312 }
7313 else if (Py_UNICODE_ISLOWER(ch)) {
7314 if (!previous_is_cased)
7315 return PyBool_FromLong(0);
7316 previous_is_cased = 1;
7317 cased = 1;
7318 }
7319 else
7320 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007321 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007322 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007323}
7324
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007325PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007326 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007327\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007328Return True if all characters in S are whitespace\n\
7329and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007330
7331static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007332unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007333{
7334 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7335 register const Py_UNICODE *e;
7336
Guido van Rossumd57fd912000-03-10 22:53:23 +00007337 /* Shortcut for single character strings */
7338 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007339 Py_UNICODE_ISSPACE(*p))
7340 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007341
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007342 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007343 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007344 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007345
Guido van Rossumd57fd912000-03-10 22:53:23 +00007346 e = p + PyUnicode_GET_SIZE(self);
7347 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007348 if (!Py_UNICODE_ISSPACE(*p))
7349 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007350 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007351 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007352}
7353
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007354PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007355 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007356\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007357Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007358and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007359
7360static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007361unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007362{
7363 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7364 register const Py_UNICODE *e;
7365
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007366 /* Shortcut for single character strings */
7367 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007368 Py_UNICODE_ISALPHA(*p))
7369 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007370
7371 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007372 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007373 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007374
7375 e = p + PyUnicode_GET_SIZE(self);
7376 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007377 if (!Py_UNICODE_ISALPHA(*p))
7378 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007379 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007380 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007381}
7382
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007383PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007384 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007385\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007386Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007387and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007388
7389static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007390unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007391{
7392 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7393 register const Py_UNICODE *e;
7394
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007395 /* Shortcut for single character strings */
7396 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007397 Py_UNICODE_ISALNUM(*p))
7398 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007399
7400 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007401 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007402 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007403
7404 e = p + PyUnicode_GET_SIZE(self);
7405 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007406 if (!Py_UNICODE_ISALNUM(*p))
7407 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007408 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007409 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007410}
7411
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007412PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007413 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007414\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007415Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007416False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007417
7418static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007419unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007420{
7421 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7422 register const Py_UNICODE *e;
7423
Guido van Rossumd57fd912000-03-10 22:53:23 +00007424 /* Shortcut for single character strings */
7425 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007426 Py_UNICODE_ISDECIMAL(*p))
7427 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007428
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007429 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007430 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007431 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007432
Guido van Rossumd57fd912000-03-10 22:53:23 +00007433 e = p + PyUnicode_GET_SIZE(self);
7434 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007435 if (!Py_UNICODE_ISDECIMAL(*p))
7436 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007437 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007438 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007439}
7440
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007441PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007442 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007443\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007444Return True if all characters in S are digits\n\
7445and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007446
7447static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007448unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007449{
7450 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7451 register const Py_UNICODE *e;
7452
Guido van Rossumd57fd912000-03-10 22:53:23 +00007453 /* Shortcut for single character strings */
7454 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007455 Py_UNICODE_ISDIGIT(*p))
7456 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007457
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007458 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007459 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007460 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007461
Guido van Rossumd57fd912000-03-10 22:53:23 +00007462 e = p + PyUnicode_GET_SIZE(self);
7463 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007464 if (!Py_UNICODE_ISDIGIT(*p))
7465 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007466 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007467 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007468}
7469
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007470PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007471 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007472\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007473Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007474False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007475
7476static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007477unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007478{
7479 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7480 register const Py_UNICODE *e;
7481
Guido van Rossumd57fd912000-03-10 22:53:23 +00007482 /* Shortcut for single character strings */
7483 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007484 Py_UNICODE_ISNUMERIC(*p))
7485 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007486
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007487 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007488 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007489 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007490
Guido van Rossumd57fd912000-03-10 22:53:23 +00007491 e = p + PyUnicode_GET_SIZE(self);
7492 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007493 if (!Py_UNICODE_ISNUMERIC(*p))
7494 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007495 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007496 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007497}
7498
Martin v. Löwis47383402007-08-15 07:32:56 +00007499int
7500PyUnicode_IsIdentifier(PyObject *self)
7501{
7502 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7503 register const Py_UNICODE *e;
7504
7505 /* Special case for empty strings */
7506 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007507 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007508
7509 /* PEP 3131 says that the first character must be in
7510 XID_Start and subsequent characters in XID_Continue,
7511 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00007512 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00007513 letters, digits, underscore). However, given the current
7514 definition of XID_Start and XID_Continue, it is sufficient
7515 to check just for these, except that _ must be allowed
7516 as starting an identifier. */
7517 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7518 return 0;
7519
7520 e = p + PyUnicode_GET_SIZE(self);
7521 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007522 if (!_PyUnicode_IsXidContinue(*p))
7523 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007524 }
7525 return 1;
7526}
7527
7528PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007529 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00007530\n\
7531Return True if S is a valid identifier according\n\
7532to the language definition.");
7533
7534static PyObject*
7535unicode_isidentifier(PyObject *self)
7536{
7537 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7538}
7539
Georg Brandl559e5d72008-06-11 18:37:52 +00007540PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007541 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00007542\n\
7543Return True if all characters in S are considered\n\
7544printable in repr() or S is empty, False otherwise.");
7545
7546static PyObject*
7547unicode_isprintable(PyObject *self)
7548{
7549 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7550 register const Py_UNICODE *e;
7551
7552 /* Shortcut for single character strings */
7553 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7554 Py_RETURN_TRUE;
7555 }
7556
7557 e = p + PyUnicode_GET_SIZE(self);
7558 for (; p < e; p++) {
7559 if (!Py_UNICODE_ISPRINTABLE(*p)) {
7560 Py_RETURN_FALSE;
7561 }
7562 }
7563 Py_RETURN_TRUE;
7564}
7565
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007566PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +00007567 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007568\n\
7569Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +00007570iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007571
7572static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007573unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007574{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007575 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007576}
7577
Martin v. Löwis18e16552006-02-15 17:27:45 +00007578static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007579unicode_length(PyUnicodeObject *self)
7580{
7581 return self->length;
7582}
7583
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007584PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007585 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007586\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007587Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007588done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007589
7590static PyObject *
7591unicode_ljust(PyUnicodeObject *self, PyObject *args)
7592{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007593 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007594 Py_UNICODE fillchar = ' ';
7595
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007596 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007597 return NULL;
7598
Tim Peters7a29bd52001-09-12 03:03:31 +00007599 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007600 Py_INCREF(self);
7601 return (PyObject*) self;
7602 }
7603
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007604 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007605}
7606
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007607PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007608 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007609\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007610Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007611
7612static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007613unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007614{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007615 return fixup(self, fixlower);
7616}
7617
/* Strip modes; they double as indices into stripformat[] below */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by strip mode */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for a strip mode: the format string with its
   three-character "|O:" prefix skipped */
#define STRIPNAME(i) (stripformat[(i)] + 3)
7626
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007627/* externally visible for str.strip(unicode) */
7628PyObject *
7629_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7630{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007631 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7632 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7633 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7634 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7635 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007636
Benjamin Peterson29060642009-01-31 22:14:21 +00007637 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007638
Benjamin Peterson14339b62009-01-31 16:36:08 +00007639 i = 0;
7640 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007641 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7642 i++;
7643 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007644 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007645
Benjamin Peterson14339b62009-01-31 16:36:08 +00007646 j = len;
7647 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007648 do {
7649 j--;
7650 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7651 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007652 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007653
Benjamin Peterson14339b62009-01-31 16:36:08 +00007654 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007655 Py_INCREF(self);
7656 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007657 }
7658 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007659 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007660}
7661
Guido van Rossumd57fd912000-03-10 22:53:23 +00007662
7663static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007664do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007665{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007666 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7667 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007668
Benjamin Peterson14339b62009-01-31 16:36:08 +00007669 i = 0;
7670 if (striptype != RIGHTSTRIP) {
7671 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7672 i++;
7673 }
7674 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007675
Benjamin Peterson14339b62009-01-31 16:36:08 +00007676 j = len;
7677 if (striptype != LEFTSTRIP) {
7678 do {
7679 j--;
7680 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7681 j++;
7682 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007683
Benjamin Peterson14339b62009-01-31 16:36:08 +00007684 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7685 Py_INCREF(self);
7686 return (PyObject*)self;
7687 }
7688 else
7689 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007690}
7691
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007692
7693static PyObject *
7694do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7695{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007696 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007697
Benjamin Peterson14339b62009-01-31 16:36:08 +00007698 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7699 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007700
Benjamin Peterson14339b62009-01-31 16:36:08 +00007701 if (sep != NULL && sep != Py_None) {
7702 if (PyUnicode_Check(sep))
7703 return _PyUnicode_XStrip(self, striptype, sep);
7704 else {
7705 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007706 "%s arg must be None or str",
7707 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00007708 return NULL;
7709 }
7710 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007711
Benjamin Peterson14339b62009-01-31 16:36:08 +00007712 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007713}
7714
7715
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007716PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007717 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007718\n\
7719Return a copy of the string S with leading and trailing\n\
7720whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007721If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007722
7723static PyObject *
7724unicode_strip(PyUnicodeObject *self, PyObject *args)
7725{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007726 if (PyTuple_GET_SIZE(args) == 0)
7727 return do_strip(self, BOTHSTRIP); /* Common case */
7728 else
7729 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007730}
7731
7732
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007733PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007734 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007735\n\
7736Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007737If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007738
7739static PyObject *
7740unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7741{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007742 if (PyTuple_GET_SIZE(args) == 0)
7743 return do_strip(self, LEFTSTRIP); /* Common case */
7744 else
7745 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007746}
7747
7748
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007749PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007750 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007751\n\
7752Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007753If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007754
7755static PyObject *
7756unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7757{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007758 if (PyTuple_GET_SIZE(args) == 0)
7759 return do_strip(self, RIGHTSTRIP); /* Common case */
7760 else
7761 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007762}
7763
7764
Guido van Rossumd57fd912000-03-10 22:53:23 +00007765static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007766unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007767{
7768 PyUnicodeObject *u;
7769 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007770 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007771 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007772
Georg Brandl222de0f2009-04-12 12:01:50 +00007773 if (len < 1) {
7774 Py_INCREF(unicode_empty);
7775 return (PyObject *)unicode_empty;
7776 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007777
Tim Peters7a29bd52001-09-12 03:03:31 +00007778 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007779 /* no repeat, return original string */
7780 Py_INCREF(str);
7781 return (PyObject*) str;
7782 }
Tim Peters8f422462000-09-09 06:13:41 +00007783
7784 /* ensure # of chars needed doesn't overflow int and # of bytes
7785 * needed doesn't overflow size_t
7786 */
7787 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00007788 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00007789 PyErr_SetString(PyExc_OverflowError,
7790 "repeated string is too long");
7791 return NULL;
7792 }
7793 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7794 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7795 PyErr_SetString(PyExc_OverflowError,
7796 "repeated string is too long");
7797 return NULL;
7798 }
7799 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007800 if (!u)
7801 return NULL;
7802
7803 p = u->str;
7804
Georg Brandl222de0f2009-04-12 12:01:50 +00007805 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007806 Py_UNICODE_FILL(p, str->str[0], len);
7807 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00007808 Py_ssize_t done = str->length; /* number of characters copied this far */
7809 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00007810 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00007811 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007812 Py_UNICODE_COPY(p+done, p, n);
7813 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00007814 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007815 }
7816
7817 return (PyObject*) u;
7818}
7819
7820PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007821 PyObject *subobj,
7822 PyObject *replobj,
7823 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007824{
7825 PyObject *self;
7826 PyObject *str1;
7827 PyObject *str2;
7828 PyObject *result;
7829
7830 self = PyUnicode_FromObject(obj);
7831 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007832 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007833 str1 = PyUnicode_FromObject(subobj);
7834 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007835 Py_DECREF(self);
7836 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007837 }
7838 str2 = PyUnicode_FromObject(replobj);
7839 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007840 Py_DECREF(self);
7841 Py_DECREF(str1);
7842 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007843 }
Tim Petersced69f82003-09-16 20:30:58 +00007844 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00007845 (PyUnicodeObject *)str1,
7846 (PyUnicodeObject *)str2,
7847 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007848 Py_DECREF(self);
7849 Py_DECREF(str1);
7850 Py_DECREF(str2);
7851 return result;
7852}
7853
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007854PyDoc_STRVAR(replace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007855 "S.replace (old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007856\n\
7857Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00007858old replaced by new. If the optional argument count is\n\
7859given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007860
7861static PyObject*
7862unicode_replace(PyUnicodeObject *self, PyObject *args)
7863{
7864 PyUnicodeObject *str1;
7865 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007866 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007867 PyObject *result;
7868
Martin v. Löwis18e16552006-02-15 17:27:45 +00007869 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007870 return NULL;
7871 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7872 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007873 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007874 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007875 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007876 Py_DECREF(str1);
7877 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007878 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007879
7880 result = replace(self, str1, str2, maxcount);
7881
7882 Py_DECREF(str1);
7883 Py_DECREF(str2);
7884 return result;
7885}
7886
7887static
7888PyObject *unicode_repr(PyObject *unicode)
7889{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007890 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007891 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007892 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7893 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7894
7895 /* XXX(nnorwitz): rather than over-allocating, it would be
7896 better to choose a different scheme. Perhaps scan the
7897 first N-chars of the string and allocate based on that size.
7898 */
7899 /* Initial allocation is based on the longest-possible unichr
7900 escape.
7901
7902 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7903 unichr, so in this case it's the longest unichr escape. In
7904 narrow (UTF-16) builds this is five chars per source unichr
7905 since there are two unichrs in the surrogate pair, so in narrow
7906 (UTF-16) builds it's not the longest unichr escape.
7907
7908 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7909 so in the narrow (UTF-16) build case it's the longest unichr
7910 escape.
7911 */
7912
Walter Dörwald1ab83302007-05-18 17:15:44 +00007913 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00007914 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00007915#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00007916 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00007917#else
Benjamin Peterson29060642009-01-31 22:14:21 +00007918 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00007919#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00007920 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007921 if (repr == NULL)
7922 return NULL;
7923
Walter Dörwald1ab83302007-05-18 17:15:44 +00007924 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007925
7926 /* Add quote */
7927 *p++ = (findchar(s, size, '\'') &&
7928 !findchar(s, size, '"')) ? '"' : '\'';
7929 while (size-- > 0) {
7930 Py_UNICODE ch = *s++;
7931
7932 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007933 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007934 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007935 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007936 continue;
7937 }
7938
Benjamin Peterson29060642009-01-31 22:14:21 +00007939 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00007940 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007941 *p++ = '\\';
7942 *p++ = 't';
7943 }
7944 else if (ch == '\n') {
7945 *p++ = '\\';
7946 *p++ = 'n';
7947 }
7948 else if (ch == '\r') {
7949 *p++ = '\\';
7950 *p++ = 'r';
7951 }
7952
7953 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00007954 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007955 *p++ = '\\';
7956 *p++ = 'x';
7957 *p++ = hexdigits[(ch >> 4) & 0x000F];
7958 *p++ = hexdigits[ch & 0x000F];
7959 }
7960
Georg Brandl559e5d72008-06-11 18:37:52 +00007961 /* Copy ASCII characters as-is */
7962 else if (ch < 0x7F) {
7963 *p++ = ch;
7964 }
7965
Benjamin Peterson29060642009-01-31 22:14:21 +00007966 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00007967 else {
7968 Py_UCS4 ucs = ch;
7969
7970#ifndef Py_UNICODE_WIDE
7971 Py_UNICODE ch2 = 0;
7972 /* Get code point from surrogate pair */
7973 if (size > 0) {
7974 ch2 = *s;
7975 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00007976 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007977 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00007978 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007979 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00007980 size--;
7981 }
7982 }
7983#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00007984 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00007985 (categories Z* and C* except ASCII space)
7986 */
7987 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
7988 /* Map 8-bit characters to '\xhh' */
7989 if (ucs <= 0xff) {
7990 *p++ = '\\';
7991 *p++ = 'x';
7992 *p++ = hexdigits[(ch >> 4) & 0x000F];
7993 *p++ = hexdigits[ch & 0x000F];
7994 }
7995 /* Map 21-bit characters to '\U00xxxxxx' */
7996 else if (ucs >= 0x10000) {
7997 *p++ = '\\';
7998 *p++ = 'U';
7999 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8000 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8001 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8002 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8003 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8004 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8005 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8006 *p++ = hexdigits[ucs & 0x0000000F];
8007 }
8008 /* Map 16-bit characters to '\uxxxx' */
8009 else {
8010 *p++ = '\\';
8011 *p++ = 'u';
8012 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8013 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8014 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8015 *p++ = hexdigits[ucs & 0x000F];
8016 }
8017 }
8018 /* Copy characters as-is */
8019 else {
8020 *p++ = ch;
8021#ifndef Py_UNICODE_WIDE
8022 if (ucs >= 0x10000)
8023 *p++ = ch2;
8024#endif
8025 }
8026 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008027 }
8028 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008029 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008030
8031 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008032 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008033 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008034}
8035
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008036PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008037 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008038\n\
8039Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00008040such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008041arguments start and end are interpreted as in slice notation.\n\
8042\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008043Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008044
8045static PyObject *
8046unicode_rfind(PyUnicodeObject *self, PyObject *args)
8047{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008048 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008049 Py_ssize_t start;
8050 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008051 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008052
Christian Heimes9cd17752007-11-18 19:35:23 +00008053 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008054 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008055
Thomas Wouters477c8d52006-05-27 19:21:47 +00008056 result = stringlib_rfind_slice(
8057 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8058 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8059 start, end
8060 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008061
8062 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008063
Christian Heimes217cfd12007-12-02 14:31:20 +00008064 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008065}
8066
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008067PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008068 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008069\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008070Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008071
8072static PyObject *
8073unicode_rindex(PyUnicodeObject *self, PyObject *args)
8074{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008075 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008076 Py_ssize_t start;
8077 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008078 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008079
Christian Heimes9cd17752007-11-18 19:35:23 +00008080 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008081 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008082
Thomas Wouters477c8d52006-05-27 19:21:47 +00008083 result = stringlib_rfind_slice(
8084 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8085 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8086 start, end
8087 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008088
8089 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008090
Guido van Rossumd57fd912000-03-10 22:53:23 +00008091 if (result < 0) {
8092 PyErr_SetString(PyExc_ValueError, "substring not found");
8093 return NULL;
8094 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008095 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008096}
8097
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008098PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008099 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008100\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008101Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008102done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008103
8104static PyObject *
8105unicode_rjust(PyUnicodeObject *self, PyObject *args)
8106{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008107 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008108 Py_UNICODE fillchar = ' ';
8109
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008110 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008111 return NULL;
8112
Tim Peters7a29bd52001-09-12 03:03:31 +00008113 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008114 Py_INCREF(self);
8115 return (PyObject*) self;
8116 }
8117
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008118 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008119}
8120
Guido van Rossumd57fd912000-03-10 22:53:23 +00008121PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008122 PyObject *sep,
8123 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008124{
8125 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008126
Guido van Rossumd57fd912000-03-10 22:53:23 +00008127 s = PyUnicode_FromObject(s);
8128 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008129 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008130 if (sep != NULL) {
8131 sep = PyUnicode_FromObject(sep);
8132 if (sep == NULL) {
8133 Py_DECREF(s);
8134 return NULL;
8135 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008136 }
8137
8138 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8139
8140 Py_DECREF(s);
8141 Py_XDECREF(sep);
8142 return result;
8143}
8144
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008145PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008146 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008147\n\
8148Return a list of the words in S, using sep as the\n\
8149delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008150splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008151whitespace string is a separator and empty strings are\n\
8152removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008153
8154static PyObject*
8155unicode_split(PyUnicodeObject *self, PyObject *args)
8156{
8157 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008158 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008159
Martin v. Löwis18e16552006-02-15 17:27:45 +00008160 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008161 return NULL;
8162
8163 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008164 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008165 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008166 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008167 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008168 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008169}
8170
Thomas Wouters477c8d52006-05-27 19:21:47 +00008171PyObject *
8172PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8173{
8174 PyObject* str_obj;
8175 PyObject* sep_obj;
8176 PyObject* out;
8177
8178 str_obj = PyUnicode_FromObject(str_in);
8179 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008180 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008181 sep_obj = PyUnicode_FromObject(sep_in);
8182 if (!sep_obj) {
8183 Py_DECREF(str_obj);
8184 return NULL;
8185 }
8186
8187 out = stringlib_partition(
8188 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8189 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8190 );
8191
8192 Py_DECREF(sep_obj);
8193 Py_DECREF(str_obj);
8194
8195 return out;
8196}
8197
8198
8199PyObject *
8200PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8201{
8202 PyObject* str_obj;
8203 PyObject* sep_obj;
8204 PyObject* out;
8205
8206 str_obj = PyUnicode_FromObject(str_in);
8207 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008208 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008209 sep_obj = PyUnicode_FromObject(sep_in);
8210 if (!sep_obj) {
8211 Py_DECREF(str_obj);
8212 return NULL;
8213 }
8214
8215 out = stringlib_rpartition(
8216 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8217 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8218 );
8219
8220 Py_DECREF(sep_obj);
8221 Py_DECREF(str_obj);
8222
8223 return out;
8224}
8225
8226PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008227 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008228\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008229Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008230the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008231found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008232
8233static PyObject*
8234unicode_partition(PyUnicodeObject *self, PyObject *separator)
8235{
8236 return PyUnicode_Partition((PyObject *)self, separator);
8237}
8238
8239PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +00008240 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008241\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008242Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008243the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008244separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008245
8246static PyObject*
8247unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8248{
8249 return PyUnicode_RPartition((PyObject *)self, separator);
8250}
8251
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008252PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008253 PyObject *sep,
8254 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008255{
8256 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008257
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008258 s = PyUnicode_FromObject(s);
8259 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008260 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008261 if (sep != NULL) {
8262 sep = PyUnicode_FromObject(sep);
8263 if (sep == NULL) {
8264 Py_DECREF(s);
8265 return NULL;
8266 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008267 }
8268
8269 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8270
8271 Py_DECREF(s);
8272 Py_XDECREF(sep);
8273 return result;
8274}
8275
8276PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008277 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008278\n\
8279Return a list of the words in S, using sep as the\n\
8280delimiter string, starting at the end of the string and\n\
8281working to the front. If maxsplit is given, at most maxsplit\n\
8282splits are done. If sep is not specified, any whitespace string\n\
8283is a separator.");
8284
8285static PyObject*
8286unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8287{
8288 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008289 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008290
Martin v. Löwis18e16552006-02-15 17:27:45 +00008291 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008292 return NULL;
8293
8294 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008295 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008296 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008297 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008298 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008299 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008300}
8301
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008302PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008303 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008304\n\
8305Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008306Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008307is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008308
8309static PyObject*
8310unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8311{
Guido van Rossum86662912000-04-11 15:38:46 +00008312 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008313
Guido van Rossum86662912000-04-11 15:38:46 +00008314 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008315 return NULL;
8316
Guido van Rossum86662912000-04-11 15:38:46 +00008317 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008318}
8319
8320static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008321PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008322{
Walter Dörwald346737f2007-05-31 10:44:43 +00008323 if (PyUnicode_CheckExact(self)) {
8324 Py_INCREF(self);
8325 return self;
8326 } else
8327 /* Subtype -- return genuine unicode string with the same value. */
8328 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8329 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008330}
8331
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008332PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008333 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008334\n\
8335Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008336and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008337
8338static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008339unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008340{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008341 return fixup(self, fixswapcase);
8342}
8343
Georg Brandlceee0772007-11-27 23:48:05 +00008344PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008345 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008346\n\
8347Return a translation table usable for str.translate().\n\
8348If there is only one argument, it must be a dictionary mapping Unicode\n\
8349ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008350Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008351If there are two arguments, they must be strings of equal length, and\n\
8352in the resulting dictionary, each character in x will be mapped to the\n\
8353character at the same position in y. If there is a third argument, it\n\
8354must be a string, whose characters will be mapped to None in the result.");
8355
8356static PyObject*
8357unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8358{
8359 PyObject *x, *y = NULL, *z = NULL;
8360 PyObject *new = NULL, *key, *value;
8361 Py_ssize_t i = 0;
8362 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008363
Georg Brandlceee0772007-11-27 23:48:05 +00008364 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8365 return NULL;
8366 new = PyDict_New();
8367 if (!new)
8368 return NULL;
8369 if (y != NULL) {
8370 /* x must be a string too, of equal length */
8371 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8372 if (!PyUnicode_Check(x)) {
8373 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8374 "be a string if there is a second argument");
8375 goto err;
8376 }
8377 if (PyUnicode_GET_SIZE(x) != ylen) {
8378 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8379 "arguments must have equal length");
8380 goto err;
8381 }
8382 /* create entries for translating chars in x to those in y */
8383 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008384 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8385 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008386 if (!key || !value)
8387 goto err;
8388 res = PyDict_SetItem(new, key, value);
8389 Py_DECREF(key);
8390 Py_DECREF(value);
8391 if (res < 0)
8392 goto err;
8393 }
8394 /* create entries for deleting chars in z */
8395 if (z != NULL) {
8396 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008397 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008398 if (!key)
8399 goto err;
8400 res = PyDict_SetItem(new, key, Py_None);
8401 Py_DECREF(key);
8402 if (res < 0)
8403 goto err;
8404 }
8405 }
8406 } else {
8407 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008408 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008409 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8410 "to maketrans it must be a dict");
8411 goto err;
8412 }
8413 /* copy entries into the new dict, converting string keys to int keys */
8414 while (PyDict_Next(x, &i, &key, &value)) {
8415 if (PyUnicode_Check(key)) {
8416 /* convert string keys to integer keys */
8417 PyObject *newkey;
8418 if (PyUnicode_GET_SIZE(key) != 1) {
8419 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8420 "table must be of length 1");
8421 goto err;
8422 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008423 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008424 if (!newkey)
8425 goto err;
8426 res = PyDict_SetItem(new, newkey, value);
8427 Py_DECREF(newkey);
8428 if (res < 0)
8429 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008430 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008431 /* just keep integer keys */
8432 if (PyDict_SetItem(new, key, value) < 0)
8433 goto err;
8434 } else {
8435 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8436 "be strings or integers");
8437 goto err;
8438 }
8439 }
8440 }
8441 return new;
8442 err:
8443 Py_DECREF(new);
8444 return NULL;
8445}
8446
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008447PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008448 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008449\n\
8450Return a copy of the string S, where all characters have been mapped\n\
8451through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008452Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008453Unmapped characters are left untouched. Characters mapped to None\n\
8454are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008455
8456static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008457unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008458{
Georg Brandlceee0772007-11-27 23:48:05 +00008459 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008460}
8461
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008462PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008463 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008464\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008465Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008466
8467static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008468unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008469{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008470 return fixup(self, fixupper);
8471}
8472
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008473PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008474 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008475\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008476Pad a numeric string S with zeros on the left, to fill a field\n\
8477of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008478
8479static PyObject *
8480unicode_zfill(PyUnicodeObject *self, PyObject *args)
8481{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008482 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008483 PyUnicodeObject *u;
8484
Martin v. Löwis18e16552006-02-15 17:27:45 +00008485 Py_ssize_t width;
8486 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008487 return NULL;
8488
8489 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008490 if (PyUnicode_CheckExact(self)) {
8491 Py_INCREF(self);
8492 return (PyObject*) self;
8493 }
8494 else
8495 return PyUnicode_FromUnicode(
8496 PyUnicode_AS_UNICODE(self),
8497 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00008498 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008499 }
8500
8501 fill = width - self->length;
8502
8503 u = pad(self, fill, 0, '0');
8504
Walter Dörwald068325e2002-04-15 13:36:47 +00008505 if (u == NULL)
8506 return NULL;
8507
Guido van Rossumd57fd912000-03-10 22:53:23 +00008508 if (u->str[fill] == '+' || u->str[fill] == '-') {
8509 /* move sign to beginning of string */
8510 u->str[0] = u->str[fill];
8511 u->str[fill] = '0';
8512 }
8513
8514 return (PyObject*) u;
8515}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008516
#if 0
/* Disabled helper (compiled out by the surrounding #if 0): returns the
   value of `numfree` as a Python int. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    PyObject *count = PyLong_FromLong(numfree);

    return count;
}
#endif
8524
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008525PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008526 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008527\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008528Return True if S starts with the specified prefix, False otherwise.\n\
8529With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008530With optional end, stop comparing S at that position.\n\
8531prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008532
8533static PyObject *
8534unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008535 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008536{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008537 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008538 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008539 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008540 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008541 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008542
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008543 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008544 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8545 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008546 if (PyTuple_Check(subobj)) {
8547 Py_ssize_t i;
8548 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8549 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008550 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008551 if (substring == NULL)
8552 return NULL;
8553 result = tailmatch(self, substring, start, end, -1);
8554 Py_DECREF(substring);
8555 if (result) {
8556 Py_RETURN_TRUE;
8557 }
8558 }
8559 /* nothing matched */
8560 Py_RETURN_FALSE;
8561 }
8562 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008563 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008564 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008565 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008566 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008567 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008568}
8569
8570
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008571PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008572 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008573\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008574Return True if S ends with the specified suffix, False otherwise.\n\
8575With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008576With optional end, stop comparing S at that position.\n\
8577suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008578
8579static PyObject *
8580unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008581 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008582{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008583 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008584 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008585 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008586 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008587 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008588
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008589 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008590 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8591 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008592 if (PyTuple_Check(subobj)) {
8593 Py_ssize_t i;
8594 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8595 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008596 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008597 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008598 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008599 result = tailmatch(self, substring, start, end, +1);
8600 Py_DECREF(substring);
8601 if (result) {
8602 Py_RETURN_TRUE;
8603 }
8604 }
8605 Py_RETURN_FALSE;
8606 }
8607 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008608 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008609 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008610
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008611 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008612 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008613 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008614}
8615
Eric Smith8c663262007-08-25 02:26:07 +00008616#include "stringlib/string_format.h"
8617
8618PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008619 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008620\n\
8621");
8622
Eric Smith4a7d76d2008-05-30 18:10:19 +00008623static PyObject *
8624unicode__format__(PyObject* self, PyObject* args)
8625{
8626 PyObject *format_spec;
8627
8628 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8629 return NULL;
8630
8631 return _PyUnicode_FormatAdvanced(self,
8632 PyUnicode_AS_UNICODE(format_spec),
8633 PyUnicode_GET_SIZE(format_spec));
8634}
8635
Eric Smith8c663262007-08-25 02:26:07 +00008636PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008637 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008638\n\
8639");
8640
8641static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008642unicode__sizeof__(PyUnicodeObject *v)
8643{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00008644 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8645 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008646}
8647
8648PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008649 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008650
8651static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008652unicode_getnewargs(PyUnicodeObject *v)
8653{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008654 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008655}
8656
8657
Guido van Rossumd57fd912000-03-10 22:53:23 +00008658static PyMethodDef unicode_methods[] = {
8659
8660 /* Order is according to common usage: often used methods should
8661 appear first, since lookup is done sequentially. */
8662
Benjamin Peterson308d6372009-09-18 21:42:35 +00008663 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008664 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8665 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008666 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008667 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8668 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8669 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8670 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8671 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8672 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8673 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008674 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008675 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8676 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8677 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008678 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008679 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8680 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8681 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008682 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008683 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008684 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008685 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008686 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8687 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8688 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8689 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8690 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8691 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8692 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8693 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8694 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8695 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8696 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8697 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8698 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8699 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008700 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00008701 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008702 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008703 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00008704 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008705 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8706 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Georg Brandlceee0772007-11-27 23:48:05 +00008707 {"maketrans", (PyCFunction) unicode_maketrans,
8708 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008709 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008710#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008711 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008712#endif
8713
8714#if 0
8715 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008716 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008717#endif
8718
Benjamin Peterson14339b62009-01-31 16:36:08 +00008719 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008720 {NULL, NULL}
8721};
8722
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008723static PyObject *
8724unicode_mod(PyObject *v, PyObject *w)
8725{
Benjamin Peterson29060642009-01-31 22:14:21 +00008726 if (!PyUnicode_Check(v)) {
8727 Py_INCREF(Py_NotImplemented);
8728 return Py_NotImplemented;
8729 }
8730 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008731}
8732
8733static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008734 0, /*nb_add*/
8735 0, /*nb_subtract*/
8736 0, /*nb_multiply*/
8737 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008738};
8739
Guido van Rossumd57fd912000-03-10 22:53:23 +00008740static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008741 (lenfunc) unicode_length, /* sq_length */
8742 PyUnicode_Concat, /* sq_concat */
8743 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8744 (ssizeargfunc) unicode_getitem, /* sq_item */
8745 0, /* sq_slice */
8746 0, /* sq_ass_item */
8747 0, /* sq_ass_slice */
8748 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008749};
8750
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008751static PyObject*
8752unicode_subscript(PyUnicodeObject* self, PyObject* item)
8753{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008754 if (PyIndex_Check(item)) {
8755 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008756 if (i == -1 && PyErr_Occurred())
8757 return NULL;
8758 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008759 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008760 return unicode_getitem(self, i);
8761 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008762 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008763 Py_UNICODE* source_buf;
8764 Py_UNICODE* result_buf;
8765 PyObject* result;
8766
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008767 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00008768 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008769 return NULL;
8770 }
8771
8772 if (slicelength <= 0) {
8773 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008774 } else if (start == 0 && step == 1 && slicelength == self->length &&
8775 PyUnicode_CheckExact(self)) {
8776 Py_INCREF(self);
8777 return (PyObject *)self;
8778 } else if (step == 1) {
8779 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008780 } else {
8781 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00008782 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8783 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008784
Benjamin Peterson29060642009-01-31 22:14:21 +00008785 if (result_buf == NULL)
8786 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008787
8788 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8789 result_buf[i] = source_buf[cur];
8790 }
Tim Petersced69f82003-09-16 20:30:58 +00008791
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008792 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00008793 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008794 return result;
8795 }
8796 } else {
8797 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8798 return NULL;
8799 }
8800}
8801
8802static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008803 (lenfunc)unicode_length, /* mp_length */
8804 (binaryfunc)unicode_subscript, /* mp_subscript */
8805 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008806};
8807
Guido van Rossumd57fd912000-03-10 22:53:23 +00008808
Guido van Rossumd57fd912000-03-10 22:53:23 +00008809/* Helpers for PyUnicode_Format() */
8810
8811static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008812getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008813{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008814 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008815 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008816 (*p_argidx)++;
8817 if (arglen < 0)
8818 return args;
8819 else
8820 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008821 }
8822 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008823 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008824 return NULL;
8825}
8826
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008827/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008828
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008829static PyObject *
8830formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008831{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008832 char *p;
8833 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008834 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008835
Guido van Rossumd57fd912000-03-10 22:53:23 +00008836 x = PyFloat_AsDouble(v);
8837 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008838 return NULL;
8839
Guido van Rossumd57fd912000-03-10 22:53:23 +00008840 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008841 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00008842
Eric Smith0923d1d2009-04-16 20:16:10 +00008843 p = PyOS_double_to_string(x, type, prec,
8844 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008845 if (p == NULL)
8846 return NULL;
8847 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00008848 PyMem_Free(p);
8849 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008850}
8851
Tim Peters38fd5b62000-09-21 05:43:11 +00008852static PyObject*
8853formatlong(PyObject *val, int flags, int prec, int type)
8854{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008855 char *buf;
8856 int len;
8857 PyObject *str; /* temporary string object. */
8858 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008859
Benjamin Peterson14339b62009-01-31 16:36:08 +00008860 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
8861 if (!str)
8862 return NULL;
8863 result = PyUnicode_FromStringAndSize(buf, len);
8864 Py_DECREF(str);
8865 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008866}
8867
Guido van Rossumd57fd912000-03-10 22:53:23 +00008868static int
8869formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008870 size_t buflen,
8871 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008872{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008873 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008874 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008875 if (PyUnicode_GET_SIZE(v) == 1) {
8876 buf[0] = PyUnicode_AS_UNICODE(v)[0];
8877 buf[1] = '\0';
8878 return 1;
8879 }
8880#ifndef Py_UNICODE_WIDE
8881 if (PyUnicode_GET_SIZE(v) == 2) {
8882 /* Decode a valid surrogate pair */
8883 int c0 = PyUnicode_AS_UNICODE(v)[0];
8884 int c1 = PyUnicode_AS_UNICODE(v)[1];
8885 if (0xD800 <= c0 && c0 <= 0xDBFF &&
8886 0xDC00 <= c1 && c1 <= 0xDFFF) {
8887 buf[0] = c0;
8888 buf[1] = c1;
8889 buf[2] = '\0';
8890 return 2;
8891 }
8892 }
8893#endif
8894 goto onError;
8895 }
8896 else {
8897 /* Integer input truncated to a character */
8898 long x;
8899 x = PyLong_AsLong(v);
8900 if (x == -1 && PyErr_Occurred())
8901 goto onError;
8902
8903 if (x < 0 || x > 0x10ffff) {
8904 PyErr_SetString(PyExc_OverflowError,
8905 "%c arg not in range(0x110000)");
8906 return -1;
8907 }
8908
8909#ifndef Py_UNICODE_WIDE
8910 if (x > 0xffff) {
8911 x -= 0x10000;
8912 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
8913 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
8914 return 2;
8915 }
8916#endif
8917 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008918 buf[1] = '\0';
8919 return 1;
8920 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008921
Benjamin Peterson29060642009-01-31 22:14:21 +00008922 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008923 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008924 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008925 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008926}
8927
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008928/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008929 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008930*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008931#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008932
Guido van Rossumd57fd912000-03-10 22:53:23 +00008933PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson29060642009-01-31 22:14:21 +00008934 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008935{
8936 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008937 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008938 int args_owned = 0;
8939 PyUnicodeObject *result = NULL;
8940 PyObject *dict = NULL;
8941 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008942
Guido van Rossumd57fd912000-03-10 22:53:23 +00008943 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008944 PyErr_BadInternalCall();
8945 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008946 }
8947 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008948 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008949 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008950 fmt = PyUnicode_AS_UNICODE(uformat);
8951 fmtcnt = PyUnicode_GET_SIZE(uformat);
8952
8953 reslen = rescnt = fmtcnt + 100;
8954 result = _PyUnicode_New(reslen);
8955 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008956 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008957 res = PyUnicode_AS_UNICODE(result);
8958
8959 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008960 arglen = PyTuple_Size(args);
8961 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008962 }
8963 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008964 arglen = -1;
8965 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008966 }
Christian Heimes90aa7642007-12-19 02:45:37 +00008967 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00008968 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00008969 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008970
8971 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008972 if (*fmt != '%') {
8973 if (--rescnt < 0) {
8974 rescnt = fmtcnt + 100;
8975 reslen += rescnt;
8976 if (_PyUnicode_Resize(&result, reslen) < 0)
8977 goto onError;
8978 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8979 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008980 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008981 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008982 }
8983 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008984 /* Got a format specifier */
8985 int flags = 0;
8986 Py_ssize_t width = -1;
8987 int prec = -1;
8988 Py_UNICODE c = '\0';
8989 Py_UNICODE fill;
8990 int isnumok;
8991 PyObject *v = NULL;
8992 PyObject *temp = NULL;
8993 Py_UNICODE *pbuf;
8994 Py_UNICODE sign;
8995 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008996 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008997
Benjamin Peterson29060642009-01-31 22:14:21 +00008998 fmt++;
8999 if (*fmt == '(') {
9000 Py_UNICODE *keystart;
9001 Py_ssize_t keylen;
9002 PyObject *key;
9003 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009004
Benjamin Peterson29060642009-01-31 22:14:21 +00009005 if (dict == NULL) {
9006 PyErr_SetString(PyExc_TypeError,
9007 "format requires a mapping");
9008 goto onError;
9009 }
9010 ++fmt;
9011 --fmtcnt;
9012 keystart = fmt;
9013 /* Skip over balanced parentheses */
9014 while (pcount > 0 && --fmtcnt >= 0) {
9015 if (*fmt == ')')
9016 --pcount;
9017 else if (*fmt == '(')
9018 ++pcount;
9019 fmt++;
9020 }
9021 keylen = fmt - keystart - 1;
9022 if (fmtcnt < 0 || pcount > 0) {
9023 PyErr_SetString(PyExc_ValueError,
9024 "incomplete format key");
9025 goto onError;
9026 }
9027#if 0
9028 /* keys are converted to strings using UTF-8 and
9029 then looked up since Python uses strings to hold
9030 variables names etc. in its namespaces and we
9031 wouldn't want to break common idioms. */
9032 key = PyUnicode_EncodeUTF8(keystart,
9033 keylen,
9034 NULL);
9035#else
9036 key = PyUnicode_FromUnicode(keystart, keylen);
9037#endif
9038 if (key == NULL)
9039 goto onError;
9040 if (args_owned) {
9041 Py_DECREF(args);
9042 args_owned = 0;
9043 }
9044 args = PyObject_GetItem(dict, key);
9045 Py_DECREF(key);
9046 if (args == NULL) {
9047 goto onError;
9048 }
9049 args_owned = 1;
9050 arglen = -1;
9051 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009052 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009053 while (--fmtcnt >= 0) {
9054 switch (c = *fmt++) {
9055 case '-': flags |= F_LJUST; continue;
9056 case '+': flags |= F_SIGN; continue;
9057 case ' ': flags |= F_BLANK; continue;
9058 case '#': flags |= F_ALT; continue;
9059 case '0': flags |= F_ZERO; continue;
9060 }
9061 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009062 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009063 if (c == '*') {
9064 v = getnextarg(args, arglen, &argidx);
9065 if (v == NULL)
9066 goto onError;
9067 if (!PyLong_Check(v)) {
9068 PyErr_SetString(PyExc_TypeError,
9069 "* wants int");
9070 goto onError;
9071 }
9072 width = PyLong_AsLong(v);
9073 if (width == -1 && PyErr_Occurred())
9074 goto onError;
9075 if (width < 0) {
9076 flags |= F_LJUST;
9077 width = -width;
9078 }
9079 if (--fmtcnt >= 0)
9080 c = *fmt++;
9081 }
9082 else if (c >= '0' && c <= '9') {
9083 width = c - '0';
9084 while (--fmtcnt >= 0) {
9085 c = *fmt++;
9086 if (c < '0' || c > '9')
9087 break;
9088 if ((width*10) / 10 != width) {
9089 PyErr_SetString(PyExc_ValueError,
9090 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009091 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009092 }
9093 width = width*10 + (c - '0');
9094 }
9095 }
9096 if (c == '.') {
9097 prec = 0;
9098 if (--fmtcnt >= 0)
9099 c = *fmt++;
9100 if (c == '*') {
9101 v = getnextarg(args, arglen, &argidx);
9102 if (v == NULL)
9103 goto onError;
9104 if (!PyLong_Check(v)) {
9105 PyErr_SetString(PyExc_TypeError,
9106 "* wants int");
9107 goto onError;
9108 }
9109 prec = PyLong_AsLong(v);
9110 if (prec == -1 && PyErr_Occurred())
9111 goto onError;
9112 if (prec < 0)
9113 prec = 0;
9114 if (--fmtcnt >= 0)
9115 c = *fmt++;
9116 }
9117 else if (c >= '0' && c <= '9') {
9118 prec = c - '0';
9119 while (--fmtcnt >= 0) {
9120 c = Py_CHARMASK(*fmt++);
9121 if (c < '0' || c > '9')
9122 break;
9123 if ((prec*10) / 10 != prec) {
9124 PyErr_SetString(PyExc_ValueError,
9125 "prec too big");
9126 goto onError;
9127 }
9128 prec = prec*10 + (c - '0');
9129 }
9130 }
9131 } /* prec */
9132 if (fmtcnt >= 0) {
9133 if (c == 'h' || c == 'l' || c == 'L') {
9134 if (--fmtcnt >= 0)
9135 c = *fmt++;
9136 }
9137 }
9138 if (fmtcnt < 0) {
9139 PyErr_SetString(PyExc_ValueError,
9140 "incomplete format");
9141 goto onError;
9142 }
9143 if (c != '%') {
9144 v = getnextarg(args, arglen, &argidx);
9145 if (v == NULL)
9146 goto onError;
9147 }
9148 sign = 0;
9149 fill = ' ';
9150 switch (c) {
9151
9152 case '%':
9153 pbuf = formatbuf;
9154 /* presume that buffer length is at least 1 */
9155 pbuf[0] = '%';
9156 len = 1;
9157 break;
9158
9159 case 's':
9160 case 'r':
9161 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +00009162 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009163 temp = v;
9164 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009165 }
9166 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009167 if (c == 's')
9168 temp = PyObject_Str(v);
9169 else if (c == 'r')
9170 temp = PyObject_Repr(v);
9171 else
9172 temp = PyObject_ASCII(v);
9173 if (temp == NULL)
9174 goto onError;
9175 if (PyUnicode_Check(temp))
9176 /* nothing to do */;
9177 else {
9178 Py_DECREF(temp);
9179 PyErr_SetString(PyExc_TypeError,
9180 "%s argument has non-string str()");
9181 goto onError;
9182 }
9183 }
9184 pbuf = PyUnicode_AS_UNICODE(temp);
9185 len = PyUnicode_GET_SIZE(temp);
9186 if (prec >= 0 && len > prec)
9187 len = prec;
9188 break;
9189
9190 case 'i':
9191 case 'd':
9192 case 'u':
9193 case 'o':
9194 case 'x':
9195 case 'X':
9196 if (c == 'i')
9197 c = 'd';
9198 isnumok = 0;
9199 if (PyNumber_Check(v)) {
9200 PyObject *iobj=NULL;
9201
9202 if (PyLong_Check(v)) {
9203 iobj = v;
9204 Py_INCREF(iobj);
9205 }
9206 else {
9207 iobj = PyNumber_Long(v);
9208 }
9209 if (iobj!=NULL) {
9210 if (PyLong_Check(iobj)) {
9211 isnumok = 1;
9212 temp = formatlong(iobj, flags, prec, c);
9213 Py_DECREF(iobj);
9214 if (!temp)
9215 goto onError;
9216 pbuf = PyUnicode_AS_UNICODE(temp);
9217 len = PyUnicode_GET_SIZE(temp);
9218 sign = 1;
9219 }
9220 else {
9221 Py_DECREF(iobj);
9222 }
9223 }
9224 }
9225 if (!isnumok) {
9226 PyErr_Format(PyExc_TypeError,
9227 "%%%c format: a number is required, "
9228 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9229 goto onError;
9230 }
9231 if (flags & F_ZERO)
9232 fill = '0';
9233 break;
9234
9235 case 'e':
9236 case 'E':
9237 case 'f':
9238 case 'F':
9239 case 'g':
9240 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009241 temp = formatfloat(v, flags, prec, c);
9242 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009243 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009244 pbuf = PyUnicode_AS_UNICODE(temp);
9245 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009246 sign = 1;
9247 if (flags & F_ZERO)
9248 fill = '0';
9249 break;
9250
9251 case 'c':
9252 pbuf = formatbuf;
9253 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9254 if (len < 0)
9255 goto onError;
9256 break;
9257
9258 default:
9259 PyErr_Format(PyExc_ValueError,
9260 "unsupported format character '%c' (0x%x) "
9261 "at index %zd",
9262 (31<=c && c<=126) ? (char)c : '?',
9263 (int)c,
9264 (Py_ssize_t)(fmt - 1 -
9265 PyUnicode_AS_UNICODE(uformat)));
9266 goto onError;
9267 }
9268 if (sign) {
9269 if (*pbuf == '-' || *pbuf == '+') {
9270 sign = *pbuf++;
9271 len--;
9272 }
9273 else if (flags & F_SIGN)
9274 sign = '+';
9275 else if (flags & F_BLANK)
9276 sign = ' ';
9277 else
9278 sign = 0;
9279 }
9280 if (width < len)
9281 width = len;
9282 if (rescnt - (sign != 0) < width) {
9283 reslen -= rescnt;
9284 rescnt = width + fmtcnt + 100;
9285 reslen += rescnt;
9286 if (reslen < 0) {
9287 Py_XDECREF(temp);
9288 PyErr_NoMemory();
9289 goto onError;
9290 }
9291 if (_PyUnicode_Resize(&result, reslen) < 0) {
9292 Py_XDECREF(temp);
9293 goto onError;
9294 }
9295 res = PyUnicode_AS_UNICODE(result)
9296 + reslen - rescnt;
9297 }
9298 if (sign) {
9299 if (fill != ' ')
9300 *res++ = sign;
9301 rescnt--;
9302 if (width > len)
9303 width--;
9304 }
9305 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9306 assert(pbuf[0] == '0');
9307 assert(pbuf[1] == c);
9308 if (fill != ' ') {
9309 *res++ = *pbuf++;
9310 *res++ = *pbuf++;
9311 }
9312 rescnt -= 2;
9313 width -= 2;
9314 if (width < 0)
9315 width = 0;
9316 len -= 2;
9317 }
9318 if (width > len && !(flags & F_LJUST)) {
9319 do {
9320 --rescnt;
9321 *res++ = fill;
9322 } while (--width > len);
9323 }
9324 if (fill == ' ') {
9325 if (sign)
9326 *res++ = sign;
9327 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9328 assert(pbuf[0] == '0');
9329 assert(pbuf[1] == c);
9330 *res++ = *pbuf++;
9331 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009332 }
9333 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009334 Py_UNICODE_COPY(res, pbuf, len);
9335 res += len;
9336 rescnt -= len;
9337 while (--width >= len) {
9338 --rescnt;
9339 *res++ = ' ';
9340 }
9341 if (dict && (argidx < arglen) && c != '%') {
9342 PyErr_SetString(PyExc_TypeError,
9343 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009344 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009345 goto onError;
9346 }
9347 Py_XDECREF(temp);
9348 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009349 } /* until end */
9350 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009351 PyErr_SetString(PyExc_TypeError,
9352 "not all arguments converted during string formatting");
9353 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009354 }
9355
Thomas Woutersa96affe2006-03-12 00:29:36 +00009356 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009357 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009358 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009359 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009360 }
9361 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009362 return (PyObject *)result;
9363
Benjamin Peterson29060642009-01-31 22:14:21 +00009364 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009365 Py_XDECREF(result);
9366 Py_DECREF(uformat);
9367 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009368 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009369 }
9370 return NULL;
9371}
9372
Jeremy Hylton938ace62002-07-17 16:30:39 +00009373static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009374unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9375
Tim Peters6d6c1a32001-08-02 04:15:00 +00009376static PyObject *
9377unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9378{
Benjamin Peterson29060642009-01-31 22:14:21 +00009379 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009380 static char *kwlist[] = {"object", "encoding", "errors", 0};
9381 char *encoding = NULL;
9382 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009383
Benjamin Peterson14339b62009-01-31 16:36:08 +00009384 if (type != &PyUnicode_Type)
9385 return unicode_subtype_new(type, args, kwds);
9386 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009387 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009388 return NULL;
9389 if (x == NULL)
9390 return (PyObject *)_PyUnicode_New(0);
9391 if (encoding == NULL && errors == NULL)
9392 return PyObject_Str(x);
9393 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009394 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009395}
9396
Guido van Rossume023fe02001-08-30 03:12:59 +00009397static PyObject *
9398unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9399{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009400 PyUnicodeObject *tmp, *pnew;
9401 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009402
Benjamin Peterson14339b62009-01-31 16:36:08 +00009403 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9404 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9405 if (tmp == NULL)
9406 return NULL;
9407 assert(PyUnicode_Check(tmp));
9408 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9409 if (pnew == NULL) {
9410 Py_DECREF(tmp);
9411 return NULL;
9412 }
9413 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9414 if (pnew->str == NULL) {
9415 _Py_ForgetReference((PyObject *)pnew);
9416 PyObject_Del(pnew);
9417 Py_DECREF(tmp);
9418 return PyErr_NoMemory();
9419 }
9420 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9421 pnew->length = n;
9422 pnew->hash = tmp->hash;
9423 Py_DECREF(tmp);
9424 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009425}
9426
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009427PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +00009428 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009429\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009430Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009431encoding defaults to the current default string encoding.\n\
9432errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009433
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009434static PyObject *unicode_iter(PyObject *seq);
9435
Guido van Rossumd57fd912000-03-10 22:53:23 +00009436PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009437 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +00009438 "str", /* tp_name */
9439 sizeof(PyUnicodeObject), /* tp_size */
9440 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009441 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009442 (destructor)unicode_dealloc, /* tp_dealloc */
9443 0, /* tp_print */
9444 0, /* tp_getattr */
9445 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009446 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009447 unicode_repr, /* tp_repr */
9448 &unicode_as_number, /* tp_as_number */
9449 &unicode_as_sequence, /* tp_as_sequence */
9450 &unicode_as_mapping, /* tp_as_mapping */
9451 (hashfunc) unicode_hash, /* tp_hash*/
9452 0, /* tp_call*/
9453 (reprfunc) unicode_str, /* tp_str */
9454 PyObject_GenericGetAttr, /* tp_getattro */
9455 0, /* tp_setattro */
9456 0, /* tp_as_buffer */
9457 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +00009458 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009459 unicode_doc, /* tp_doc */
9460 0, /* tp_traverse */
9461 0, /* tp_clear */
9462 PyUnicode_RichCompare, /* tp_richcompare */
9463 0, /* tp_weaklistoffset */
9464 unicode_iter, /* tp_iter */
9465 0, /* tp_iternext */
9466 unicode_methods, /* tp_methods */
9467 0, /* tp_members */
9468 0, /* tp_getset */
9469 &PyBaseObject_Type, /* tp_base */
9470 0, /* tp_dict */
9471 0, /* tp_descr_get */
9472 0, /* tp_descr_set */
9473 0, /* tp_dictoffset */
9474 0, /* tp_init */
9475 0, /* tp_alloc */
9476 unicode_new, /* tp_new */
9477 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009478};
9479
9480/* Initialize the Unicode implementation */
9481
Thomas Wouters78890102000-07-22 19:25:51 +00009482void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009483{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009484 int i;
9485
Thomas Wouters477c8d52006-05-27 19:21:47 +00009486 /* XXX - move this array to unicodectype.c ? */
9487 Py_UNICODE linebreak[] = {
9488 0x000A, /* LINE FEED */
9489 0x000D, /* CARRIAGE RETURN */
9490 0x001C, /* FILE SEPARATOR */
9491 0x001D, /* GROUP SEPARATOR */
9492 0x001E, /* RECORD SEPARATOR */
9493 0x0085, /* NEXT LINE */
9494 0x2028, /* LINE SEPARATOR */
9495 0x2029, /* PARAGRAPH SEPARATOR */
9496 };
9497
Fred Drakee4315f52000-05-09 19:53:39 +00009498 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009499 free_list = NULL;
9500 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009501 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009502 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00009503 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009504
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009505 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00009506 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009507 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009508 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009509
9510 /* initialize the linebreak bloom filter */
9511 bloom_linebreak = make_bloom_mask(
9512 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9513 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009514
9515 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009516}
9517
9518/* Finalize the Unicode implementation */
9519
Christian Heimesa156e092008-02-16 07:38:31 +00009520int
9521PyUnicode_ClearFreeList(void)
9522{
9523 int freelist_size = numfree;
9524 PyUnicodeObject *u;
9525
9526 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009527 PyUnicodeObject *v = u;
9528 u = *(PyUnicodeObject **)u;
9529 if (v->str)
9530 PyObject_DEL(v->str);
9531 Py_XDECREF(v->defenc);
9532 PyObject_Del(v);
9533 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +00009534 }
9535 free_list = NULL;
9536 assert(numfree == 0);
9537 return freelist_size;
9538}
9539
Guido van Rossumd57fd912000-03-10 22:53:23 +00009540void
Thomas Wouters78890102000-07-22 19:25:51 +00009541_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009542{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009543 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009544
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009545 Py_XDECREF(unicode_empty);
9546 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009547
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009548 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009549 if (unicode_latin1[i]) {
9550 Py_DECREF(unicode_latin1[i]);
9551 unicode_latin1[i] = NULL;
9552 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009553 }
Christian Heimesa156e092008-02-16 07:38:31 +00009554 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009555}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009556
Walter Dörwald16807132007-05-25 13:52:07 +00009557void
9558PyUnicode_InternInPlace(PyObject **p)
9559{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009560 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9561 PyObject *t;
9562 if (s == NULL || !PyUnicode_Check(s))
9563 Py_FatalError(
9564 "PyUnicode_InternInPlace: unicode strings only please!");
9565 /* If it's a subclass, we don't really know what putting
9566 it in the interned dict might do. */
9567 if (!PyUnicode_CheckExact(s))
9568 return;
9569 if (PyUnicode_CHECK_INTERNED(s))
9570 return;
9571 if (interned == NULL) {
9572 interned = PyDict_New();
9573 if (interned == NULL) {
9574 PyErr_Clear(); /* Don't leave an exception */
9575 return;
9576 }
9577 }
9578 /* It might be that the GetItem call fails even
9579 though the key is present in the dictionary,
9580 namely when this happens during a stack overflow. */
9581 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +00009582 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009583 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +00009584
Benjamin Peterson29060642009-01-31 22:14:21 +00009585 if (t) {
9586 Py_INCREF(t);
9587 Py_DECREF(*p);
9588 *p = t;
9589 return;
9590 }
Walter Dörwald16807132007-05-25 13:52:07 +00009591
Benjamin Peterson14339b62009-01-31 16:36:08 +00009592 PyThreadState_GET()->recursion_critical = 1;
9593 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9594 PyErr_Clear();
9595 PyThreadState_GET()->recursion_critical = 0;
9596 return;
9597 }
9598 PyThreadState_GET()->recursion_critical = 0;
9599 /* The two references in interned are not counted by refcnt.
9600 The deallocator will take care of this */
9601 Py_REFCNT(s) -= 2;
9602 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +00009603}
9604
9605void
9606PyUnicode_InternImmortal(PyObject **p)
9607{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009608 PyUnicode_InternInPlace(p);
9609 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9610 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9611 Py_INCREF(*p);
9612 }
Walter Dörwald16807132007-05-25 13:52:07 +00009613}
9614
9615PyObject *
9616PyUnicode_InternFromString(const char *cp)
9617{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009618 PyObject *s = PyUnicode_FromString(cp);
9619 if (s == NULL)
9620 return NULL;
9621 PyUnicode_InternInPlace(&s);
9622 return s;
Walter Dörwald16807132007-05-25 13:52:07 +00009623}
9624
9625void _Py_ReleaseInternedUnicodeStrings(void)
9626{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009627 PyObject *keys;
9628 PyUnicodeObject *s;
9629 Py_ssize_t i, n;
9630 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009631
Benjamin Peterson14339b62009-01-31 16:36:08 +00009632 if (interned == NULL || !PyDict_Check(interned))
9633 return;
9634 keys = PyDict_Keys(interned);
9635 if (keys == NULL || !PyList_Check(keys)) {
9636 PyErr_Clear();
9637 return;
9638 }
Walter Dörwald16807132007-05-25 13:52:07 +00009639
Benjamin Peterson14339b62009-01-31 16:36:08 +00009640 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9641 detector, interned unicode strings are not forcibly deallocated;
9642 rather, we give them their stolen references back, and then clear
9643 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +00009644
Benjamin Peterson14339b62009-01-31 16:36:08 +00009645 n = PyList_GET_SIZE(keys);
9646 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +00009647 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009648 for (i = 0; i < n; i++) {
9649 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9650 switch (s->state) {
9651 case SSTATE_NOT_INTERNED:
9652 /* XXX Shouldn't happen */
9653 break;
9654 case SSTATE_INTERNED_IMMORTAL:
9655 Py_REFCNT(s) += 1;
9656 immortal_size += s->length;
9657 break;
9658 case SSTATE_INTERNED_MORTAL:
9659 Py_REFCNT(s) += 2;
9660 mortal_size += s->length;
9661 break;
9662 default:
9663 Py_FatalError("Inconsistent interned string state.");
9664 }
9665 s->state = SSTATE_NOT_INTERNED;
9666 }
9667 fprintf(stderr, "total size of all interned strings: "
9668 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9669 "mortal/immortal\n", mortal_size, immortal_size);
9670 Py_DECREF(keys);
9671 PyDict_Clear(interned);
9672 Py_DECREF(interned);
9673 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +00009674}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009675
9676
9677/********************* Unicode Iterator **************************/
9678
9679typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009680 PyObject_HEAD
9681 Py_ssize_t it_index;
9682 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009683} unicodeiterobject;
9684
9685static void
9686unicodeiter_dealloc(unicodeiterobject *it)
9687{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009688 _PyObject_GC_UNTRACK(it);
9689 Py_XDECREF(it->it_seq);
9690 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009691}
9692
9693static int
9694unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9695{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009696 Py_VISIT(it->it_seq);
9697 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009698}
9699
9700static PyObject *
9701unicodeiter_next(unicodeiterobject *it)
9702{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009703 PyUnicodeObject *seq;
9704 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009705
Benjamin Peterson14339b62009-01-31 16:36:08 +00009706 assert(it != NULL);
9707 seq = it->it_seq;
9708 if (seq == NULL)
9709 return NULL;
9710 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009711
Benjamin Peterson14339b62009-01-31 16:36:08 +00009712 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
9713 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +00009714 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009715 if (item != NULL)
9716 ++it->it_index;
9717 return item;
9718 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009719
Benjamin Peterson14339b62009-01-31 16:36:08 +00009720 Py_DECREF(seq);
9721 it->it_seq = NULL;
9722 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009723}
9724
9725static PyObject *
9726unicodeiter_len(unicodeiterobject *it)
9727{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009728 Py_ssize_t len = 0;
9729 if (it->it_seq)
9730 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9731 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009732}
9733
9734PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9735
9736static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009737 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00009738 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +00009739 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009740};
9741
9742PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009743 PyVarObject_HEAD_INIT(&PyType_Type, 0)
9744 "str_iterator", /* tp_name */
9745 sizeof(unicodeiterobject), /* tp_basicsize */
9746 0, /* tp_itemsize */
9747 /* methods */
9748 (destructor)unicodeiter_dealloc, /* tp_dealloc */
9749 0, /* tp_print */
9750 0, /* tp_getattr */
9751 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009752 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009753 0, /* tp_repr */
9754 0, /* tp_as_number */
9755 0, /* tp_as_sequence */
9756 0, /* tp_as_mapping */
9757 0, /* tp_hash */
9758 0, /* tp_call */
9759 0, /* tp_str */
9760 PyObject_GenericGetAttr, /* tp_getattro */
9761 0, /* tp_setattro */
9762 0, /* tp_as_buffer */
9763 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9764 0, /* tp_doc */
9765 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9766 0, /* tp_clear */
9767 0, /* tp_richcompare */
9768 0, /* tp_weaklistoffset */
9769 PyObject_SelfIter, /* tp_iter */
9770 (iternextfunc)unicodeiter_next, /* tp_iternext */
9771 unicodeiter_methods, /* tp_methods */
9772 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009773};
9774
9775static PyObject *
9776unicode_iter(PyObject *seq)
9777{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009778 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009779
Benjamin Peterson14339b62009-01-31 16:36:08 +00009780 if (!PyUnicode_Check(seq)) {
9781 PyErr_BadInternalCall();
9782 return NULL;
9783 }
9784 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9785 if (it == NULL)
9786 return NULL;
9787 it->it_index = 0;
9788 Py_INCREF(seq);
9789 it->it_seq = (PyUnicodeObject *)seq;
9790 _PyObject_GC_TRACK(it);
9791 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009792}
9793
Martin v. Löwis5b222132007-06-10 09:51:05 +00009794size_t
9795Py_UNICODE_strlen(const Py_UNICODE *u)
9796{
9797 int res = 0;
9798 while(*u++)
9799 res++;
9800 return res;
9801}
9802
9803Py_UNICODE*
9804Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9805{
9806 Py_UNICODE *u = s1;
9807 while ((*u++ = *s2++));
9808 return s1;
9809}
9810
9811Py_UNICODE*
9812Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9813{
9814 Py_UNICODE *u = s1;
9815 while ((*u++ = *s2++))
9816 if (n-- == 0)
9817 break;
9818 return s1;
9819}
9820
9821int
9822Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9823{
9824 while (*s1 && *s2 && *s1 == *s2)
9825 s1++, s2++;
9826 if (*s1 && *s2)
9827 return (*s1 < *s2) ? -1 : +1;
9828 if (*s1)
9829 return 1;
9830 if (*s2)
9831 return -1;
9832 return 0;
9833}
9834
9835Py_UNICODE*
9836Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9837{
9838 const Py_UNICODE *p;
9839 for (p = s; *p; p++)
9840 if (*p == c)
9841 return (Py_UNICODE*)p;
9842 return NULL;
9843}
9844
9845
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009846#ifdef __cplusplus
9847}
9848#endif